src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2011 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How the end of a text line is encoded depends on the operating
 123   system.  For instance, Unix's format is just one byte of LF
 124   (line-feed) code, whereas DOS's format is a two-byte sequence of
 125   `carriage-return' and `line-feed' codes.  MacOS's format is usually
 126   one byte of `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   int multibytep = coding->src_multibyte;
 162   int consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171       /* Reject a nonconforming byte; note strong evidence for XXX.  */
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source was exhausted without meeting an invalid byte.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   int multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends with a partial byte sequence of a character,
 231        treat those bytes as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that the source and destination areas
 255   overlap, which means that we can produce encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   int multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   int produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* Remember how many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "buffer.h"
 292 #include "character.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 304 Lisp_Object Qunix, Qdos;
 305 Lisp_Object Qbuffer_file_coding_system;
 306 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 307 Lisp_Object Qdefault_char;
 308 Lisp_Object Qno_conversion, Qundecided;
 309 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 310 Lisp_Object Qbig, Qlittle;
 311 Lisp_Object Qcoding_system_history;
 312 Lisp_Object Qvalid_codes;
 313 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 314 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 315 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 316 Lisp_Object QCascii_compatible_p;
 317
 318 Lisp_Object Qcall_process, Qcall_process_region;
 319 Lisp_Object Qstart_process, Qopen_network_stream;
 320 Lisp_Object Qtarget_idx;
 321
 322 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 323 Lisp_Object Qinterrupted, Qinsufficient_memory;
 324
 325 /* If a symbol has this property, evaluate the value to define the
 326    symbol as a coding system.  */
 327 static Lisp_Object Qcoding_system_define_form;
 328
 329 /* Format of end-of-line decided by system.  This is Qunix on
 330    Unix and Mac, Qdos on DOS/Windows.
 331    This has an effect only for external encoding (i.e. for output to
 332    file and process), not for in-buffer or Lisp string encoding.  */
 333 static Lisp_Object system_eol_type;
 334
 335 #ifdef emacs
 336
 337 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 338
 339 /* Coding system emacs-mule and raw-text are for converting only
 340    end-of-line format.  */
 341 Lisp_Object Qemacs_mule, Qraw_text;
 342 Lisp_Object Qutf_8_emacs;
 343
 344 /* Coding-systems are handed between Emacs Lisp programs and C internal
 345    routines by the following three variables.  */
 346 /* Coding system to be used to encode text for terminal display when
 347    terminal coding system is nil.  */
 348 struct coding_system safe_terminal_coding;
 349
 350 #endif /* emacs */
 351
 352 Lisp_Object Qtranslation_table;
 353 Lisp_Object Qtranslation_table_id;
 354 Lisp_Object Qtranslation_table_for_decode;
 355 Lisp_Object Qtranslation_table_for_encode;
 356
 357 /* Two special coding systems.  */
 358 Lisp_Object Vsjis_coding_system;
 359 Lisp_Object Vbig5_coding_system;
 360
 361 /* ISO2022 section */
 362 /* Integer at index REG of CODING's `coding_attr_iso_initial' attribute.  */
 363 #define CODING_ISO_INITIAL(coding, reg)                 \
 364   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 365                      coding_attr_iso_initial),          \
 366                reg)))
 367 /* CODING's safe_charsets entry for CHARSET_ID, or -1 if CHARSET_ID is
 368    above max_charset_id or the entry is 255.  */
 369 #define CODING_ISO_REQUEST(coding, charset_id)          \
 370   (((charset_id) <= (coding)->max_charset_id            \
 371     ? ((coding)->safe_charsets[charset_id] != 255       \
 372        ? (coding)->safe_charsets[charset_id]            \
 373        : -1)                                            \
 374     : -1))
 375
 376
 377 #define CODING_ISO_FLAGS(coding)        \
 378   ((coding)->spec.iso_2022.flags)
 379 #define CODING_ISO_DESIGNATION(coding, reg)     \
 380   ((coding)->spec.iso_2022.current_designation[reg])
 381 #define CODING_ISO_INVOCATION(coding, plane)    \
 382   ((coding)->spec.iso_2022.current_invocation[plane])
 383 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 384   ((coding)->spec.iso_2022.single_shifting)
 385 #define CODING_ISO_BOL(coding)  \
 386   ((coding)->spec.iso_2022.bol)
 387 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 388   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 389 #define CODING_ISO_CMP_STATUS(coding)   \
 390   (&(coding)->spec.iso_2022.cmp_status)
 391 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 392   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 393 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 394   ((coding)->spec.iso_2022.embedded_utf_8)
 395
 396 /* Control characters of ISO2022.  */
 397                         /* code */      /* function */
 398 #define ISO_CODE_SO     0x0E            /* shift-out */
 399 #define ISO_CODE_SI     0x0F            /* shift-in */
 400 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 401 #define ISO_CODE_ESC    0x1B            /* escape */
 402 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 403 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 404 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 405
 406 /* All code (1-byte) of ISO2022 is classified into one of the
 407    followings.  */
 408 enum iso_code_class_type
 409   {
 410     ISO_control_0,              /* Control codes in the range
 411                                    0x00..0x1F and 0x7F, except for the
 412                                    following 4 codes.  */
 413     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 414     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 415     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 416     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 417     ISO_control_1,              /* Control codes in the range
 418                                    0x80..0x9F, except for the
 419                                    following 3 codes.  */
 420     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 421     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 422     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 423     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 424     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 425     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 426     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 427   };
 428
 429 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 430     `iso-flags' attribute of an iso2022 coding system.  */
 431
 432 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 433    instead of the correct short-form sequence (e.g. ESC $ A).  */
 434 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 435
 436 /* If set, reset graphic planes and registers at end-of-line to the
 437    initial state.  */
 438 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 439
 440 /* If set, reset graphic planes and registers before any control
 441    characters to the initial state.  */
 442 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 443
 444 /* If set, encode by 7-bit environment.  */
 445 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 446
 447 /* If set, use locking-shift function.  */
 448 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 449
 450 /* If set, use single-shift function.  Overwrite
 451    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 452 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 453
 454 /* If set, use designation escape sequence.  */
 455 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 456
 457 /* If set, produce revision number sequence.  */
 458 #define CODING_ISO_FLAG_REVISION        0x0080
 459
 460 /* If set, produce ISO6429's direction specifying sequence.  */
 461 #define CODING_ISO_FLAG_DIRECTION       0x0100
 462
 463 /* If set, assume designation states are reset at beginning of line on
 464    output.  */
 465 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 466
 467 /* If set, designation sequence should be placed at beginning of line
 468    on output.  */
 469 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 470
 471 /* If set, do not encode unsafe characters on output.  */
 472 #define CODING_ISO_FLAG_SAFE            0x0800
 473
 474 /* If set, extra latin codes (128..159) are accepted as a valid code
 475    on input.  */
 476 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 477
 478 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 479
 480 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 481
 482 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 483
 484 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 485
 486 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 487
 488 /* A character to be produced on output if encoding of the original
 489    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 490 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 491
 492 /* UTF-8 section */
 493 #define CODING_UTF_8_BOM(coding)        \
 494   ((coding)->spec.utf_8_bom)
 495
 496 /* UTF-16 section */
 497 #define CODING_UTF_16_BOM(coding)       \
 498   ((coding)->spec.utf_16.bom)
 499
 500 #define CODING_UTF_16_ENDIAN(coding)    \
 501   ((coding)->spec.utf_16.endian)
 502
 503 #define CODING_UTF_16_SURROGATE(coding) \
 504   ((coding)->spec.utf_16.surrogate)
 505
 506
 507 /* CCL section */
 508 #define CODING_CCL_DECODER(coding)      \
 509   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 510 #define CODING_CCL_ENCODER(coding)      \
 511   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 512 #define CODING_CCL_VALIDS(coding)                                          \
 513   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 514
 515 /* Index for each coding category in `coding_categories' */
 516
 517 enum coding_category
 518   {
 519     coding_category_iso_7,
 520     coding_category_iso_7_tight,
 521     coding_category_iso_8_1,
 522     coding_category_iso_8_2,
 523     coding_category_iso_7_else,
 524     coding_category_iso_8_else,
 525     coding_category_utf_8_auto,
 526     coding_category_utf_8_nosig,
 527     coding_category_utf_8_sig,
 528     coding_category_utf_16_auto,
 529     coding_category_utf_16_be,
 530     coding_category_utf_16_le,
 531     coding_category_utf_16_be_nosig,
 532     coding_category_utf_16_le_nosig,
 533     coding_category_charset,
 534     coding_category_sjis,
 535     coding_category_big5,
 536     coding_category_ccl,
 537     coding_category_emacs_mule,
 538     /* All above are targets of code detection.  */
 539     coding_category_raw_text,
 540     coding_category_undecided,
 541     coding_category_max
 542   };
 543
 544 /* Definitions of flag bits used in detect_coding_XXXX.  */
 545 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 546 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 547 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 548 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 549 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 550 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 551 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 552 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 553 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 554 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 555 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 556 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 557 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 558 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 559 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 560 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 561 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 562 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 563 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 564 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 565
 566 /* This value is returned if detect_coding_mask () finds nothing other
 567    than ASCII characters.  */
 568 #define CATEGORY_MASK_ANY               \
 569   (CATEGORY_MASK_ISO_7                  \
 570    | CATEGORY_MASK_ISO_7_TIGHT          \
 571    | CATEGORY_MASK_ISO_8_1              \
 572    | CATEGORY_MASK_ISO_8_2              \
 573    | CATEGORY_MASK_ISO_7_ELSE           \
 574    | CATEGORY_MASK_ISO_8_ELSE           \
 575    | CATEGORY_MASK_UTF_8_AUTO           \
 576    | CATEGORY_MASK_UTF_8_NOSIG          \
 577    | CATEGORY_MASK_UTF_8_SIG            \
 578    | CATEGORY_MASK_UTF_16_AUTO          \
 579    | CATEGORY_MASK_UTF_16_BE            \
 580    | CATEGORY_MASK_UTF_16_LE            \
 581    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 582    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 583    | CATEGORY_MASK_CHARSET              \
 584    | CATEGORY_MASK_SJIS                 \
 585    | CATEGORY_MASK_BIG5                 \
 586    | CATEGORY_MASK_CCL                  \
 587    | CATEGORY_MASK_EMACS_MULE)
 588
 589
 590 #define CATEGORY_MASK_ISO_7BIT \
 591   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 592
 593 #define CATEGORY_MASK_ISO_8BIT \
 594   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 595
 596 #define CATEGORY_MASK_ISO_ELSE \
 597   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 598
 599 #define CATEGORY_MASK_ISO_ESCAPE        \
 600   (CATEGORY_MASK_ISO_7                  \
 601    | CATEGORY_MASK_ISO_7_TIGHT          \
 602    | CATEGORY_MASK_ISO_7_ELSE           \
 603    | CATEGORY_MASK_ISO_8_ELSE)
 604
 605 #define CATEGORY_MASK_ISO       \
 606   (  CATEGORY_MASK_ISO_7BIT     \
 607      | CATEGORY_MASK_ISO_8BIT   \
 608      | CATEGORY_MASK_ISO_ELSE)
 609
 610 #define CATEGORY_MASK_UTF_16            \
 611   (CATEGORY_MASK_UTF_16_AUTO            \
 612    | CATEGORY_MASK_UTF_16_BE            \
 613    | CATEGORY_MASK_UTF_16_LE            \
 614    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 615    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 616
 617 #define CATEGORY_MASK_UTF_8     \
 618   (CATEGORY_MASK_UTF_8_AUTO     \
 619    | CATEGORY_MASK_UTF_8_NOSIG  \
 620    | CATEGORY_MASK_UTF_8_SIG)
 621
 622 /* Table of coding categories (Lisp symbols).  This variable is for
 623    internal use only.  */
 624 static Lisp_Object Vcoding_category_table;
 625
 626 /* Table of coding-categories ordered by priority.  */
 627 static enum coding_category coding_priorities[coding_category_max];
 628
 629 /* Nth element is a coding context for the coding system bound to the
 630    Nth coding category.  */
 631 static struct coding_system coding_categories[coding_category_max];
 632
 633 /*** Commonly used macros and functions ***/
 634
 635 #ifndef min
 636 #define min(a, b) ((a) < (b) ? (a) : (b))
 637 #endif
 638 #ifndef max
 639 #define max(a, b) ((a) > (b) ? (a) : (b))
 640 #endif
 641 /* Set ATTRS to CODING's attributes and CHARSET_LIST to their charset list.  */
 642 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 643   do {                                                  \
 644     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 645     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 646   } while (0)
 647
 648
 649 /* Safely get one byte from the source text pointed by SRC which ends
 650    at SRC_END, and set C to that byte.  If no byte is left, jump to
 651    `no_more_source', recording CODING_RESULT_INSUFFICIENT_SRC first
 652    when SRC_BASE < SRC.  If multibytep is nonzero, a two-byte form
 653    led by 0xC0 or 0xC1 is combined into one value; any other
 654    multibyte character sets C to the negated character code and
 655    records CODING_RESULT_INVALID_SRC.  Increments consumed_chars.
 656    The caller provides: coding, src, src_end, src_base, multibytep.  */
 657 #define ONE_MORE_BYTE(c)                                \
 658   do {                                                  \
 659     if (src == src_end)                                 \
 660       {                                                 \
 661         if (src_base < src)                             \
 662           record_conversion_result                      \
 663             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 664         goto no_more_source;                            \
 665       }                                                 \
 666     c = *src++;                                         \
 667     if (multibytep && (c & 0x80))                       \
 668       {                                                 \
 669         if ((c & 0xFE) == 0xC0)                         \
 670           c = ((c & 1) << 6) | *src++;                  \
 671         else                                            \
 672           {                                             \
 673             src--;                                      \
 674             c = - string_char (src, &src, NULL);        \
 675             record_conversion_result                    \
 676               (coding, CODING_RESULT_INVALID_SRC);      \
 677           }                                             \
 678       }                                                 \
 679     consumed_chars++;                                   \
 680   } while (0)
 681
 682 /* Safely get two bytes from the source text pointed by SRC which ends
 683    at SRC_END, and set C1 and C2 to those bytes while skipping the
 684    heading multibyte characters.  If there are not enough bytes in the
 685    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 686    a multibyte character is found for C2, set C2 to the negative value
 687    of the character code.  The caller should declare and set these
 688    variables appropriately in advance:
 689         src, src_end, multibytep
 690    It is intended that this macro is used in detect_coding_utf_16.  */
 691
 692 #define TWO_MORE_BYTES(c1, c2)                          \
 693   do {                                                  \
 694     do {                                                \
 695       if (src == src_end)                               \
 696         goto no_more_source;                            \
 697       c1 = *src++;                                      \
 698       if (multibytep && (c1 & 0x80))                    \
 699         {                                               \
 700           if ((c1 & 0xFE) == 0xC0)                      \
 701             c1 = ((c1 & 1) << 6) | *src++;              \
 702           else                                          \
 703             {                                           \
 704               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 705               c1 = -1;                                  \
 706             }                                           \
 707         }                                               \
 708     } while (c1 < 0);                                   \
 709     if (src == src_end)                                 \
 710       goto no_more_source;                              \
 711     c2 = *src++;                                        \
 712     if (multibytep && (c2 & 0x80))                      \
 713       {                                                 \
 714         if ((c2 & 0xFE) == 0xC0)                        \
 715           c2 = ((c2 & 1) << 6) | *src++;                \
 716         else                                            \
 717           c2 = -1;                                      \
 718       }                                                 \
 719   } while (0)
 720
 721
 722 /* Store a byte C in the place pointed by DST and increment DST to the
 723    next free point, and increment PRODUCED_CHARS.  The caller should
 724    ensure that C is 0..127, and declare and set the variables `dst'
 725    and `produced_chars' appropriately in advance.
 726 */
 727
 728
 729 #define EMIT_ONE_ASCII_BYTE(c)  \
 730   do {                          \
 731     produced_chars++;           \
 732     *dst++ = (c);               \
 733   } while (0)
 734
 735
 736 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 737
 738 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 739   do {                                  \
 740     produced_chars += 2;                \
 741     *dst++ = (c1), *dst++ = (c2);       \
 742   } while (0)
 743
 744
 745 /* Store a byte C in the place pointed by DST and increment DST to the
 746    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 747    nonzero, store in an appropriate multibyte form.  The caller should
 748    declare and set the variables `dst' and `multibytep' appropriately
 749    in advance.  */
 750
 751 #define EMIT_ONE_BYTE(c)                \
 752   do {                                  \
 753     produced_chars++;                   \
 754     if (multibytep)                     \
 755       {                                 \
 756         int ch = (c);                   \
 757         if (ch >= 0x80)                 \
 758           ch = BYTE8_TO_CHAR (ch);      \
 759         CHAR_STRING_ADVANCE (ch, dst);  \
 760       }                                 \
 761     else                                \
 762       *dst++ = (c);                     \
 763   } while (0)
 764
 765
 766 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 767
 768 #define EMIT_TWO_BYTES(c1, c2)          \
 769   do {                                  \
 770     produced_chars += 2;                \
 771     if (multibytep)                     \
 772       {                                 \
 773         int ch;                         \
 774                                         \
 775         ch = (c1);                      \
 776         if (ch >= 0x80)                 \
 777           ch = BYTE8_TO_CHAR (ch);      \
 778         CHAR_STRING_ADVANCE (ch, dst);  \
 779         ch = (c2);                      \
 780         if (ch >= 0x80)                 \
 781           ch = BYTE8_TO_CHAR (ch);      \
 782         CHAR_STRING_ADVANCE (ch, dst);  \
 783       }                                 \
 784     else                                \
 785       {                                 \
 786         *dst++ = (c1);                  \
 787         *dst++ = (c2);                  \
 788       }                                 \
 789   } while (0)
 790
 791
 792 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 793   do {                                  \
 794     EMIT_ONE_BYTE (c1);                 \
 795     EMIT_TWO_BYTES (c2, c3);            \
 796   } while (0)
 797
 798
 799 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 800   do {                                          \
 801     EMIT_TWO_BYTES (c1, c2);                    \
 802     EMIT_TWO_BYTES (c3, c4);                    \
 803   } while (0)
 804
 805
 806 /* Prototypes for static functions.  */
 807 static void record_conversion_result (struct coding_system *coding,
 808                                       enum coding_result_code result);
 809 static int detect_coding_utf_8 (struct coding_system *,
 810                                 struct coding_detection_info *info);
 811 static void decode_coding_utf_8 (struct coding_system *);
 812 static int encode_coding_utf_8 (struct coding_system *);
 813
 814 static int detect_coding_utf_16 (struct coding_system *,
 815                                  struct coding_detection_info *info);
 816 static void decode_coding_utf_16 (struct coding_system *);
 817 static int encode_coding_utf_16 (struct coding_system *);
 818
 819 static int detect_coding_iso_2022 (struct coding_system *,
 820                                    struct coding_detection_info *info);
 821 static void decode_coding_iso_2022 (struct coding_system *);
 822 static int encode_coding_iso_2022 (struct coding_system *);
 823
 824 static int detect_coding_emacs_mule (struct coding_system *,
 825                                      struct coding_detection_info *info);
 826 static void decode_coding_emacs_mule (struct coding_system *);
 827 static int encode_coding_emacs_mule (struct coding_system *);
 828
 829 static int detect_coding_sjis (struct coding_system *,
 830                                struct coding_detection_info *info);
 831 static void decode_coding_sjis (struct coding_system *);
 832 static int encode_coding_sjis (struct coding_system *);
 833
 834 static int detect_coding_big5 (struct coding_system *,
 835                                struct coding_detection_info *info);
 836 static void decode_coding_big5 (struct coding_system *);
 837 static int encode_coding_big5 (struct coding_system *);
 838
 839 static int detect_coding_ccl (struct coding_system *,
 840                               struct coding_detection_info *info);
 841 static void decode_coding_ccl (struct coding_system *);
 842 static int encode_coding_ccl (struct coding_system *);
 843
 844 static void decode_coding_raw_text (struct coding_system *);
 845 static int encode_coding_raw_text (struct coding_system *);
 846
 847 static void coding_set_source (struct coding_system *);
 848 static void coding_set_destination (struct coding_system *);
 849 static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
 850 static void coding_alloc_by_making_gap (struct coding_system *,
 851                                         EMACS_INT, EMACS_INT);
 852 static unsigned char *alloc_destination (struct coding_system *,
 853                                          EMACS_INT, unsigned char *);
 854 static void setup_iso_safe_charsets (Lisp_Object);
 855 static unsigned char *encode_designation_at_bol (struct coding_system *,
 856                                                  int *, int *,
 857                                                  unsigned char *);
 858 static int detect_eol (const unsigned char *,
 859                        EMACS_INT, enum coding_category);
 860 static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
 861 static void decode_eol (struct coding_system *);
 862 static Lisp_Object get_translation_table (Lisp_Object, int, int *);
 863 static Lisp_Object get_translation (Lisp_Object, int *, int *);
 864 static int produce_chars (struct coding_system *, Lisp_Object, int);
 865 static INLINE void produce_charset (struct coding_system *, int *,
 866                                     EMACS_INT);
 867 static void produce_annotation (struct coding_system *, EMACS_INT);
 868 static int decode_coding (struct coding_system *);
 869 static INLINE int *handle_composition_annotation (EMACS_INT, EMACS_INT,
 870                                                   struct coding_system *,
 871                                                   int *, EMACS_INT *);
 872 static INLINE int *handle_charset_annotation (EMACS_INT, EMACS_INT,
 873                                               struct coding_system *,
 874                                               int *, EMACS_INT *);
 875 static void consume_chars (struct coding_system *, Lisp_Object, int);
 876 static int encode_coding (struct coding_system *);
 877 static Lisp_Object make_conversion_work_buffer (int);
 878 static Lisp_Object code_conversion_restore (Lisp_Object);
 879 static INLINE int char_encodable_p (int, Lisp_Object);
 880 static Lisp_Object make_subsidiaries (Lisp_Object);
 881
 882 static void
 883 record_conversion_result (struct coding_system *coding,
 884                           enum coding_result_code result)
 885 {
 886   coding->result = result;
 887   switch (result)
 888     {
 889     case CODING_RESULT_INSUFFICIENT_SRC:
 890       Vlast_code_conversion_error = Qinsufficient_source;
 891       break;
 892     case CODING_RESULT_INCONSISTENT_EOL:
 893       Vlast_code_conversion_error = Qinconsistent_eol;
 894       break;
 895     case CODING_RESULT_INVALID_SRC:
 896       Vlast_code_conversion_error = Qinvalid_source;
 897       break;
 898     case CODING_RESULT_INTERRUPT:
 899       Vlast_code_conversion_error = Qinterrupted;
 900       break;
 901     case CODING_RESULT_INSUFFICIENT_MEM:
 902       Vlast_code_conversion_error = Qinsufficient_memory;
 903       break;
 904     case CODING_RESULT_INSUFFICIENT_DST:
 905       /* Don't record this error in Vlast_code_conversion_error
 906          because it happens just temporarily and is resolved when the
 907          whole conversion is finished.  */
 908       break;
 909     case CODING_RESULT_SUCCESS:
 910       break;
 911     default:
 912       Vlast_code_conversion_error = intern ("Unknown error");
 913     }
 914 }
 915
 916 /* This wrapper macro is used to preserve validity of pointers into
 917    buffer text across calls to decode_char, which could cause
 918    relocation of buffers if it loads a charset map, because loading a
 919    charset map allocates large structures.  */
 920 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 921   do {                                                                       \
 922     charset_map_loaded = 0;                                                  \
 923     c = DECODE_CHAR (charset, code);                                         \
 924     if (charset_map_loaded)                                                  \
 925       {                                                                      \
 926         const unsigned char *orig = coding->source;                          \
 927         EMACS_INT offset;                                                    \
 928                                                                              \
 929         coding_set_source (coding);                                          \
 930         offset = coding->source - orig;                                      \
 931         src += offset;                                                       \
 932         src_base += offset;                                                  \
 933         src_end += offset;                                                   \
 934       }                                                                      \
 935   } while (0)
 936
 937
 938 /* If there are at least BYTES length of room at dst, allocate memory
 939    for coding->destination and update dst and dst_end.  We don't have
 940    to take care of coding->source which will be relocated.  It is
 941    handled by calling coding_set_source in encode_coding.  */
 942
 943 #define ASSURE_DESTINATION(bytes)                               \
 944   do {                                                          \
 945     if (dst + (bytes) >= dst_end)                               \
 946       {                                                         \
 947         int more_bytes = charbuf_end - charbuf + (bytes);       \
 948                                                                 \
 949         dst = alloc_destination (coding, more_bytes, dst);      \
 950         dst_end = coding->destination + coding->dst_bytes;      \
 951       }                                                         \
 952   } while (0)
 953
 954
 955 /* Store multibyte form of the character C in P, and advance P to the
 956    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 957    never calls MAYBE_UNIFY_CHAR.  */
 958
 959 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 960   do {                                          \
 961     if ((c) <= MAX_1_BYTE_CHAR)                 \
 962       *(p)++ = (c);                             \
 963     else if ((c) <= MAX_2_BYTE_CHAR)            \
 964       *(p)++ = (0xC0 | ((c) >> 6)),             \
 965         *(p)++ = (0x80 | ((c) & 0x3F));         \
 966     else if ((c) <= MAX_3_BYTE_CHAR)            \
 967       *(p)++ = (0xE0 | ((c) >> 12)),            \
 968         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 969         *(p)++ = (0x80 | ((c) & 0x3F));         \
 970     else if ((c) <= MAX_4_BYTE_CHAR)            \
 971       *(p)++ = (0xF0 | (c >> 18)),              \
 972         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 973         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 974         *(p)++ = (0x80 | (c & 0x3F));           \
 975     else if ((c) <= MAX_5_BYTE_CHAR)            \
 976       *(p)++ = 0xF8,                            \
 977         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
 978         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 979         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 980         *(p)++ = (0x80 | (c & 0x3F));           \
 981     else                                        \
 982       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
 983   } while (0)
 984
 985
 986 /* Return the character code of character whose multibyte form is at
 987    P, and advance P to the end of the multibyte form.  This is like
 988    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.  */
 989
 990 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
 991   (!((p)[0] & 0x80)                                             \
 992    ? *(p)++                                                     \
 993    : ! ((p)[0] & 0x20)                                          \
 994    ? ((p) += 2,                                                 \
 995       ((((p)[-2] & 0x1F) << 6)                                  \
 996        | ((p)[-1] & 0x3F)                                       \
 997        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
 998    : ! ((p)[0] & 0x10)                                          \
 999    ? ((p) += 3,                                                 \
1000       ((((p)[-3] & 0x0F) << 12)                                 \
1001        | (((p)[-2] & 0x3F) << 6)                                \
1002        | ((p)[-1] & 0x3F)))                                     \
1003    : ! ((p)[0] & 0x08)                                          \
1004    ? ((p) += 4,                                                 \
1005       ((((p)[-4] & 0xF) << 18)                                  \
1006        | (((p)[-3] & 0x3F) << 12)                               \
1007        | (((p)[-2] & 0x3F) << 6)                                \
1008        | ((p)[-1] & 0x3F)))                                     \
1009    : ((p) += 5,                                                 \
1010       ((((p)[-4] & 0x3F) << 18)                                 \
1011        | (((p)[-3] & 0x3F) << 12)                               \
1012        | (((p)[-2] & 0x3F) << 6)                                \
1013        | ((p)[-1] & 0x3F))))
1014
1015
1016 static void
1017 coding_set_source (struct coding_system *coding)
1018 {
1019   if (BUFFERP (coding->src_object))
1020     {
1021       struct buffer *buf = XBUFFER (coding->src_object);
1022
1023       if (coding->src_pos < 0)
1024         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1025       else
1026         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1027     }
1028   else if (STRINGP (coding->src_object))
1029     {
1030       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1031     }
1032   else
1033     {
1034       /* Otherwise, the source is C string and is never relocated
1035          automatically.  Thus we don't have to update anything.  */
1036     }
1037 }
1038
1039 static void
1040 coding_set_destination (struct coding_system *coding)
1041 {
1042   if (BUFFERP (coding->dst_object))
1043     {
1044       if (coding->src_pos < 0)
1045         {
1046           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1047           coding->dst_bytes = (GAP_END_ADDR
1048                                - (coding->src_bytes - coding->consumed)
1049                                - coding->destination);
1050         }
1051       else
1052         {
1053           /* We are sure that coding->dst_pos_byte is before the gap
1054              of the buffer. */
1055           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1056                                  + coding->dst_pos_byte - BEG_BYTE);
1057           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1058                                - coding->destination);
1059         }
1060     }
1061   else
1062     {
1063       /* Otherwise, the destination is C string and is never relocated
1064          automatically.  Thus we don't have to update anything.  */
1065     }
1066 }
1067
1068
1069 static void
1070 coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
1071 {
1072   coding->destination = (unsigned char *) xrealloc (coding->destination,
1073                                                     coding->dst_bytes + bytes);
1074   coding->dst_bytes += bytes;
1075 }
1076
1077 static void
1078 coding_alloc_by_making_gap (struct coding_system *coding,
1079                             EMACS_INT gap_head_used, EMACS_INT bytes)
1080 {
1081   if (EQ (coding->src_object, coding->dst_object))
1082     {
1083       /* The gap may contain the produced data at the head and not-yet
1084          consumed data at the tail.  To preserve those data, we at
1085          first make the gap size to zero, then increase the gap
1086          size.  */
1087       EMACS_INT add = GAP_SIZE;
1088
1089       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1090       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1091       make_gap (bytes);
1092       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1093       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1094     }
1095   else
1096     {
1097       Lisp_Object this_buffer;
1098
1099       this_buffer = Fcurrent_buffer ();
1100       set_buffer_internal (XBUFFER (coding->dst_object));
1101       make_gap (bytes);
1102       set_buffer_internal (XBUFFER (this_buffer));
1103     }
1104 }
1105
1106
1107 static unsigned char *
1108 alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1109                    unsigned char *dst)
1110 {
1111   EMACS_INT offset = dst - coding->destination;
1112
1113   if (BUFFERP (coding->dst_object))
1114     {
1115       struct buffer *buf = XBUFFER (coding->dst_object);
1116
1117       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1118     }
1119   else
1120     coding_alloc_by_realloc (coding, nbytes);
1121   coding_set_destination (coding);
1122   dst = coding->destination + offset;
1123   return dst;
1124 }
1125
1126 /** Macros for annotations.  */
1127
/* Annotation data is stored in the array coding->charbuf in this
   format:
1130      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1131    LENGTH is the number of elements in the annotation.
1132    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1133    NCHARS is the number of characters in the text annotated.
1134
1135    The format of the following elements depend on ANNOTATION_MASK.
1136
   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1139      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1140
1141    NBYTES is the number of bytes specified in the header part of
1142    old-style emacs-mule encoding, or 0 for the other kind of
1143    composition.
1144
1145    METHOD is one of enum composition_method.
1146
1147    Optional COMPOSITION-COMPONENTS are characters and composition
1148    rules.
1149
1150    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1151    follows.
1152
   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1156
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three leading elements of an annotation at BUF: -LEN,
   MASK and NCHARS, advancing BUF past them, and set
   coding->annotated to 1.  The caller must have the variable
   `coding' in scope.

   The expansion has no trailing semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an `if' that has an `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1167
/* Store a composition annotation at BUF and advance BUF past it: the
   annotation header (length 5, CODING_ANNOTATE_COMPOSITION_MASK,
   NCHARS) followed by NBYTES and METHOD.  The arguments are
   parenthesized, as in ADD_ANNOTATION_DATA, so that BUF may be an
   expression such as `*pp'.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1174
1175
/* Store a charset annotation at BUF and advance BUF past it: the
   annotation header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  The arguments are parenthesized, as in
   ADD_ANNOTATION_DATA, so that BUF may be an expression such as
   `*pp'.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1181
1182 \f
1183 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1184
1185
1186
1187 \f
1188 /*** 3. UTF-8 ***/
1189
1190 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1191    Check if a text is encoded in UTF-8.  If it is, return 1, else
1192    return 0.  */
1193
1194 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1195 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1196 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1197 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1198 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1199 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1200
1201 #define UTF_8_BOM_1 0xEF
1202 #define UTF_8_BOM_2 0xBB
1203 #define UTF_8_BOM_3 0xBF
1204
1205 static int
1206 detect_coding_utf_8 (struct coding_system *coding,
1207                      struct coding_detection_info *detect_info)
1208 {
1209   const unsigned char *src = coding->source, *src_base;
1210   const unsigned char *src_end = coding->source + coding->src_bytes;
1211   int multibytep = coding->src_multibyte;
1212   int consumed_chars = 0;
1213   int bom_found = 0;
1214   int found = 0;
1215
1216   detect_info->checked |= CATEGORY_MASK_UTF_8;
1217   /* A coding system of this category is always ASCII compatible.  */
1218   src += coding->head_ascii;
1219
1220   while (1)
1221     {
1222       int c, c1, c2, c3, c4;
1223
1224       src_base = src;
1225       ONE_MORE_BYTE (c);
1226       if (c < 0 || UTF_8_1_OCTET_P (c))
1227         continue;
1228       ONE_MORE_BYTE (c1);
1229       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1230         break;
1231       if (UTF_8_2_OCTET_LEADING_P (c))
1232         {
1233           found = 1;
1234           continue;
1235         }
1236       ONE_MORE_BYTE (c2);
1237       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1238         break;
1239       if (UTF_8_3_OCTET_LEADING_P (c))
1240         {
1241           found = 1;
1242           if (src_base == coding->source
1243               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1244             bom_found = 1;
1245           continue;
1246         }
1247       ONE_MORE_BYTE (c3);
1248       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1249         break;
1250       if (UTF_8_4_OCTET_LEADING_P (c))
1251         {
1252           found = 1;
1253           continue;
1254         }
1255       ONE_MORE_BYTE (c4);
1256       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1257         break;
1258       if (UTF_8_5_OCTET_LEADING_P (c))
1259         {
1260           found = 1;
1261           continue;
1262         }
1263       break;
1264     }
1265   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1266   return 0;
1267
1268  no_more_source:
1269   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1270     {
1271       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1272       return 0;
1273     }
1274   if (bom_found)
1275     {
1276       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1277       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1278     }
1279   else
1280     {
1281       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1282       if (found)
1283         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1284     }
1285   return 1;
1286 }
1287
1288
1289 static void
1290 decode_coding_utf_8 (struct coding_system *coding)
1291 {
1292   const unsigned char *src = coding->source + coding->consumed;
1293   const unsigned char *src_end = coding->source + coding->src_bytes;
1294   const unsigned char *src_base;
1295   int *charbuf = coding->charbuf + coding->charbuf_used;
1296   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1297   int consumed_chars = 0, consumed_chars_base = 0;
1298   int multibytep = coding->src_multibyte;
1299   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1300   Lisp_Object attr, charset_list;
1301   int eol_dos =
1302     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1303   int byte_after_cr = -1;
1304
1305   CODING_GET_INFO (coding, attr, charset_list);
1306
1307   if (bom != utf_without_bom)
1308     {
1309       int c1, c2, c3;
1310
1311       src_base = src;
1312       ONE_MORE_BYTE (c1);
1313       if (! UTF_8_3_OCTET_LEADING_P (c1))
1314         src = src_base;
1315       else
1316         {
1317           ONE_MORE_BYTE (c2);
1318           if (! UTF_8_EXTRA_OCTET_P (c2))
1319             src = src_base;
1320           else
1321             {
1322               ONE_MORE_BYTE (c3);
1323               if (! UTF_8_EXTRA_OCTET_P (c3))
1324                 src = src_base;
1325               else
1326                 {
1327                   if ((c1 != UTF_8_BOM_1)
1328                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1329                     src = src_base;
1330                   else
1331                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1332                 }
1333             }
1334         }
1335     }
1336   CODING_UTF_8_BOM (coding) = utf_without_bom;
1337
1338   while (1)
1339     {
1340       int c, c1, c2, c3, c4, c5;
1341
1342       src_base = src;
1343       consumed_chars_base = consumed_chars;
1344
1345       if (charbuf >= charbuf_end)
1346         {
1347           if (byte_after_cr >= 0)
1348             src_base--;
1349           break;
1350         }
1351
1352       if (byte_after_cr >= 0)
1353         c1 = byte_after_cr, byte_after_cr = -1;
1354       else
1355         ONE_MORE_BYTE (c1);
1356       if (c1 < 0)
1357         {
1358           c = - c1;
1359         }
1360       else if (UTF_8_1_OCTET_P (c1))
1361         {
1362           if (eol_dos && c1 == '\r')
1363             ONE_MORE_BYTE (byte_after_cr);
1364           c = c1;
1365         }
1366       else
1367         {
1368           ONE_MORE_BYTE (c2);
1369           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1370             goto invalid_code;
1371           if (UTF_8_2_OCTET_LEADING_P (c1))
1372             {
1373               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1374               /* Reject overlong sequences here and below.  Encoders
1375                  producing them are incorrect, they can be misleading,
1376                  and they mess up read/write invariance.  */
1377               if (c < 128)
1378                 goto invalid_code;
1379             }
1380           else
1381             {
1382               ONE_MORE_BYTE (c3);
1383               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1384                 goto invalid_code;
1385               if (UTF_8_3_OCTET_LEADING_P (c1))
1386                 {
1387                   c = (((c1 & 0xF) << 12)
1388                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1389                   if (c < 0x800
1390                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1391                     goto invalid_code;
1392                 }
1393               else
1394                 {
1395                   ONE_MORE_BYTE (c4);
1396                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1397                     goto invalid_code;
1398                   if (UTF_8_4_OCTET_LEADING_P (c1))
1399                     {
1400                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1401                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1402                     if (c < 0x10000)
1403                       goto invalid_code;
1404                     }
1405                   else
1406                     {
1407                       ONE_MORE_BYTE (c5);
1408                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1409                         goto invalid_code;
1410                       if (UTF_8_5_OCTET_LEADING_P (c1))
1411                         {
1412                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1413                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1414                                | (c5 & 0x3F));
1415                           if ((c > MAX_CHAR) || (c < 0x200000))
1416                             goto invalid_code;
1417                         }
1418                       else
1419                         goto invalid_code;
1420                     }
1421                 }
1422             }
1423         }
1424
1425       *charbuf++ = c;
1426       continue;
1427
1428     invalid_code:
1429       src = src_base;
1430       consumed_chars = consumed_chars_base;
1431       ONE_MORE_BYTE (c);
1432       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1433       coding->errors++;
1434     }
1435
1436  no_more_source:
1437   coding->consumed_char += consumed_chars_base;
1438   coding->consumed = src_base - coding->source;
1439   coding->charbuf_used = charbuf - coding->charbuf;
1440 }
1441
1442
1443 static int
1444 encode_coding_utf_8 (struct coding_system *coding)
1445 {
1446   int multibytep = coding->dst_multibyte;
1447   int *charbuf = coding->charbuf;
1448   int *charbuf_end = charbuf + coding->charbuf_used;
1449   unsigned char *dst = coding->destination + coding->produced;
1450   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1451   int produced_chars = 0;
1452   int c;
1453
1454   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1455     {
1456       ASSURE_DESTINATION (3);
1457       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1458       CODING_UTF_8_BOM (coding) = utf_without_bom;
1459     }
1460
1461   if (multibytep)
1462     {
1463       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1464
1465       while (charbuf < charbuf_end)
1466         {
1467           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1468
1469           ASSURE_DESTINATION (safe_room);
1470           c = *charbuf++;
1471           if (CHAR_BYTE8_P (c))
1472             {
1473               c = CHAR_TO_BYTE8 (c);
1474               EMIT_ONE_BYTE (c);
1475             }
1476           else
1477             {
1478               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1479               for (p = str; p < pend; p++)
1480                 EMIT_ONE_BYTE (*p);
1481             }
1482         }
1483     }
1484   else
1485     {
1486       int safe_room = MAX_MULTIBYTE_LENGTH;
1487
1488       while (charbuf < charbuf_end)
1489         {
1490           ASSURE_DESTINATION (safe_room);
1491           c = *charbuf++;
1492           if (CHAR_BYTE8_P (c))
1493             *dst++ = CHAR_TO_BYTE8 (c);
1494           else
1495             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1496           produced_chars++;
1497         }
1498     }
1499   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1500   coding->produced_char += produced_chars;
1501   coding->produced = dst - coding->destination;
1502   return 0;
1503 }
1504
1505
1506 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1507    Check if a text is encoded in one of UTF-16 based coding systems.
1508    If it is, return 1, else return 0.  */
1509
1510 #define UTF_16_HIGH_SURROGATE_P(val) \
1511   (((val) & 0xFC00) == 0xD800)
1512
1513 #define UTF_16_LOW_SURROGATE_P(val) \
1514   (((val) & 0xFC00) == 0xDC00)
1515
1516
     /* Inspect the source text of CODING for UTF-16 and record the outcome
        in DETECT_INFO.

        The first two bytes decide most of the result: FF FE marks the
        text as little endian and FE FF as big endian; either one also
        selects the "auto" category and rejects the opposite endianness
        and both no-signature categories.  Without such a signature the
        three signature-dependent categories are rejected and the rest of
        the text is scanned pairwise: once 128 distinct values have been
        seen among the even-position bytes the big-endian no-signature
        category is rejected, and likewise the odd-position bytes reject
        the little-endian no-signature category.

        Return 0 when UTF-16 is rejected outright and also whenever the
        no-signature scan ends by itself.  Return 1 when control reaches
        `no_more_source': after a signature has been found, or when
        TWO_MORE_BYTES jumps there (presumably on running out of source;
        the macro is defined elsewhere -- confirm).  */
1517 static int
1518 detect_coding_utf_16 (struct coding_system *coding,
1519                       struct coding_detection_info *detect_info)
1520 {
1521   const unsigned char *src = coding->source;
1522   const unsigned char *src_end = coding->source + coding->src_bytes;
1523   int multibytep = coding->src_multibyte;
1524   int c1, c2;
1525
1526   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* A final block whose src_chars count is odd is rejected.  */
1527   if (coding->mode & CODING_MODE_LAST_BLOCK
1528       && (coding->src_chars & 1))
1529     {
1530       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1531       return 0;
1532     }
1533
1534   TWO_MORE_BYTES (c1, c2);
1535   if ((c1 == 0xFF) && (c2 == 0xFE))
1536     {
1537       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1538                              | CATEGORY_MASK_UTF_16_AUTO);
1539       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1540                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1541                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1542     }
1543   else if ((c1 == 0xFE) && (c2 == 0xFF))
1544     {
1545       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1546                              | CATEGORY_MASK_UTF_16_AUTO);
1547       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1548                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1549                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1550     }
       /* NOTE(review): a negative C2 presumably means TWO_MORE_BYTES met
          something that is not a plain byte -- confirm against the macro.  */
1551   else if (c2 < 0)
1552     {
1553       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1554       return 0;
1555     }
1556   else
1557     {
1558       /* We check the dispersion of Eth and Oth bytes where E is even and
1559          O is odd.  If both are high, we assume binary data.  */
           /* e[B] / o[B] is nonzero once byte value B has been seen at an
              even / odd position; e_num / o_num count the distinct values.
              The first pair has already been read, hence the initial 1.  */
1560       unsigned char e[256], o[256];
1561       unsigned e_num = 1, o_num = 1;
1562
1563       memset (e, 0, 256);
1564       memset (o, 0, 256);
1565       e[c1] = 1;
1566       o[c2] = 1;
1567
1568       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1569                                 |CATEGORY_MASK_UTF_16_BE
1570                                 | CATEGORY_MASK_UTF_16_LE);
1571
           /* Keep scanning until every UTF-16 category has been rejected.  */
1572       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1573              != CATEGORY_MASK_UTF_16)
1574         {
1575           TWO_MORE_BYTES (c1, c2);
1576           if (c2 < 0)
1577             break;
1578           if (! e[c1])
1579             {
1580               e[c1] = 1;
1581               e_num++;
1582               if (e_num >= 128)
1583                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1584             }
1585           if (! o[c2])
1586             {
1587               o[c2] = 1;
1588               o_num++;
1589               if (o_num >= 128)
1590                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1591             }
1592         }
1593       return 0;
1594     }
1595
1596  no_more_source:
1597   return 1;
1598 }
1599
     /* Decode the UTF-16 source bytes of CODING, starting at offset
        CODING->consumed, and append the resulting character codes to
        CODING->charbuf.  On return CODING->consumed, CODING->consumed_char
        and CODING->charbuf_used describe how far decoding got.

        A high surrogate that is still waiting for its low surrogate is
        kept in CODING_UTF_16_SURROGATE (coding), so a pair may straddle
        two calls.  An unpaired high surrogate is flushed as its two bytes
        and counted in CODING->errors; the pending state is then cleared
        (or replaced, when the offending unit is itself a high surrogate)
        so that it is flushed exactly once.  */
1600 static void
1601 decode_coding_utf_16 (struct coding_system *coding)
1602 {
1603   const unsigned char *src = coding->source + coding->consumed;
1604   const unsigned char *src_end = coding->source + coding->src_bytes;
1605   const unsigned char *src_base;
1606   int *charbuf = coding->charbuf + coding->charbuf_used;
1607   /* We may produce at most 3 chars in one loop.  */
1608   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1609   int consumed_chars = 0, consumed_chars_base = 0;
1610   int multibytep = coding->src_multibyte;
1611   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1612   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1613   int surrogate = CODING_UTF_16_SURROGATE (coding);
1614   Lisp_Object attr, charset_list;
1615   int eol_dos =
1616     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes pre-read after a CR, or -1 when there are none.  */
1617   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1618
1619   CODING_GET_INFO (coding, attr, charset_list);
1620
1621   if (bom == utf_with_bom)
1622     {
1623       int c, c1, c2;
1624
1625       src_base = src;
1626       ONE_MORE_BYTE (c1);
1627       ONE_MORE_BYTE (c2);
1628       c = (c1 << 8) | c2;
1629
1630       if (endian == utf_16_big_endian
1631           ? c != 0xFEFF : c != 0xFFFE)
1632         {
1633           /* The first two bytes are not BOM.  Treat them as bytes
1634              for a normal character.  */
1635           src = src_base;
1636           coding->errors++;
1637         }
           /* The BOM is looked for only once per coding object.  */
1638       CODING_UTF_16_BOM (coding) = utf_without_bom;
1639     }
1640   else if (bom == utf_detect_bom)
1641     {
1642       /* We have already tried to detect BOM and failed in
1643          detect_coding.  */
1644       CODING_UTF_16_BOM (coding) = utf_without_bom;
1645     }
1646
1647   while (1)
1648     {
1649       int c, c1, c2;
1650
           /* Remember where this unit starts so that a unit cut short by
              `no_more_source' is not counted as consumed.  */
1651       src_base = src;
1652       consumed_chars_base = consumed_chars;
1653
1654       if (charbuf >= charbuf_end)
1655         {
               /* Give back the two bytes that were pre-read after a CR.  */
1656           if (byte_after_cr1 >= 0)
1657             src_base -= 2;
1658           break;
1659         }
1660
1661       if (byte_after_cr1 >= 0)
1662         c1 = byte_after_cr1, byte_after_cr1 = -1;
1663       else
1664         ONE_MORE_BYTE (c1);
           /* NOTE(review): a negative value from ONE_MORE_BYTE is stored
              back negated as one element; presumably it stands for a
              multibyte character met in the source -- confirm against
              the macro.  */
1665       if (c1 < 0)
1666         {
1667           *charbuf++ = -c1;
1668           continue;
1669         }
1670       if (byte_after_cr2 >= 0)
1671         c2 = byte_after_cr2, byte_after_cr2 = -1;
1672       else
1673         ONE_MORE_BYTE (c2);
1674       if (c2 < 0)
1675         {
1676           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1677           *charbuf++ = -c2;
1678           continue;
1679         }
1680       c = (endian == utf_16_big_endian
1681            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1682
1683       if (surrogate)
1684         {
1685           if (! UTF_16_LOW_SURROGATE_P (c))
1686             {
                   /* The pending high surrogate has no partner: flush its
                      two bytes in source order and count an error.  */
1687               if (endian == utf_16_big_endian)
1688                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1689               else
1690                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1691               *charbuf++ = c1;
1692               *charbuf++ = c2;
1693               coding->errors++;
1694               if (UTF_16_HIGH_SURROGATE_P (c))
1695                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1696               else
                     {
                       /* The stale surrogate has just been flushed, so
                          forget it.  Leaving it pending would flush it
                          again before every following unit and let it
                          pair with an unrelated low surrogate later.  */
1697                   *charbuf++ = c;
                       CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
                     }
1698             }
1699           else
1700             {
                   /* Combine the pair into a supplementary code point.  */
1701               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1702               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1703               *charbuf++ = 0x10000 + c;
1704             }
1705         }
1706       else
1707         {
1708           if (UTF_16_HIGH_SURROGATE_P (c))
1709             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1710           else
1711             {
1712               if (eol_dos && c == '\r')
1713                 {
                       /* Pre-read the unit after a CR; the two bytes are
                          replayed as C1 and C2 on the next iteration.
                          Presumably this keeps a CR from being consumed
                          without the unit that follows it -- confirm.  */
1714                   ONE_MORE_BYTE (byte_after_cr1);
1715                   ONE_MORE_BYTE (byte_after_cr2);
1716                 }
1717               *charbuf++ = c;
1718             }
1719         }
1720     }
1721
1722  no_more_source:
1723   coding->consumed_char += consumed_chars_base;
1724   coding->consumed = src_base - coding->source;
1725   coding->charbuf_used = charbuf - coding->charbuf;
1726 }
1727
     /* Encode the character codes in CODING->charbuf as UTF-16 and append
        the bytes to CODING->destination.

        Unless the BOM setting is utf_without_bom, a byte order mark is
        written first and the setting is switched to utf_without_bom.
        A character above MAX_UNICODE_CHAR is replaced by
        CODING->default_char.  Characters below 0x10000 are written as one
        16-bit unit, the others as a surrogate pair, in the byte order
        selected by CODING_UTF_16_ENDIAN.

        Record CODING_RESULT_SUCCESS, update CODING->produced and
        CODING->produced_char, and return 0.  ASSURE_DESTINATION and the
        EMIT_* macros are defined elsewhere; presumably they use DST,
        DST_END and PRODUCED_CHARS declared here -- confirm.  */
1728 static int
1729 encode_coding_utf_16 (struct coding_system *coding)
1730 {
1731   int multibytep = coding->dst_multibyte;
1732   int *charbuf = coding->charbuf;
1733   int *charbuf_end = charbuf + coding->charbuf_used;
1734   unsigned char *dst = coding->destination + coding->produced;
1735   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1736   int safe_room = 8;
1737   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1738   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1739   int produced_chars = 0;
1740   Lisp_Object attrs, charset_list;
1741   int c;
1742
1743   CODING_GET_INFO (coding, attrs, charset_list);
1744
1745   if (bom != utf_without_bom)
1746     {
1747       ASSURE_DESTINATION (safe_room);
1748       if (big_endian)
1749         EMIT_TWO_BYTES (0xFE, 0xFF);
1750       else
1751         EMIT_TWO_BYTES (0xFF, 0xFE);
           /* Later calls with this coding object must not repeat the BOM.  */
1752       CODING_UTF_16_BOM (coding) = utf_without_bom;
1753     }
1754
1755   while (charbuf < charbuf_end)
1756     {
1757       ASSURE_DESTINATION (safe_room);
1758       c = *charbuf++;
1759       if (c > MAX_UNICODE_CHAR)
1760         c = coding->default_char;
1761
1762       if (c < 0x10000)
1763         {
1764           if (big_endian)
1765             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1766           else
1767             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1768         }
1769       else
1770         {
               /* C1 is the high surrogate, C2 the low surrogate.  */
1771           int c1, c2;
1772
1773           c -= 0x10000;
1774           c1 = (c >> 10) + 0xD800;
1775           c2 = (c & 0x3FF) + 0xDC00;
1776           if (big_endian)
1777             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1778           else
1779             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1780         }
1781     }
1782   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1783   coding->produced = dst - coding->destination;
1784   coding->produced_char += produced_chars;
1785   return 0;
1786 }
1787
1788 \f
1789 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1790
1791 /* Emacs' internal format for representation of multiple character
1792    sets is a kind of multi-byte encoding, i.e. characters are
1793    represented by variable-length sequences of one-byte codes.
1794
1795    ASCII characters and control characters (e.g. `tab', `newline') are
1796    represented by one-byte sequences which are their ASCII codes, in
1797    the range 0x00 through 0x7F.
1798
1799    8-bit characters of the range 0x80..0x9F are represented by
1800    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1801    code + 0x20).
1802
1803    8-bit characters of the range 0xA0..0xFF are represented by
1804    one-byte sequences which are their 8-bit code.
1805
1806    The other characters are represented by a sequence of `base
1807    leading-code', optional `extended leading-code', and one or two
1808    `position-code's.  The length of the sequence is determined by the
1809    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1810    whereas extended leading-code and position-code take the range 0xA0
1811    through 0xFF.  See `charset.h' for more details about leading-code
1812    and position-code.
1813
1814    --- CODE RANGE of Emacs' internal format ---
1815    character set        range
1816    -------------        -----
1817    ascii                0x00..0x7F
1818    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1819    eight-bit-graphic    0xA0..0xFF
1820    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1821    ---------------------------------------------
1822
1823    As this is the internal character representation, the format is
1824    usually not used externally (i.e. in a file or in data sent to a
1825    process).  But, it is possible to have a text externally in this
1826    format (i.e. by encoding by the coding system `emacs-mule').
1827
1828    In that case, a sequence of one-byte codes has a slightly different
1829    form.
1830
1831    At first, all characters in eight-bit-control are represented by
1832    one-byte sequences which are their 8-bit code.
1833
1834    Next, character composition data are represented by the byte
1835    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1836    where,
1837         METHOD is 0xF2 plus one of composition method (enum
1838         composition_method),
1839
1840         BYTES is 0xA0 plus a byte length of this composition data,
1841
1842         CHARS is 0xA0 plus a number of characters composed by this
1843         data,
1844
1845         COMPONENTs are characters of multibyte form or composition
1846         rules encoded as two bytes of ASCII codes.
1847
1848    In addition, for backward compatibility, the following formats are
1849    also recognized as composition data on decoding.
1850
1851    0x80 MSEQ ...
1852    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1853
1854    Here,
1855         MSEQ is a multibyte form, but in one of these special formats:
1856           ASCII: 0xA0 ASCII_CODE+0x80,
1857           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1858         RULE is a one byte code of the range 0xA0..0xF0 that
1859         represents a composition rule.
1860   */
1861
     /* Table indexed by a byte value.  As used in this file, the entry for
        the first byte of an emacs-mule sequence is the total number of
        bytes in that sequence (1 through 4).  Presumably it is filled in
        during initialization elsewhere -- not visible from here.  */
1862 char emacs_mule_bytes[256];
1863
1864
1865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1866    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1867    else return 0.

        On a 0 return CATEGORY_MASK_EMACS_MULE is added to
        DETECT_INFO->rejected.  On a 1 return it is added to
        DETECT_INFO->found only if at least one composition or multi-byte
        sequence was actually seen; text without any leaves `found'
        untouched.  The first CODING->head_ascii bytes are skipped.  */
1868
1869 static int
1870 detect_coding_emacs_mule (struct coding_system *coding,
1871                           struct coding_detection_info *detect_info)
1872 {
1873   const unsigned char *src = coding->source, *src_base;
1874   const unsigned char *src_end = coding->source + coding->src_bytes;
1875   int multibytep = coding->src_multibyte;
1876   int consumed_chars = 0;
1877   int c;
1878   int found = 0;
1879
1880   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1881   /* A coding system of this category is always ASCII compatible.  */
1882   src += coding->head_ascii;
1883
       /* Leaving this loop with `break' rejects the category; the only
          other exit is the jump to `no_more_source' below.  */
1884   while (1)
1885     {
1886       src_base = src;
1887       ONE_MORE_BYTE (c);
           /* NOTE(review): a negative C presumably stands for a multibyte
              character in the source -- confirm against ONE_MORE_BYTE.  */
1888       if (c < 0)
1889         continue;
1890       if (c == 0x80)
1891         {
1892           /* Perhaps the start of composite character.  We simply skip
1893              it because analyzing it is too heavy for detecting.  But,
1894              at least, we check that the composite character
1895              consists of more than 4 bytes.  */
1896           const unsigned char *src_start;
1897
1898         repeat:
1899           src_start = src;
               /* Skip the run of bytes >= 0xA0 that follows the 0x80.  */
1900           do
1901             {
1902               ONE_MORE_BYTE (c);
1903             }
1904           while (c >= 0xA0);
1905
1906           if (src - src_start <= 4)
1907             break;
1908           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition starts right after this one.  */
1909           if (c == 0x80)
1910             goto repeat;
1911         }
1912
1913       if (c < 0x80)
1914         {
               /* ESC, SI and SO reject this category.  */
1915           if (c < 0x20
1916               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1917             break;
1918         }
1919       else
1920         {
               /* C starts a sequence of emacs_mule_bytes[c] bytes; every
                  following byte must be 0xA0 or above.  */
1921           int more_bytes = emacs_mule_bytes[c] - 1;
1922
1923           while (more_bytes > 0)
1924             {
1925               ONE_MORE_BYTE (c);
1926               if (c < 0xA0)
1927                 {
1928                   src--;        /* Unread the last byte.  */
1929                   break;
1930                 }
1931               more_bytes--;
1932             }
1933           if (more_bytes != 0)
1934             break;
1935           found = CATEGORY_MASK_EMACS_MULE;
1936         }
1937     }
1938   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1939   return 0;
1940
1941  no_more_source:
       /* Bytes were read past SRC_BASE when the source ran out, i.e. a
          sequence is incomplete; in the last block that is a rejection.  */
1942   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1943     {
1944       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1945       return 0;
1946     }
1947   detect_info->found |= found;
1948   return 1;
1949 }
1950
1951
1952 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1953    character.  If CMP_STATUS indicates that we must expect MSEQ or
1954    RULE described above, decode it and return the negative value of
1955    the decoded character or rule.  If an invalid byte is found, return
1956    -1.  If SRC is too short, return -2.

        On the other returns, store in *NBYTES the number of source bytes
        parsed and in *NCHARS the local consumed_chars count (presumably
        maintained by ONE_MORE_BYTE -- confirm).  When a character was
        decoded and ID is non-null, also store its charset id in *ID.
        Nothing is stored on the -1 and -2 returns.  */
1957
1958 static int
1959 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1960                  int *nbytes, int *nchars, int *id,
1961                  struct composition_status *cmp_status)
1962 {
1963   const unsigned char *src_end = coding->source + coding->src_bytes;
1964   const unsigned char *src_base = src;
1965   int multibytep = coding->src_multibyte;
1966   int charset_ID;
1967   unsigned code;
1968   int c;
1969   int consumed_chars = 0;
1970   int mseq_found = 0;
1971
1972   ONE_MORE_BYTE (c);
       /* NOTE(review): a negative C is returned negated without charset
          decoding; presumably it stands for a multibyte character in the
          source -- confirm against ONE_MORE_BYTE.  */
1973   if (c < 0)
1974     {
1975       c = -c;
1976       charset_ID = emacs_mule_charset[0];
1977     }
1978   else
1979     {
           /* A first byte of 0xA0 or above is valid only as an old-form
              composition component (MSEQ or RULE).  */
1980       if (c >= 0xA0)
1981         {
1982           if (cmp_status->state != COMPOSING_NO
1983               && cmp_status->old_form)
1984             {
1985               if (cmp_status->state == COMPOSING_CHAR)
1986                 {
                       /* MSEQ: 0xA0 is followed by ASCII_CODE + 0x80; any
                          other byte is LEADING_CODE + 0x20.  */
1987                   if (c == 0xA0)
1988                     {
1989                       ONE_MORE_BYTE (c);
1990                       c -= 0x80;
1991                       if (c < 0)
1992                         goto invalid_code;
1993                     }
1994                   else
1995                     c -= 0x20;
1996                   mseq_found = 1;
1997                 }
1998               else
1999                 {
                       /* Not expecting a character: hand the byte back
                          negated, as a rule.  */
2000                   *nbytes = src - src_base;
2001                   *nchars = consumed_chars;
2002                   return -c;
2003                 }
2004             }
2005           else
2006             goto invalid_code;
2007         }
2008
           /* Dispatch on the total length of the sequence started by C.  */
2009       switch (emacs_mule_bytes[c])
2010         {
2011         case 2:
2012           if ((charset_ID = emacs_mule_charset[c]) < 0)
2013             goto invalid_code;
2014           ONE_MORE_BYTE (c);
2015           if (c < 0xA0)
2016             goto invalid_code;
2017           code = c & 0x7F;
2018           break;
2019
2020         case 3:
2021           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2022               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2023             {
                   /* Private leading code: the next byte selects the
                      charset and one more byte gives the code.  */
2024               ONE_MORE_BYTE (c);
2025               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2026                 goto invalid_code;
2027               ONE_MORE_BYTE (c);
2028               if (c < 0xA0)
2029                 goto invalid_code;
2030               code = c & 0x7F;
2031             }
2032           else
2033             {
                   /* The leading code selects the charset and two more
                      bytes give the code.  */
2034               if ((charset_ID = emacs_mule_charset[c]) < 0)
2035                 goto invalid_code;
2036               ONE_MORE_BYTE (c);
2037               if (c < 0xA0)
2038                 goto invalid_code;
2039               code = (c & 0x7F) << 8;
2040               ONE_MORE_BYTE (c);
2041               if (c < 0xA0)
2042                 goto invalid_code;
2043               code |= c & 0x7F;
2044             }
2045           break;
2046
2047         case 4:
               /* The second byte selects the charset and two more bytes
                  give the code.  */
2048           ONE_MORE_BYTE (c);
2049           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2050             goto invalid_code;
2051           ONE_MORE_BYTE (c);
2052           if (c < 0xA0)
2053             goto invalid_code;
2054           code = (c & 0x7F) << 8;
2055           ONE_MORE_BYTE (c);
2056           if (c < 0xA0)
2057             goto invalid_code;
2058           code |= c & 0x7F;
2059           break;
2060
2061         case 1:
2062           code = c;
2063           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2064           break;
2065
2066         default:
2067           abort ();
2068         }
2069       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2070                           CHARSET_FROM_ID (charset_ID), code, c);
2071       if (c < 0)
2072         goto invalid_code;
2073     }
2074   *nbytes = src - src_base;
2075   *nchars = consumed_chars;
2076   if (id)
2077     *id = charset_ID;
       /* A character parsed from MSEQ is reported negated.  */
2078   return (mseq_found ? -c : c);
2079
2080  no_more_source:
2081   return -2;
2082
2083  invalid_code:
2084   return -1;
2085 }
2086
2087
2088 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2089
2090 /* Handle these composition sequences ('|': the end of header elements,
2091    BYTES and CHARS >= 0xA0):
2092
2093    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2094    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2095    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2096
2097    and these old forms:
2098
2099    (4) relative composition: 0x80 | MSEQ ... MSEQ
2100    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2101
2102    When the starter 0x80 and the following header elements are found,
2103    this annotation header is produced.
2104
2105         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2106
2107    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2109
2110    Then, upon reading the following elements, these codes are produced
2111    until the composition end is found:
2112
2113    (1) CHAR ... CHAR
2114    (2) ALT ... ALT CHAR ... CHAR
2115    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2116    (4) CHAR ... CHAR
2117    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2118
2119    When the composition end is found, LENGTH and NCHARS in the
2120    annotation header are updated as below:
2121
2122    (1) LENGTH: unchanged, NCHARS: unchanged
2123    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2125    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2126    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2127
2128    If an error is found while composing, the annotation header is
2129    changed to the original composition header (plus filler -1s) as
2130    below:
2131
2132    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2133    (5)          [ 0x80 0xFF -1 -1 -1 ]
2134
2135    and the sequence [ -2 DECODED-RULE ] is changed to the original
2136    byte sequence as below:
2137         o the original byte sequence is B: [ B -1 ]
2138         o the original byte sequence is B1 B2: [ B1 B2 ]
2139
2140    Most of the routines are implemented by macros because many
2141    variables and labels in the caller decode_coding_emacs_mule must be
2142    accessible, and they are usually called just once (and thus do not
2143    increase the size of the compiled object).  */
2144
2145 /* Decode a composition rule represented by C as a component of
2146    composition sequence of Emacs 20 style.  Set RULE to the decoded
2147    rule.

        C is modified in place: 0xA0 is subtracted and the result must be
        in 0..80, otherwise control jumps to the `invalid_code' label at
        the place of expansion.  The value is split into GREF = C / 9 and
        NREF = C % 9, a reference equal to 4 is replaced by 10, and both
        are combined with COMPOSITION_ENCODE_RULE.  */
2148
2149 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2150   do {                                                  \
2151     int gref, nref;                                     \
2152                                                         \
2153     c -= 0xA0;                                          \
2154     if (c < 0 || c >= 81)                               \
2155       goto invalid_code;                                \
2156     gref = c / 9, nref = c % 9;                         \
2157     if (gref == 4) gref = 10;                           \
2158     if (nref == 4) nref = 10;                           \
2159     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2160   } while (0)
2161
2162
2163 /* Decode a composition rule represented by C and the following byte
2164    at SRC as a component of composition sequence of Emacs 21 style.
2165    Set RULE to the decoded rule.

        GREF is C - 0x20; then C is overwritten by the next byte read
        with ONE_MORE_BYTE and NREF is that byte - 0x20.  Each must be in
        0..80, otherwise control jumps to the `invalid_code' label at the
        place of expansion.  */
2166
2167 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2168   do {                                                  \
2169     int gref, nref;                                     \
2170                                                         \
2171     gref = c - 0x20;                                    \
2172     if (gref < 0 || gref >= 81)                         \
2173       goto invalid_code;                                \
2174     ONE_MORE_BYTE (c);                                  \
2175     nref = c - 0x20;                                    \
2176     if (nref < 0 || nref >= 81)                         \
2177       goto invalid_code;                                \
2178     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2179   } while (0)
2180
2181
2182 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2183    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2184    byte length of this composition information, CHARS is the number of
2185    characters composed by this composition.

        On entry the variable `c' of the expansion site already holds the
        method byte; the BYTES and CHARS bytes are read here.  Control
        jumps to `invalid_code' when BYTES - 0xA0 is below 3, when the
        method is COMPOSITION_RELATIVE and BYTES - 0xA0 is not 4, or when
        CHARS - 0xA0 is not in 1..MAX_COMPOSITION_COMPONENTS - 1.
        Otherwise *cmp_status is set up for a new-form composition and
        the annotation header is added with ADD_COMPOSITION_DATA.  */
2186
2187 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2188   do {                                                                  \
2189     enum composition_method method = c - 0xF2;                          \
2190     int nbytes, nchars;                                                 \
2191                                                                         \
2192     ONE_MORE_BYTE (c);                                                  \
2193     if (c < 0)                                                          \
2194       goto invalid_code;                                                \
2195     nbytes = c - 0xA0;                                                  \
2196     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2197       goto invalid_code;                                                \
2198     ONE_MORE_BYTE (c);                                                  \
2199     nchars = c - 0xA0;                                                  \
2200     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2201       goto invalid_code;                                                \
2202     cmp_status->old_form = 0;                                           \
2203     cmp_status->method = method;                                        \
2204     if (method == COMPOSITION_RELATIVE)                                 \
2205       cmp_status->state = COMPOSING_CHAR;                               \
2206     else                                                                \
2207       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2208     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2209     cmp_status->nchars = nchars;                                        \
2210     cmp_status->ncomps = nbytes - 4;                                    \
2211     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2212   } while (0)
2213
2214
2215 /* Start of Emacs 20 style format for relative composition.

        Mark *cmp_status as an old-form COMPOSITION_RELATIVE composition
        that expects a character next, with zero characters and
        components so far, and add an annotation header whose NCHARS and
        NBYTES are 0 with ADD_COMPOSITION_DATA.  */
2216
2217 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2218   do {                                                          \
2219     cmp_status->old_form = 1;                                   \
2220     cmp_status->method = COMPOSITION_RELATIVE;                  \
2221     cmp_status->state = COMPOSING_CHAR;                         \
2222     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2223     cmp_status->nchars = cmp_status->ncomps = 0;                \
2224     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2225   } while (0)
2226
2227
2228 /* Start of Emacs 20 style format for rule-base composition.

        Same set-up as for the old-form relative composition, except that
        the method recorded in *cmp_status and in the annotation header
        is COMPOSITION_WITH_RULE.  */
2229
2230 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2231   do {                                                          \
2232     cmp_status->old_form = 1;                                   \
2233     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2234     cmp_status->state = COMPOSING_CHAR;                         \
2235     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2236     cmp_status->nchars = cmp_status->ncomps = 0;                \
2237     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2238   } while (0)
2239
2240
     /* Read the byte that follows a composition starter and begin the
        matching kind of composition:

          0xF2 + METHOD, for METHOD from COMPOSITION_RELATIVE through
            COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style;
          0xA0..0xBF otherwise: Emacs 20 style relative composition; the
            byte is itself the first component, so SRC is moved back to
            read it again;
          0xFF: Emacs 20 style rule-base composition.

        Any other byte, or a negative value from ONE_MORE_BYTE, jumps to
        the `invalid_code' label at the place of expansion.  */
2241 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2242   do {                                                  \
2243     const unsigned char *current_src = src;             \
2244                                                         \
2245     ONE_MORE_BYTE (c);                                  \
2246     if (c < 0)                                          \
2247       goto invalid_code;                                \
2248     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2249         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2250       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2251     else if (c < 0xA0)                                  \
2252       goto invalid_code;                                \
2253     else if (c < 0xC0)                                  \
2254       {                                                 \
2255         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2256         /* Re-read C as a composition component.  */    \
2257         src = current_src;                              \
2258       }                                                 \
2259     else if (c == 0xFF)                                 \
2260       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2261     else                                                \
2262       goto invalid_code;                                \
2263   } while (0)
2264
     /* Close the current composition normally.  The annotation header is
        found cmp_status->length elements before CHARBUF.  For the old
        form, store cmp_status->nchars in header element 2; otherwise,
        for a method above COMPOSITION_RELATIVE, set header element 0 to
        header element 2 minus cmp_status->length.  Then reset the state
        to COMPOSING_NO.  */
2265 #define EMACS_MULE_COMPOSITION_END()                            \
2266   do {                                                          \
2267     int idx = - cmp_status->length;                             \
2268                                                                 \
2269     if (cmp_status->old_form)                                   \
2270       charbuf[idx + 2] = cmp_status->nchars;                    \
2271     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2272       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2273     cmp_status->state = COMPOSING_NO;                           \
2274   } while (0)
2275
2276
     /* Finish a composition that is still open, as described by
        CMP_STATUS.  CHARBUF points just past the elements produced so
        far; the annotation header starts CMP_STATUS->length elements
        before it.  Used through EMACS_MULE_MAYBE_FINISH_COMPOSITION.

        If the composition is of the old form and already has characters,
        it is kept: CMP_STATUS->nchars is stored in header element 2.
        For a rule-base composition still waiting for a character, the
        two elements before CHARBUF are additionally replaced by the
        8-bit character for the last rule plus 0xA0 and a -1 filler.

        Otherwise the header is overwritten in place with 8-bit
        characters for the original header bytes.

        Reset CMP_STATUS->state to COMPOSING_NO and return the count the
        caller adds to its character offset: 0 or 1 in the first case,
        1 or 4 in the second.  */
2277 static int
2278 emacs_mule_finish_composition (int *charbuf,
2279                                struct composition_status *cmp_status)
2280 {
2281   int idx = - cmp_status->length;
2282   int new_chars;
2283
2284   if (cmp_status->old_form && cmp_status->nchars > 0)
2285     {
2286       charbuf[idx + 2] = cmp_status->nchars;
2287       new_chars = 0;
2288       if (cmp_status->method == COMPOSITION_WITH_RULE
2289           && cmp_status->state == COMPOSING_CHAR)
2290         {
2291           /* The last rule was invalid.  */
2292           int rule = charbuf[-1] + 0xA0;
2293
2294           charbuf[-2] = BYTE8_TO_CHAR (rule);
2295           charbuf[-1] = -1;
2296           new_chars = 1;
2297         }
2298     }
2299   else
2300     {
           /* Header element 0 becomes the starter byte; IDX now indexes
              header element 1.  */
2301       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2302
2303       if (cmp_status->method == COMPOSITION_WITH_RULE)
2304         {
2305           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2306           charbuf[idx++] = -3;
2307           charbuf[idx++] = 0;
2308           new_chars = 1;
2309         }
2310       else
2311         {
               /* Fetch header elements 2 and 3 before the stores below
                  overwrite them.  */
2312           int nchars = charbuf[idx + 1] + 0xA0;
2313           int nbytes = charbuf[idx + 2] + 0xA0;
2314
2315           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2317           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2318           charbuf[idx++] = -1;
2319           new_chars = 4;
2320         }
2321     }
2322   cmp_status->state = COMPOSING_NO;
2323   return new_chars;
2324 }
2325
     /* If a composition is open (cmp_status->state is not COMPOSING_NO),
        finish it with emacs_mule_finish_composition and add the value it
        returns to the variable `char_offset' of the expansion site.  */
2326 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2327   do {                                                                    \
2328     if (cmp_status->state != COMPOSING_NO)                                \
2329       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2330   } while (0)
2331
2332
/* Decode the emacs-mule encoded bytes of CODING, from CODING->source +
   CODING->consumed up to CODING->source + CODING->src_bytes, appending
   the resulting character codes (together with charset and
   composition annotations) to CODING->charbuf.

   A composition left unfinished by a previous call is resumed from
   CODING->spec.emacs_mule.cmp_status.  A composition still unfinished
   when this call stops is finished if CODING_MODE_LAST_BLOCK is set,
   otherwise its elements are saved in the carryover buffer.  When a
   byte sequence is invalid, its first byte is emitted as a character
   of its own (ASCII as is, otherwise via BYTE8_TO_CHAR),
   CODING->errors is incremented, and decoding resumes after that
   byte.

   On exit CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  */
2333 static void
2334 decode_coding_emacs_mule (struct coding_system *coding)
2335 {
2336   const unsigned char *src = coding->source + coding->consumed;
2337   const unsigned char *src_end = coding->source + coding->src_bytes;
2338   const unsigned char *src_base;
2339   int *charbuf = coding->charbuf + coding->charbuf_used;
2340   /* We may produce two annotations (charset and composition) in one
2341      loop and one more charset annotation at the end.  */
2342   int *charbuf_end
2343     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2344   int consumed_chars = 0, consumed_chars_base;
2345   int multibytep = coding->src_multibyte;
2346   Lisp_Object attrs, charset_list;
2347   int char_offset = coding->produced_char;
2348   int last_offset = char_offset;
2349   int last_id = charset_ascii;
2350   int eol_dos =
2351     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2352   int byte_after_cr = -1;
2353   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2354
2355   CODING_GET_INFO (coding, attrs, charset_list);
2356
       /* A composition was in progress when the previous call stopped:
          replay the elements that were saved in the carryover buffer.  */
2357   if (cmp_status->state != COMPOSING_NO)
2358     {
2359       int i;
2360
2361       for (i = 0; i < cmp_status->length; i++)
2362         *charbuf++ = cmp_status->carryover[i];
2363       coding->annotated = 1;
2364     }
2365
2366   while (1)
2367     {
2368       int c, id IF_LINT (= 0);
2369
           /* Remember where this iteration started; only input before
              this point is reported as consumed if we stop here.  */
2370       src_base = src;
2371       consumed_chars_base = consumed_chars;
2372
           /* The output buffer (less the annotation reserve) is full.
              If a byte was peeked after a CR, back up so that it is
              read again by the next call.
              NOTE(review): consumed_chars_base is not decremented along
              with src_base here -- confirm that this is intended.  */
2373       if (charbuf >= charbuf_end)
2374         {
2375           if (byte_after_cr >= 0)
2376             src_base--;
2377           break;
2378         }
2379
           /* Replay the byte peeked after a CR, if any, before reading
              a new one.  */
2380       if (byte_after_cr >= 0)
2381         c = byte_after_cr, byte_after_cr = -1;
2382       else
2383         ONE_MORE_BYTE (c);
2384
           /* Either way, a composition in progress ends here.  A
              negative C is stored as -C (presumably a raw 8-bit byte
              that ONE_MORE_BYTE reports negated -- TODO confirm against
              that macro); 0x80 is handed to
              DECODE_EMACS_MULE_COMPOSITION_START.  */
2385       if (c < 0 || c == 0x80)
2386         {
2387           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2388           if (c < 0)
2389             {
2390               *charbuf++ = -c;
2391               char_offset++;
2392             }
2393           else
2394             DECODE_EMACS_MULE_COMPOSITION_START ();
2395           continue;
2396         }
2397
2398       if (c < 0x80)
2399         {
               /* With DOS-style EOL, peek at the byte following a CR
                  now; it is replayed at the top of the next iteration.  */
2400           if (eol_dos && c == '\r')
2401             ONE_MORE_BYTE (byte_after_cr);
2402           id = charset_ascii;
               /* An ASCII byte inside a composition: an old-form
                  composition ends here; otherwise, once component
                  characters are being read, it uses up one of the
                  remaining components.  */
2403           if (cmp_status->state != COMPOSING_NO)
2404             {
2405               if (cmp_status->old_form)
2406                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2407               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2408                 cmp_status->ncomps--;
2409             }
2410         }
2411       else
2412         {
2413           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2414           /* emacs_mule_char can load a charset map from a file, which
2415              allocates a large structure and might cause buffer text
2416              to be relocated as result.  Thus, we need to remember the
2417              original pointer to buffer text, and fix up all related
2418              pointers after the call.  */
2419           const unsigned char *orig = coding->source;
2420           EMACS_INT offset;
2421
2422           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2423                                cmp_status);
2424           offset = coding->source - orig;
2425           if (offset)
2426             {
2427               src += offset;
2428               src_base += offset;
2429               src_end += offset;
2430             }
               /* -1 means an invalid sequence.  -2 stops decoding here
                  (presumably because the source ends in the middle of a
                  sequence -- TODO confirm in emacs_mule_char).  Other
                  negative values are handled by the composition code
                  below.  */
2431           if (c < 0)
2432             {
2433               if (c == -1)
2434                 goto invalid_code;
2435               if (c == -2)
2436                 break;
2437             }
2438           src = src_base + nbytes;
2439           consumed_chars = consumed_chars_base + nchars;
2440           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2441             cmp_status->ncomps -= nchars;
2442         }
2443
2444       /* Now if C >= 0, we found a normally encoded character, if C <
2445          0, we found an old-style composition component character or
2446          rule.  */
2447
           /* Not composing: emit a charset annotation for the run that
              just ended whenever the charset changes (runs of ASCII
              are not annotated), then store the character.  */
2448       if (cmp_status->state == COMPOSING_NO)
2449         {
2450           if (last_id != id)
2451             {
2452               if (last_id != charset_ascii)
2453                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2454                                   last_id);
2455               last_id = id;
2456               last_offset = char_offset;
2457             }
2458           *charbuf++ = c;
2459           char_offset++;
2460         }
2461       else if (cmp_status->state == COMPOSING_CHAR)
2462         {
2463           if (cmp_status->old_form)
2464             {
                   /* Old form: a normally encoded character ends the
                      composition; a negative C is a component and is
                      stored as -C.  */
2465               if (c >= 0)
2466                 {
2467                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2468                   *charbuf++ = c;
2469                   char_offset++;
2470                 }
2471               else
2472                 {
2473                   *charbuf++ = -c;
2474                   cmp_status->nchars++;
2475                   cmp_status->length++;
2476                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2477                     EMACS_MULE_COMPOSITION_END ();
2478                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2479                     cmp_status->state = COMPOSING_RULE;
2480                 }
2481             }
2482           else
2483             {
                   /* New form: nchars counts down the characters still
                      expected; the composition ends when it reaches 0.  */
2484               *charbuf++ = c;
2485               cmp_status->length++;
2486               cmp_status->nchars--;
2487               if (cmp_status->nchars == 0)
2488                 EMACS_MULE_COMPOSITION_END ();
2489             }
2490         }
2491       else if (cmp_status->state == COMPOSING_RULE)
2492         {
2493           int rule;
2494
2495           if (c >= 0)
2496             {
2497               EMACS_MULE_COMPOSITION_END ();
2498               *charbuf++ = c;
2499               char_offset++;
2500             }
2501           else
2502             {
                   /* Store the decoded rule as the pair (-2, RULE) and
                      go back to expecting a character.  */
2503               c = -c;
2504               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2505               if (rule < 0)
2506                 goto invalid_code;
2507               *charbuf++ = -2;
2508               *charbuf++ = rule;
2509               cmp_status->length += 2;
2510               cmp_status->state = COMPOSING_CHAR;
2511             }
2512         }
2513       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2514         {
2515           *charbuf++ = c;
2516           cmp_status->length++;
               /* ncomps was already decremented above for this
                  character: 0 means all components have been read, a
                  negative value ends the composition.  */
2517           if (cmp_status->ncomps == 0)
2518             cmp_status->state = COMPOSING_CHAR;
2519           else if (cmp_status->ncomps > 0)
2520             {
2521               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2522                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2523             }
2524           else
2525             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2526         }
2527       else                      /* COMPOSING_COMPONENT_RULE */
2528         {
2529           int rule;
2530
2531           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2532           if (rule < 0)
2533             goto invalid_code;
2534           *charbuf++ = -2;
2535           *charbuf++ = rule;
2536           cmp_status->length += 2;
2537           cmp_status->ncomps--;
2538           if (cmp_status->ncomps > 0)
2539             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2540           else
2541             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2542         }
2543       continue;
2544
           /* Rewind to the start of the offending sequence and emit
              its first byte as a character of its own.  */
2545     invalid_code:
2546       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2547       src = src_base;
2548       consumed_chars = consumed_chars_base;
2549       ONE_MORE_BYTE (c);
2550       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2551       char_offset++;
2552       coding->errors++;
2553     }
2554
      /* Reached on leaving the loop above.  The label is not referenced
         explicitly in this function; presumably ONE_MORE_BYTE jumps
         here when the source is exhausted -- TODO confirm.  */
2555  no_more_source:
2556   if (cmp_status->state != COMPOSING_NO)
2557     {
2558       if (coding->mode & CODING_MODE_LAST_BLOCK)
2559         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2560       else
2561         {
2562           int i;
2563
               /* Withdraw the partial composition from charbuf and
                  save it so that the next call can replay it.  */
2564           charbuf -= cmp_status->length;
2565           for (i = 0; i < cmp_status->length; i++)
2566             cmp_status->carryover[i] = charbuf[i];
2567         }
2568     }
2569   if (last_id != charset_ascii)
2570     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report only the input that precedes the start of the last,
          unfinished iteration.  */
2571   coding->consumed_char += consumed_chars_base;
2572   coding->consumed = src_base - coding->source;
2573   coding->charbuf_used = charbuf - coding->charbuf;
2574 }
2575
2576
/* Store in CODES (an array of at least two bytes) the leading code(s)
   that introduce a character of the charset whose emacs-mule id is ID.

   For ID below 0xA0 the id itself is the only leading code and
   CODES[1] is set to 0.  Otherwise CODES[0] is one of the prefix bytes
   0x9A, 0x9B, 0x9C or 0x9D, selected by the range ID falls in, and
   CODES[1] is ID.

   ID and CODES are evaluated more than once, so they must not have
   side effects.  The expansion deliberately has no trailing semicolon
   so that the macro behaves like a single statement, e.g. in an
   unbraced `if ... else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2590
2591
/* Encode the CODING->charbuf_used elements of CODING->charbuf into
   emacs-mule byte sequences, writing them starting at
   CODING->destination + CODING->produced.

   A negative element of charbuf introduces an annotation.  A charset
   annotation selects the charset preferred for the characters that
   follow, a composition annotation is skipped (not yet implemented),
   and any other kind aborts.  An ASCII character is emitted as one
   byte, and so is a character satisfying CHAR_BYTE8_P after
   conversion with CHAR_TO_BYTE8.  Any other character is emitted as
   the leading code(s) of its charset followed by one or two code
   bytes with the high bit set.  A character found in no charset of
   the list is replaced by CODING->default_char.

   On the path visible here the function records
   CODING_RESULT_SUCCESS, updates CODING->produced_char and
   CODING->produced, and returns 0.  */
2592 static int
2593 encode_coding_emacs_mule (struct coding_system *coding)
2594 {
2595   int multibytep = coding->dst_multibyte;
2596   int *charbuf = coding->charbuf;
2597   int *charbuf_end = charbuf + coding->charbuf_used;
2598   unsigned char *dst = coding->destination + coding->produced;
2599   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2600   int safe_room = 8;
2601   int produced_chars = 0;
2602   Lisp_Object attrs, charset_list;
2603   int c;
2604   int preferred_charset_id = -1;
2605
2606   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works with Vemacs_mule_charset_list;
          install it in ATTRS if some other list is there.  */
2607   if (! EQ (charset_list, Vemacs_mule_charset_list))
2608     {
2609       CODING_ATTR_CHARSET_LIST (attrs)
2610         = charset_list = Vemacs_mule_charset_list;
2611     }
2612
2613   while (charbuf < charbuf_end)
2614     {
           /* Presumably makes sure that at least SAFE_ROOM bytes are
              available at DST -- TODO confirm in ASSURE_DESTINATION.  */
2615       ASSURE_DESTINATION (safe_room);
2616       c = *charbuf++;
2617
2618       if (c < 0)
2619         {
2620           /* Handle an annotation.  */
               /* -C is the length of the whole annotation (see the
                  skip below); *charbuf is its type.  */
2621           switch (*charbuf)
2622             {
2623             case CODING_ANNOTATE_COMPOSITION_MASK:
2624               /* Not yet implemented.  */
2625               break;
2626             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Use the annotated charset only if it is a member
                      of the charset list.
                      NOTE(review): the index 3 is relative to the type
                      element; verify it against the layout written by
                      ADD_CHARSET_DATA.  */
2627               preferred_charset_id = charbuf[3];
2628               if (preferred_charset_id >= 0
2629                   && NILP (Fmemq (make_number (preferred_charset_id),
2630                                   charset_list)))
2631                 preferred_charset_id = -1;
2632               break;
2633             default:
2634               abort ();
2635             }
               /* Skip the rest of the annotation: it has -C elements,
                  one of which (the length) is already consumed.  */
2636           charbuf += -c - 1;
2637           continue;
2638         }
2639
2640       if (ASCII_CHAR_P (c))
2641         EMIT_ONE_ASCII_BYTE (c);
2642       else if (CHAR_BYTE8_P (c))
2643         {
2644           c = CHAR_TO_BYTE8 (c);
2645           EMIT_ONE_BYTE (c);
2646         }
2647       else
2648         {
2649           struct charset *charset;
2650           unsigned code;
2651           int dimension;
2652           int emacs_mule_id;
2653           unsigned char leading_codes[2];
2654
               /* Try the preferred charset first; fall back to a
                  search of the whole charset list.  */
2655           if (preferred_charset_id >= 0)
2656             {
2657               charset = CHARSET_FROM_ID (preferred_charset_id);
2658               if (CHAR_CHARSET_P (c, charset))
2659                 code = ENCODE_CHAR (charset, c);
2660               else
2661                 charset = char_charset (c, charset_list, &code);
2662             }
2663           else
2664             charset = char_charset (c, charset_list, &code);
               /* No charset can encode C: substitute the default
                  character.
                  NOTE(review): CHARSET is not checked for null again
                  after this second lookup -- confirm that the default
                  character is always encodable.  */
2665           if (! charset)
2666             {
2667               c = coding->default_char;
2668               if (ASCII_CHAR_P (c))
2669                 {
2670                   EMIT_ONE_ASCII_BYTE (c);
2671                   continue;
2672                 }
2673               charset = char_charset (c, charset_list, &code);
2674             }
               /* Emit the leading code(s), then one or two code bytes
                  with the high bit set, depending on the dimension.  */
2675           dimension = CHARSET_DIMENSION (charset);
2676           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2677           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2678           EMIT_ONE_BYTE (leading_codes[0]);
2679           if (leading_codes[1])
2680             EMIT_ONE_BYTE (leading_codes[1]);
2681           if (dimension == 1)
2682             EMIT_ONE_BYTE (code | 0x80);
2683           else
2684             {
2685               code |= 0x8080;
2686               EMIT_ONE_BYTE (code >> 8);
2687               EMIT_ONE_BYTE (code & 0xFF);
2688             }
2689         }
2690     }
2691   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2692   coding->produced_char += produced_chars;
2693   coding->produced = dst - coding->destination;
2694   return 0;
2695 }
2696
2697 \f
2698 /*** 7. ISO2022 handlers ***/
2699
2700 /* The following note describes the coding system ISO2022 briefly.
2701    Since the intention of this note is to help understand the
2702    functions in this file, some parts are NOT ACCURATE or are OVERLY
2703    SIMPLIFIED.  For thorough understanding, please refer to the
2704    original document of ISO2022.  This is equivalent to the standard
2705    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2706
2707    ISO2022 provides many mechanisms to encode several character sets
2708    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2709    is encoded using bytes less than 128.  This may make the encoded
2710    text a little bit longer, but the text passes more easily through
2711    several types of gateway, some of which strip off the MSB (Most
2712    Significant Bit).
2713
2714    There are two kinds of character sets: control character sets and
2715    graphic character sets.  The former contain control characters such
2716    as `newline' and `escape' to provide control functions (control
2717    functions are also provided by escape sequences).  The latter
2718    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2719    two control character sets and many graphic character sets.
2720
2721    Graphic character sets are classified into one of the following
2722    four classes, according to the number of bytes (DIMENSION) and
2723    number of characters in one dimension (CHARS) of the set:
2724    - DIMENSION1_CHARS94
2725    - DIMENSION1_CHARS96
2726    - DIMENSION2_CHARS94
2727    - DIMENSION2_CHARS96
2728
2729    In addition, each character set is assigned an identification tag,
2730    unique for each set, called the "final character" (denoted as <F>
2731    hereafter).  The <F> of each character set is decided by ECMA(*)
2732    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2733    (0x30..0x3F are for private use only).
2734
2735    Note (*): ECMA = European Computer Manufacturers Association
2736
2737    Here are examples of graphic character sets [NAME(<F>)]:
2738         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2739         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2740         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2741         o DIMENSION2_CHARS96 -- none for the moment
2742
2743    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2744         C0 [0x00..0x1F] -- control character plane 0
2745         GL [0x20..0x7F] -- graphic character plane 0
2746         C1 [0x80..0x9F] -- control character plane 1
2747         GR [0xA0..0xFF] -- graphic character plane 1
2748
2749    A control character set is directly designated and invoked to C0 or
2750    C1 by an escape sequence.  The most common case is that:
2751    - ISO646's  control character set is designated/invoked to C0, and
2752    - ISO6429's control character set is designated/invoked to C1,
2753    and usually these designations/invocations are omitted in encoded
2754    text.  In a 7-bit environment, only C0 can be used, and a control
2755    character for C1 is encoded by an appropriate escape sequence to
2756    fit into the environment.  All control characters for C1 are
2757    defined to have corresponding escape sequences.
2758
2759    A graphic character set is at first designated to one of four
2760    graphic registers (G0 through G3), then these graphic registers are
2761    invoked to GL or GR.  These designations and invocations can be
2762    done independently.  The most common case is that G0 is invoked to
2763    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2764    these invocations and designations are omitted in encoded text.
2765    In a 7-bit environment, only GL can be used.
2766
2767    When a graphic character set of CHARS94 is invoked to GL, codes
2768    0x20 and 0x7F of the GL area work as control characters SPACE and
2769    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2770    be used.
2771
2772    There are two ways of invocation: locking-shift and single-shift.
2773    With locking-shift, the invocation lasts until the next different
2774    invocation, whereas with single-shift, the invocation affects the
2775    following character only and doesn't affect the locking-shift
2776    state.  Invocations are done by the following control characters or
2777    escape sequences:
2778
2779    ----------------------------------------------------------------------
2780    abbrev  function                  cntrl escape seq   description
2781    ----------------------------------------------------------------------
2782    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2783    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2784    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2785    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2786    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2787    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2788    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2789    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2790    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2791    ----------------------------------------------------------------------
2792    (*) These are not used by any known coding system.
2793
2794    Control characters for these functions are defined by macros
2795    ISO_CODE_XXX in `coding.h'.
2796
2797    Designations are done by the following escape sequences:
2798    ----------------------------------------------------------------------
2799    escape sequence      description
2800    ----------------------------------------------------------------------
2801    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2802    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2803    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2804    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2805    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2806    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2807    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2808    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2809    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2810    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2811    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2812    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2813    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2814    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2815    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2816    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2817    ----------------------------------------------------------------------
2818
2819    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2820    of dimension 1, chars 94, and final character <F>, etc...
2821
2822    Note (*): Although these designations are not allowed in ISO2022,
2823    Emacs accepts them on decoding, and produces them on encoding
2824    CHARS96 character sets in a coding system which is characterized as
2825    7-bit environment, non-locking-shift, and non-single-shift.
2826
2827    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2828    '(' must be omitted.  We refer to this as "short-form" hereafter.
2829
2830    Now you may notice that there are a lot of ways of encoding the
2831    same multilingual text in ISO2022.  Actually, there exist many
2832    coding systems such as Compound Text (used in X11's inter client
2833    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2834    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2835    localized platforms), and all of these are variants of ISO2022.
2836
2837    In addition to the above, Emacs handles two more kinds of escape
2838    sequences: ISO6429's direction specification and Emacs' private
2839    sequence for specifying character composition.
2840
2841    ISO6429's direction specification takes the following form:
2842         o CSI ']'      -- end of the current direction
2843         o CSI '0' ']'  -- end of the current direction
2844         o CSI '1' ']'  -- start of left-to-right text
2845         o CSI '2' ']'  -- start of right-to-left text
2846    The control character CSI (0x9B: control sequence introducer) is
2847    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2848
2849    Character composition specification takes the following form:
2850         o ESC '0' -- start relative composition
2851         o ESC '1' -- end composition
2852         o ESC '2' -- start rule-base composition (*)
2853         o ESC '3' -- start relative composition with alternate chars  (**)
2854         o ESC '4' -- start rule-base composition with alternate chars  (**)
2855   Since these are not standard escape sequences of any ISO standard,
2856   the use of them with these meanings is restricted to Emacs only.
2857
2858   (*) This form is used only in Emacs 20.7 and older versions,
2859   but newer versions can safely decode it.
2860   (**) This form is used only in Emacs 21.1 and newer versions,
2861   and older versions can't decode it.
2862
2863   Here's a list of example usages of these composition escape
2864   sequences (categorized by `enum composition_method').
2865
2866   COMPOSITION_RELATIVE:
2867         ESC 0 CHAR [ CHAR ] ESC 1
2868   COMPOSITION_WITH_RULE:
2869         ESC 2 CHAR [ RULE CHAR ] ESC 1
2870   COMPOSITION_WITH_ALTCHARS:
2871         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2872   COMPOSITION_WITH_RULE_ALTCHARS:
2873         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2874
/* Table of 256 `enum iso_code_class_type' entries; presumably indexed
   by byte value to classify each code for ISO2022 handling -- TODO
   confirm against the code that fills it.  */
2875 enum iso_code_class_type iso_code_class[256];
2876
/* Return nonzero if the charset whose id is ID is marked as safe in
   CODING, i.e. ID lies within 0..CODING->max_charset_id and the entry
   for ID in CODING->safe_charsets is not 255 (the value that
   setup_iso_safe_charsets uses to fill unsupported slots).

   A negative ID is never safe.  It has to be rejected here because
   callers may pass an id taken straight from a lookup table that holds
   negative values for unknown charsets, and indexing safe_charsets
   with it would read before the start of the table.  ID is evaluated
   more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2880
2881 static void
2882 setup_iso_safe_charsets (Lisp_Object attrs)
2883 {
2884   Lisp_Object charset_list, safe_charsets;
2885   Lisp_Object request;
2886   Lisp_Object reg_usage;
2887   Lisp_Object tail;
2888   int reg94, reg96;
2889   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2890   int max_charset_id;
2891
2892   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2893   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2894       && ! EQ (charset_list, Viso_2022_charset_list))
2895     {
2896       CODING_ATTR_CHARSET_LIST (attrs)
2897         = charset_list = Viso_2022_charset_list;
2898       ASET (attrs, coding_attr_safe_charsets, Qnil);
2899     }
2900
2901   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2902     return;
2903
2904   max_charset_id = 0;
2905   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2906     {
2907       int id = XINT (XCAR (tail));
2908       if (max_charset_id < id)
2909         max_charset_id = id;
2910     }
2911
2912   safe_charsets = make_uninit_string (max_charset_id + 1);
2913   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2914   request = AREF (attrs, coding_attr_iso_request);
2915   reg_usage = AREF (attrs, coding_attr_iso_usage);
2916   reg94 = XINT (XCAR (reg_usage));
2917   reg96 = XINT (XCDR (reg_usage));
2918
2919   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2920     {
2921       Lisp_Object id;
2922       Lisp_Object reg;
2923       struct charset *charset;
2924
2925       id = XCAR (tail);
2926       charset = CHARSET_FROM_ID (XINT (id));
2927       reg = Fcdr (Fassq (id, request));
2928       if (! NILP (reg))
2929         SSET (safe_charsets, XINT (id), XINT (reg));
2930       else if (charset->iso_chars_96)
2931         {
2932           if (reg96 < 4)
2933             SSET (safe_charsets, XINT (id), reg96);
2934         }
2935       else
2936         {
2937           if (reg94 < 4)
2938             SSET (safe_charsets, XINT (id), reg94);
2939         }
2940     }
2941   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2942 }
2943
2944
2945 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2946    Check if a text is encoded in one of ISO-2022 based coding systems.
2947    If it is, return 1, else return 0.  */
2948
2949 static int
2950 detect_coding_iso_2022 (struct coding_system *coding,
2951                         struct coding_detection_info *detect_info)
2952 {
2953   const unsigned char *src = coding->source, *src_base = src;
2954   const unsigned char *src_end = coding->source + coding->src_bytes;
2955   int multibytep = coding->src_multibyte;
2956   int single_shifting = 0;
2957   int id;
2958   int c, c1;
2959   int consumed_chars = 0;
2960   int i;
2961   int rejected = 0;
2962   int found = 0;
2963   int composition_count = -1;
2964
2965   detect_info->checked |= CATEGORY_MASK_ISO;
2966
2967   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2968     {
2969       struct coding_system *this = &(coding_categories[i]);
2970       Lisp_Object attrs, val;
2971
2972       if (this->id < 0)
2973         continue;
2974       attrs = CODING_ID_ATTRS (this->id);
2975       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2976           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2977         setup_iso_safe_charsets (attrs);
2978       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2979       this->max_charset_id = SCHARS (val) - 1;
2980       this->safe_charsets = SDATA (val);
2981     }
2982
2983   /* A coding system of this category is always ASCII compatible.  */
2984   src += coding->head_ascii;
2985
2986   while (rejected != CATEGORY_MASK_ISO)
2987     {
2988       src_base = src;
2989       ONE_MORE_BYTE (c);
2990       switch (c)
2991         {
2992         case ISO_CODE_ESC:
2993           if (inhibit_iso_escape_detection)
2994             break;
2995           single_shifting = 0;
2996           ONE_MORE_BYTE (c);
2997           if (c == 'N' || c == 'O')
2998             {
2999               /* ESC <Fe> for SS2 or SS3.  */
3000               single_shifting = 1;
3001               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3002             }
3003           else if (c == '1')
3004             {
3005               /* End of composition.  */
3006               if (composition_count < 0
3007                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3008                 /* Invalid */
3009                 break;
3010               composition_count = -1;
3011               found |= CATEGORY_MASK_ISO;
3012             }
3013           else if (c >= '0' && c <= '4')
3014             {
3015               /* ESC <Fp> for start/end composition.  */
3016               composition_count = 0;
3017             }
3018           else
3019             {
3020               if (c >= '(' && c <= '/')
3021                 {
3022                   /* Designation sequence for a charset of dimension 1.  */
3023                   ONE_MORE_BYTE (c1);
3024                   if (c1 < ' ' || c1 >= 0x80
3025                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3026                     /* Invalid designation sequence.  Just ignore.  */
3027                     break;
3028                 }
3029               else if (c == '$')
3030                 {
3031                   /* Designation sequence for a charset of dimension 2.  */
3032                   ONE_MORE_BYTE (c);
3033                   if (c >= '@' && c <= 'B')
3034                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3035                     id = iso_charset_table[1][0][c];
3036                   else if (c >= '(' && c <= '/')
3037                     {
3038                       ONE_MORE_BYTE (c1);
3039                       if (c1 < ' ' || c1 >= 0x80
3040                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3041                         /* Invalid designation sequence.  Just ignore.  */
3042                         break;
3043                     }
3044                   else
3045                     /* Invalid designation sequence.  Just ignore it.  */
3046                     break;
3047                 }
3048               else
3049                 {
3050                   /* Invalid escape sequence.  Just ignore it.  */
3051                   break;
3052                 }
3053
3054               /* We found a valid designation sequence for CHARSET.  */
3055               rejected |= CATEGORY_MASK_ISO_8BIT;
3056               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3057                                   id))
3058                 found |= CATEGORY_MASK_ISO_7;
3059               else
3060                 rejected |= CATEGORY_MASK_ISO_7;
3061               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3062                                   id))
3063                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3064               else
3065                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3066               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3067                                   id))
3068                 found |= CATEGORY_MASK_ISO_7_ELSE;
3069               else
3070                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3071               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3072                                   id))
3073                 found |= CATEGORY_MASK_ISO_8_ELSE;
3074               else
3075                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3076             }
3077           break;
3078
3079         case ISO_CODE_SO:
3080         case ISO_CODE_SI:
3081           /* Locking shift out/in.  */
3082           if (inhibit_iso_escape_detection)
3083             break;
3084           single_shifting = 0;
3085           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3086           break;
3087
3088         case ISO_CODE_CSI:
3089           /* Control sequence introducer.  */
3090           single_shifting = 0;
3091           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3092           found |= CATEGORY_MASK_ISO_8_ELSE;
3093           goto check_extra_latin;
3094
3095         case ISO_CODE_SS2:
3096         case ISO_CODE_SS3:
3097           /* Single shift.   */
3098           if (inhibit_iso_escape_detection)
3099             break;
3100           single_shifting = 0;
3101           rejected |= CATEGORY_MASK_ISO_7BIT;
3102           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3103               & CODING_ISO_FLAG_SINGLE_SHIFT)
3104             {
3105               found |= CATEGORY_MASK_ISO_8_1;
3106               single_shifting = 1;
3107             }
3108           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3109               & CODING_ISO_FLAG_SINGLE_SHIFT)
3110             {
3111               found |= CATEGORY_MASK_ISO_8_2;
3112               single_shifting = 1;
3113             }
3114           if (single_shifting)
3115             break;
3116         check_extra_latin:
3117           if (! VECTORP (Vlatin_extra_code_table)
3118               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3119             {
3120               rejected = CATEGORY_MASK_ISO;
3121               break;
3122             }
3123           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3124               & CODING_ISO_FLAG_LATIN_EXTRA)
3125             found |= CATEGORY_MASK_ISO_8_1;
3126           else
3127             rejected |= CATEGORY_MASK_ISO_8_1;
3128           rejected |= CATEGORY_MASK_ISO_8_2;
3129           break;
3130
3131         default:
3132           if (c < 0)
3133             continue;
3134           if (c < 0x80)
3135             {
3136               if (composition_count >= 0)
3137                 composition_count++;
3138               single_shifting = 0;
3139               break;
3140             }
3141           if (c >= 0xA0)
3142             {
3143               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3144               found |= CATEGORY_MASK_ISO_8_1;
3145               /* Check the length of succeeding codes of the range
3146                  0xA0..0xFF.  If the byte length is even, we include
3147                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3148                  only when we are not single shifting.  */
3149               if (! single_shifting
3150                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3151                 {
3152                   int len = 1;
3153                   while (src < src_end)
3154                     {
3155                       src_base = src;
3156                       ONE_MORE_BYTE (c);
3157                       if (c < 0xA0)
3158                         {
3159                           src = src_base;
3160                           break;
3161                         }
3162                       len++;
3163                     }
3164
3165                   if (len & 1 && src < src_end)
3166                     {
3167                       rejected |= CATEGORY_MASK_ISO_8_2;
3168                       if (composition_count >= 0)
3169                         composition_count += len;
3170                     }
3171                   else
3172                     {
3173                       found |= CATEGORY_MASK_ISO_8_2;
3174                       if (composition_count >= 0)
3175                         composition_count += len / 2;
3176                     }
3177                 }
3178               break;
3179             }
3180         }
3181     }
3182   detect_info->rejected |= CATEGORY_MASK_ISO;
3183   return 0;
3184
3185  no_more_source:
3186   detect_info->rejected |= rejected;
3187   detect_info->found |= (found & ~rejected);
3188   return 1;
3189 }
3190
3191
3192 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3193    escape sequence should be kept.

        The charset is looked up with ISO_CHARSET_TABLE (DIM, CHARS_96,
        FINAL).  If FINAL is outside '0'..127, the lookup yields a
        negative id, or SAFE_CHARSET_P rejects the id, the designation
        of REG is set to -2, CHARS_96 is set to -1 and the rest of the
        macro is skipped.  Otherwise the id is stored as the designation
        of REG, after replacing charset_jisx0201_roman by charset_ascii
        when CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978
        by charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        The expansion refers to a variable named `coding' at the point
        of use, assigns to CHARS_96 (so it must be an lvalue), and
        evaluates FINAL more than once.  */
3194 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3195   do {                                                                  \
3196     int id, prev;                                                       \
3197                                                                         \
3198     if (final < '0' || final >= 128                                     \
3199         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3200         || !SAFE_CHARSET_P (coding, id))                                \
3201       {                                                                 \
3202         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3203         chars_96 = -1;                                                  \
3204         break;                                                          \
3205       }                                                                 \
3206     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3207     if (id == charset_jisx0201_roman)                                   \
3208       {                                                                 \
3209         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3210           id = charset_ascii;                                           \
3211       }                                                                 \
3212     else if (id == charset_jisx0208_1978)                               \
3213       {                                                                 \
3214         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3215           id = charset_jisx0208;                                        \
3216       }                                                                 \
3217     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3218     /* If there was an invalid designation to REG previously, and this  \
3219        designation is ASCII to REG, we should keep this designation     \
3220        sequence.  */                                                    \
3221     if (prev == -2 && id == charset_ascii)                              \
3222       chars_96 = -1;                                                    \
3223   } while (0)
3224
3225
3226 /* Handle these composition sequences (ALT: alternate char):
3227
3228    (1) relative composition: ESC 0 CHAR ... ESC 1
3229    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3230    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3231    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3232
3233    When the start sequence (ESC 0/2/3/4) is found, this annotation
3234    header is produced.
3235
3236         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3237
3238    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3239    produced until the end sequence (ESC 1) is found:
3240
3241    (1) CHAR ... CHAR
3242    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3243    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3244    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3245
3246    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3247    annotation header are updated as below:
3248
3249    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3250    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3251    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3252    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3253
3254    If an error is found while composing, the annotation header is
3255    changed to:
3256
3257         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3258
3259    and the sequence [ -2 DECODED-RULE ] is changed to the original
3260    byte sequence as below:
3261         o the original byte sequence is B: [ B -1 ]
3262         o the original byte sequence is B1 B2: [ B1 B2 ]
3263    and the sequence [ -1 -1 ] is changed to the original byte
3264    sequence:
3265         [ ESC '0' ]
3266 */
3267
3268 /* Decode a composition rule C1 and maybe one more byte from the
3269    source, and set RULE to the encoded composition rule, NBYTES to the
3270    length of the composition rule.  If the rule is invalid, set RULE
3271    to some negative value.

        C1 is not a parameter: the expansion reads the variable `c1' at
        the point of use.  When C1 is below 32, RULE is left negative
        and NBYTES is not assigned.  A first byte in 32..112 is an
        old-format rule holding two references as (C1 - 32) / 9 and
        (C1 - 32) % 9, where the value 4 is mapped to 10.  Any larger
        first byte starts a new-format rule whose second byte is fetched
        with ONE_MORE_BYTE; 0x100 is added to a non-negative result so
        that the two formats can be told apart later.  */
3272
3273 #define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
3274   do {                                                                  \
3275     rule = c1 - 32;                                                     \
3276     if (rule < 0)                                                       \
3277       break;                                                            \
3278     if (rule < 81)              /* old format (before ver.21) */        \
3279       {                                                                 \
3280         int gref = (rule) / 9;                                          \
3281         int nref = (rule) % 9;                                          \
3282         if (gref == 4) gref = 10;                                       \
3283         if (nref == 4) nref = 10;                                       \
3284         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3285         nbytes = 1;                                                     \
3286       }                                                                 \
3287     else                        /* new format (after ver.21) */         \
3288       {                                                                 \
3289         int b;                                                          \
3290                                                                         \
3291         ONE_MORE_BYTE (b);                                              \
3292         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3293         if (rule >= 0)                                                  \
3294           rule += 0x100;   /* to distinguish it from the old format */  \
3295         nbytes = 2;                                                     \
3296       }                                                                 \
3297   } while (0)
3298
     /* Turn the encoded composition rule RULE back into the one or two
        codes it was read from, storing them in charbuf[idx] and
        charbuf[idx + 1].

        RULE below 0x100 is an old-format rule: a single code
        32 + GREF * 9 + NREF is stored (with the value 10 mapped back to
        4), followed by -1, and new_chars is increased by 1.  Otherwise
        it is a new-format rule: the codes 32 + 81 + GREF and 32 + NREF
        are stored and new_chars is increased by 2.

        The expansion uses the variables `charbuf', `idx' and
        `new_chars' at the point of use, does not advance `idx', and
        evaluates RULE more than once.

        NOTE(review): splitting RULE % 0x100 with / 12 and % 12
        presumably mirrors COMPOSITION_ENCODE_RULE, which is defined
        elsewhere -- confirm.  */
3299 #define ENCODE_COMPOSITION_RULE(rule)                           \
3300   do {                                                          \
3301     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3302                                                                 \
3303     if (rule < 0x100)           /* old format */                \
3304       {                                                         \
3305         if (gref == 10) gref = 4;                               \
3306         if (nref == 10) nref = 4;                               \
3307         charbuf[idx] = 32 + gref * 9 + nref;                    \
3308         charbuf[idx + 1] = -1;                                  \
3309         new_chars++;                                            \
3310       }                                                         \
3311     else                                /* new format */        \
3312       {                                                         \
3313         charbuf[idx] = 32 + 81 + gref;                          \
3314         charbuf[idx + 1] = 32 + nref;                           \
3315         new_chars += 2;                                         \
3316       }                                                         \
3317   } while (0)
3318
3319 /* Finish the current composition as invalid.

        CHARBUF is indexed with negative offsets starting at
        -CMP_STATUS->length, so the elements rewritten are the
        CMP_STATUS->length ones that precede CHARBUF.  The first five
        of them are overwritten with ESC, the digit of the start
        sequence for CMP_STATUS->method, and the values -2, 0, -1.  For
        methods at or above COMPOSITION_WITH_RULE the remaining elements
        are then scanned: a -2 marker and the rule after it are turned
        back into rule codes by ENCODE_COMPOSITION_RULE, and a -1
        element and its successor are replaced by ESC '0'.

        CMP_STATUS->state is reset to COMPOSING_NO.  The return value is
        CMP_STATUS->nchars plus the counts added for the restored rule
        and ESC '0' elements.  */
3320
3321 static int finish_composition (int *, struct composition_status *);
3322
3323 static int
3324 finish_composition (int *charbuf, struct composition_status *cmp_status)
3325 {
3326   int idx = - cmp_status->length;
3327   int new_chars;
3328
3329   /* Recover the original ESC sequence */
3330   charbuf[idx++] = ISO_CODE_ESC;
3331   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3332                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3333                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3334                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3335                     : '4');
       /* Fill the rest of the five-element header.
          NOTE(review): -2, 0 and -1 are presumably values that the
          consumer of charbuf does not turn into characters -- confirm.  */
3336   charbuf[idx++] = -2;
3337   charbuf[idx++] = 0;
3338   charbuf[idx++] = -1;
3339   new_chars = cmp_status->nchars;
3340   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3341     for (; idx < 0; idx++)
3342       {
3343         int elt = charbuf[idx];
3344
3345         if (elt == -2)
3346           {
                 /* ENCODE_COMPOSITION_RULE refers to the locals charbuf,
                    idx and new_chars by name; it rewrites charbuf[idx]
                    and charbuf[idx + 1].  */
3347             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
                 /* Step over the second rewritten element.  */
3348             idx++;
3349           }
3350         else if (elt == -1)
3351           {
                 /* Overwrite this -1 and the element after it.  */
3352             charbuf[idx++] = ISO_CODE_ESC;
3353             charbuf[idx] = '0';
3354             new_chars += 2;
3355           }
3356       }
3357   cmp_status->state = COMPOSING_NO;
3358   return new_chars;
3359 }
3360
3361 /* If characters are under composition, finish the composition.
        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition and add its return value to char_offset.
        The expansion uses the variables `cmp_status', `charbuf' and
        `char_offset' at the point of use.  */
3362 #define MAYBE_FINISH_COMPOSITION()                              \
3363   do {                                                          \
3364     if (cmp_status->state != COMPOSING_NO)                      \
3365       char_offset += finish_composition (charbuf, cmp_status);  \
3366   } while (0)
3367
3368 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3369
3370    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3371    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3372    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3373    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3374
3375    Produce this annotation sequence now:
3376
3377    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        NOTE(review): the five-element layout follows the composition
        overview comment earlier in this file and the five header
        elements rewritten by finish_composition; ADD_COMPOSITION_DATA
        is defined elsewhere -- confirm there.

        C1 is the byte that followed ESC.  ESC 0 seen while the
        components of an altchar or alt&rule composition are being read
        (state COMPOSING_COMPONENT_CHAR with COMPOSITION_WITH_ALTCHARS,
        or state COMPOSING_COMPONENT_RULE with
        COMPOSITION_WITH_RULE_ALTCHARS) does not start a new
        composition: two -1 elements are stored, the state becomes
        COMPOSING_CHAR and the length grows by 2.

        In every other case a composition still in progress is finished
        as invalid, the method is chosen from C1, the state becomes
        COMPOSING_CHAR when C1 <= '2' and COMPOSING_COMPONENT_CHAR
        otherwise, the header is added with ADD_COMPOSITION_DATA, the
        length is set to MAX_ANNOTATION_LENGTH, nchars and ncomps are
        cleared, and coding->annotated is set.

        The expansion uses the variables `charbuf', `cmp_status',
        `coding' and, through MAYBE_FINISH_COMPOSITION, `char_offset'
        at the point of use, and evaluates C1 more than once.
3378 */
3379
3380 #define DECODE_COMPOSITION_START(c1)                                       \
3381   do {                                                                     \
3382     if (c1 == '0'                                                          \
3383         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3384              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3385             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3386                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3387       {                                                                    \
3388         *charbuf++ = -1;                                                   \
3389         *charbuf++= -1;                                                    \
3390         cmp_status->state = COMPOSING_CHAR;                                \
3391         cmp_status->length += 2;                                           \
3392       }                                                                    \
3393     else                                                                   \
3394       {                                                                    \
3395         MAYBE_FINISH_COMPOSITION ();                                       \
3396         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3397                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3398                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3399                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3400         cmp_status->state                                                  \
3401           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3402         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3403         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3404         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3405         coding->annotated = 1;                                             \
3406       }                                                                    \
3407   } while (0)
3408
3409
3410 /* Handle composition end sequence ESC 1.

        The sequence is rejected -- the composition is finished as
        invalid and control jumps to the label `invalid_code' -- when no
        CHAR has been stored, or when the state does not fit the method:
        with COMPOSITION_WITH_RULE the state must not be COMPOSING_CHAR,
        with any other method it must be COMPOSING_CHAR.

        Otherwise the annotation header found cmp_status->length
        elements before charbuf is updated: its first element is
        decreased by ncomps + 2 (COMPOSITION_WITH_ALTCHARS) or by
        ncomps * 3 (COMPOSITION_WITH_RULE_ALTCHARS), and its third
        element is set to nchars.  Then char_offset is advanced by
        nchars and the state returns to COMPOSING_NO.

        The expansion uses the variables `charbuf', `cmp_status' and
        `char_offset', and needs the label `invalid_code', at the point
        of use.  */
3411
3412 #define DECODE_COMPOSITION_END()                                        \
3413   do {                                                                  \
3414     if (cmp_status->nchars == 0                                         \
3415         || ((cmp_status->state == COMPOSING_CHAR)                       \
3416             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3417       {                                                                 \
3418         MAYBE_FINISH_COMPOSITION ();                                    \
3419         goto invalid_code;                                              \
3420       }                                                                 \
3421     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3422       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3423     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3424       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3425     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3426     char_offset += cmp_status->nchars;                                  \
3427     cmp_status->state = COMPOSING_NO;                                   \
3428   } while (0)
3429
3430 /* Store a composition rule RULE in charbuf, and update cmp_status.
        The rule is stored as the pair -2, RULE, so the length grows by
        2.  The state is decremented; NOTE(review): this presumably
        steps from a *_RULE state back to the matching *_CHAR state,
        which depends on the order of the state enumerators defined
        elsewhere -- confirm.  */
3431
3432 #define STORE_COMPOSITION_RULE(rule)    \
3433   do {                                  \
3434     *charbuf++ = -2;                    \
3435     *charbuf++ = rule;                  \
3436     cmp_status->length += 2;            \
3437     cmp_status->state--;                \
3438   } while (0)
3439
3440 /* Store a composed char or a component char C in charbuf, and update
3441    cmp_status.

        The length grows by 1.  C counts as a composed char (nchars)
        when the state is COMPOSING_CHAR and as a component (ncomps)
        otherwise.  The state is incremented for COMPOSITION_WITH_RULE,
        and for COMPOSITION_WITH_RULE_ALTCHARS while the state is
        COMPOSING_COMPONENT_CHAR; NOTE(review): this presumably moves
        to the matching *_RULE state so that a rule is read next --
        confirm against the state enumerators defined elsewhere.  */
3442
3443 #define STORE_COMPOSITION_CHAR(c)                                       \
3444   do {                                                                  \
3445     *charbuf++ = (c);                                                   \
3446     cmp_status->length++;                                               \
3447     if (cmp_status->state == COMPOSING_CHAR)                            \
3448       cmp_status->nchars++;                                             \
3449     else                                                                \
3450       cmp_status->ncomps++;                                             \
3451     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3452         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3453             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3454       cmp_status->state++;                                              \
3455   } while (0)
3456
3457
3458 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3459
3460 static void
3461 decode_coding_iso_2022 (struct coding_system *coding)
3462 {
3463   const unsigned char *src = coding->source + coding->consumed;
3464   const unsigned char *src_end = coding->source + coding->src_bytes;
3465   const unsigned char *src_base;
3466   int *charbuf = coding->charbuf + coding->charbuf_used;
3467   /* We may produce two annotations (charset and composition) in one
3468      loop and one more charset annotation at the end.  */
3469   int *charbuf_end
3470     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3471   int consumed_chars = 0, consumed_chars_base;
3472   int multibytep = coding->src_multibyte;
3473   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3474   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3475   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3476   int charset_id_2, charset_id_3;
3477   struct charset *charset;
3478   int c;
3479   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3480   Lisp_Object attrs, charset_list;
3481   int char_offset = coding->produced_char;
3482   int last_offset = char_offset;
3483   int last_id = charset_ascii;
3484   int eol_dos =
3485     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3486   int byte_after_cr = -1;
3487   int i;
3488
3489   CODING_GET_INFO (coding, attrs, charset_list);
3490   setup_iso_safe_charsets (attrs);
3491   /* Charset list may have been changed.  */
3492   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3493   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3494
3495   if (cmp_status->state != COMPOSING_NO)
3496     {
3497       for (i = 0; i < cmp_status->length; i++)
3498         *charbuf++ = cmp_status->carryover[i];
3499       coding->annotated = 1;
3500     }
3501
3502   while (1)
3503     {
3504       int c1, c2, c3;
3505
3506       src_base = src;
3507       consumed_chars_base = consumed_chars;
3508
3509       if (charbuf >= charbuf_end)
3510         {
3511           if (byte_after_cr >= 0)
3512             src_base--;
3513           break;
3514         }
3515
3516       if (byte_after_cr >= 0)
3517         c1 = byte_after_cr, byte_after_cr = -1;
3518       else
3519         ONE_MORE_BYTE (c1);
3520       if (c1 < 0)
3521         goto invalid_code;
3522
3523       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3524         {
3525           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3526           char_offset++;
3527           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3528           continue;
3529         }
3530
3531       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3532         {
3533           if (c1 == ISO_CODE_ESC)
3534             {
3535               if (src + 1 >= src_end)
3536                 goto no_more_source;
3537               *charbuf++ = ISO_CODE_ESC;
3538               char_offset++;
3539               if (src[0] == '%' && src[1] == '@')
3540                 {
3541                   src += 2;
3542                   consumed_chars += 2;
3543                   char_offset += 2;
3544                   /* We are sure charbuf can contain two more chars. */
3545                   *charbuf++ = '%';
3546                   *charbuf++ = '@';
3547                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3548                 }
3549             }
3550           else
3551             {
3552               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3553               char_offset++;
3554             }
3555           continue;
3556         }
3557
3558       if ((cmp_status->state == COMPOSING_RULE
3559            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3560           && c1 != ISO_CODE_ESC)
3561         {
3562           int rule, nbytes;
3563
3564           DECODE_COMPOSITION_RULE (rule, nbytes);
3565           if (rule < 0)
3566             goto invalid_code;
3567           STORE_COMPOSITION_RULE (rule);
3568           continue;
3569         }
3570
3571       /* We produce at most one character.  */
3572       switch (iso_code_class [c1])
3573         {
3574         case ISO_0x20_or_0x7F:
3575           if (charset_id_0 < 0
3576               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3577             /* This is SPACE or DEL.  */
3578             charset = CHARSET_FROM_ID (charset_ascii);
3579           else
3580             charset = CHARSET_FROM_ID (charset_id_0);
3581           break;
3582
3583         case ISO_graphic_plane_0:
3584           if (charset_id_0 < 0)
3585             charset = CHARSET_FROM_ID (charset_ascii);
3586           else
3587             charset = CHARSET_FROM_ID (charset_id_0);
3588           break;
3589
3590         case ISO_0xA0_or_0xFF:
3591           if (charset_id_1 < 0
3592               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3593               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3594             goto invalid_code;
3595           /* This is a graphic character, we fall down ... */
3596
3597         case ISO_graphic_plane_1:
3598           if (charset_id_1 < 0)
3599             goto invalid_code;
3600           charset = CHARSET_FROM_ID (charset_id_1);
3601           break;
3602
3603         case ISO_control_0:
3604           if (eol_dos && c1 == '\r')
3605             ONE_MORE_BYTE (byte_after_cr);
3606           MAYBE_FINISH_COMPOSITION ();
3607           charset = CHARSET_FROM_ID (charset_ascii);
3608           break;
3609
3610         case ISO_control_1:
3611           goto invalid_code;
3612
3613         case ISO_shift_out:
3614           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3615               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3616             goto invalid_code;
3617           CODING_ISO_INVOCATION (coding, 0) = 1;
3618           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3619           continue;
3620
3621         case ISO_shift_in:
3622           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3623             goto invalid_code;
3624           CODING_ISO_INVOCATION (coding, 0) = 0;
3625           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3626           continue;
3627
3628         case ISO_single_shift_2_7:
3629           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3630             goto invalid_code;
3631         case ISO_single_shift_2:
3632           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3633             goto invalid_code;
3634           /* SS2 is handled as an escape sequence of ESC 'N' */
3635           c1 = 'N';
3636           goto label_escape_sequence;
3637
3638         case ISO_single_shift_3:
3639           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3640             goto invalid_code;
3641           /* SS3 is handled as an escape sequence of ESC 'O' */
3642           c1 = 'O';
3643           goto label_escape_sequence;
3644
3645         case ISO_control_sequence_introducer:
3646           /* CSI is handled as an escape sequence of ESC '[' ...  */
3647           c1 = '[';
3648           goto label_escape_sequence;
3649
3650         case ISO_escape:
3651           ONE_MORE_BYTE (c1);
3652         label_escape_sequence:
3653           /* Escape sequences handled here are invocation,
3654              designation, direction specification, and character
3655              composition specification.  */
3656           switch (c1)
3657             {
3658             case '&':           /* revision of following character set */
3659               ONE_MORE_BYTE (c1);
3660               if (!(c1 >= '@' && c1 <= '~'))
3661                 goto invalid_code;
3662               ONE_MORE_BYTE (c1);
3663               if (c1 != ISO_CODE_ESC)
3664                 goto invalid_code;
3665               ONE_MORE_BYTE (c1);
3666               goto label_escape_sequence;
3667
3668             case '$':           /* designation of 2-byte character set */
3669               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3670                 goto invalid_code;
3671               {
3672                 int reg, chars96;
3673
3674                 ONE_MORE_BYTE (c1);
3675                 if (c1 >= '@' && c1 <= 'B')
3676                   {     /* designation of JISX0208.1978, GB2312.1980,
3677                            or JISX0208.1980 */
3678                     reg = 0, chars96 = 0;
3679                   }
3680                 else if (c1 >= 0x28 && c1 <= 0x2B)
3681                   { /* designation of DIMENSION2_CHARS94 character set */
3682                     reg = c1 - 0x28, chars96 = 0;
3683                     ONE_MORE_BYTE (c1);
3684                   }
3685                 else if (c1 >= 0x2C && c1 <= 0x2F)
3686                   { /* designation of DIMENSION2_CHARS96 character set */
3687                     reg = c1 - 0x2C, chars96 = 1;
3688                     ONE_MORE_BYTE (c1);
3689                   }
3690                 else
3691                   goto invalid_code;
3692                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3693                 /* We must update these variables now.  */
3694                 if (reg == 0)
3695                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3696                 else if (reg == 1)
3697                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3698                 if (chars96 < 0)
3699                   goto invalid_code;
3700               }
3701               continue;
3702
3703             case 'n':           /* invocation of locking-shift-2 */
3704               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3705                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3706                 goto invalid_code;
3707               CODING_ISO_INVOCATION (coding, 0) = 2;
3708               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3709               continue;
3710
3711             case 'o':           /* invocation of locking-shift-3 */
3712               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3713                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3714                 goto invalid_code;
3715               CODING_ISO_INVOCATION (coding, 0) = 3;
3716               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3717               continue;
3718
3719             case 'N':           /* invocation of single-shift-2 */
3720               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3721                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3722                 goto invalid_code;
3723               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3724               if (charset_id_2 < 0)
3725                 charset = CHARSET_FROM_ID (charset_ascii);
3726               else
3727                 charset = CHARSET_FROM_ID (charset_id_2);
3728               ONE_MORE_BYTE (c1);
3729               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3730                 goto invalid_code;
3731               break;
3732
3733             case 'O':           /* invocation of single-shift-3 */
3734               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3735                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3736                 goto invalid_code;
3737               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3738               if (charset_id_3 < 0)
3739                 charset = CHARSET_FROM_ID (charset_ascii);
3740               else
3741                 charset = CHARSET_FROM_ID (charset_id_3);
3742               ONE_MORE_BYTE (c1);
3743               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3744                 goto invalid_code;
3745               break;
3746
3747             case '0': case '2': case '3': case '4': /* start composition */
3748               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3749                 goto invalid_code;
3750               if (last_id != charset_ascii)
3751                 {
3752                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3753                   last_id = charset_ascii;
3754                   last_offset = char_offset;
3755                 }
3756               DECODE_COMPOSITION_START (c1);
3757               continue;
3758
3759             case '1':           /* end composition */
3760               if (cmp_status->state == COMPOSING_NO)
3761                 goto invalid_code;
3762               DECODE_COMPOSITION_END ();
3763               continue;
3764
3765             case '[':           /* specification of direction */
3766               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3767                 goto invalid_code;
3768               /* For the moment, nested direction is not supported.
3769                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3770                  left-to-right, and nonzero means right-to-left.  */
3771               ONE_MORE_BYTE (c1);
3772               switch (c1)
3773                 {
3774                 case ']':       /* end of the current direction */
3775                   coding->mode &= ~CODING_MODE_DIRECTION;
3776
3777                 case '0':       /* end of the current direction */
3778                 case '1':       /* start of left-to-right direction */
3779                   ONE_MORE_BYTE (c1);
3780                   if (c1 == ']')
3781                     coding->mode &= ~CODING_MODE_DIRECTION;
3782                   else
3783                     goto invalid_code;
3784                   break;
3785
3786                 case '2':       /* start of right-to-left direction */
3787                   ONE_MORE_BYTE (c1);
3788                   if (c1 == ']')
3789                     coding->mode |= CODING_MODE_DIRECTION;
3790                   else
3791                     goto invalid_code;
3792                   break;
3793
3794                 default:
3795                   goto invalid_code;
3796                 }
3797               continue;
3798
3799             case '%':
3800               ONE_MORE_BYTE (c1);
3801               if (c1 == '/')
3802                 {
3803                   /* CTEXT extended segment:
3804                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3805                      We keep these bytes as is for the moment.
3806                      They may be decoded by post-read-conversion.  */
3807                   int dim, M, L;
3808                   int size;
3809
3810                   ONE_MORE_BYTE (dim);
3811                   if (dim < '0' || dim > '4')
3812                     goto invalid_code;
3813                   ONE_MORE_BYTE (M);
3814                   if (M < 128)
3815                     goto invalid_code;
3816                   ONE_MORE_BYTE (L);
3817                   if (L < 128)
3818                     goto invalid_code;
3819                   size = ((M - 128) * 128) + (L - 128);
3820                   if (charbuf + 6 > charbuf_end)
3821                     goto break_loop;
3822                   *charbuf++ = ISO_CODE_ESC;
3823                   *charbuf++ = '%';
3824                   *charbuf++ = '/';
3825                   *charbuf++ = dim;
3826                   *charbuf++ = BYTE8_TO_CHAR (M);
3827                   *charbuf++ = BYTE8_TO_CHAR (L);
3828                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3829                 }
3830               else if (c1 == 'G')
3831                 {
3832                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3833                      ESC % G --UTF-8-BYTES-- ESC % @
3834                      We keep these bytes as is for the moment.
3835                      They may be decoded by post-read-conversion.  */
3836                   if (charbuf + 3 > charbuf_end)
3837                     goto break_loop;
3838                   *charbuf++ = ISO_CODE_ESC;
3839                   *charbuf++ = '%';
3840                   *charbuf++ = 'G';
3841                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3842                 }
3843               else
3844                 goto invalid_code;
3845               continue;
3846               break;
3847
3848             default:
3849               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3850                 goto invalid_code;
3851               {
3852                 int reg, chars96;
3853
3854                 if (c1 >= 0x28 && c1 <= 0x2B)
3855                   { /* designation of DIMENSION1_CHARS94 character set */
3856                     reg = c1 - 0x28, chars96 = 0;
3857                     ONE_MORE_BYTE (c1);
3858                   }
3859                 else if (c1 >= 0x2C && c1 <= 0x2F)
3860                   { /* designation of DIMENSION1_CHARS96 character set */
3861                     reg = c1 - 0x2C, chars96 = 1;
3862                     ONE_MORE_BYTE (c1);
3863                   }
3864                 else
3865                   goto invalid_code;
3866                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3867                 /* We must update these variables now.  */
3868                 if (reg == 0)
3869                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3870                 else if (reg == 1)
3871                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3872                 if (chars96 < 0)
3873                   goto invalid_code;
3874               }
3875               continue;
3876             }
3877           break;
3878
3879         default:
3880           abort ();
3881         }
3882
3883       if (cmp_status->state == COMPOSING_NO
3884           && charset->id != charset_ascii
3885           && last_id != charset->id)
3886         {
3887           if (last_id != charset_ascii)
3888             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3889           last_id = charset->id;
3890           last_offset = char_offset;
3891         }
3892
3893       /* Now we know CHARSET and 1st position code C1 of a character.
3894          Produce a decoded character while getting 2nd and 3rd
3895          position codes C2, C3 if necessary.  */
3896       if (CHARSET_DIMENSION (charset) > 1)
3897         {
3898           ONE_MORE_BYTE (c2);
3899           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3900               || ((c1 & 0x80) != (c2 & 0x80)))
3901             /* C2 is not in a valid range.  */
3902             goto invalid_code;
3903           if (CHARSET_DIMENSION (charset) == 2)
3904             c1 = (c1 << 8) | c2;
3905           else
3906             {
3907               ONE_MORE_BYTE (c3);
3908               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3909                   || ((c1 & 0x80) != (c3 & 0x80)))
3910                 /* C3 is not in a valid range.  */
3911                 goto invalid_code;
3912               c1 = (c1 << 16) | (c2 << 8) | c2;
3913             }
3914         }
3915       c1 &= 0x7F7F7F;
3916       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3917       if (c < 0)
3918         {
3919           MAYBE_FINISH_COMPOSITION ();
3920           for (; src_base < src; src_base++, char_offset++)
3921             {
3922               if (ASCII_BYTE_P (*src_base))
3923                 *charbuf++ = *src_base;
3924               else
3925                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3926             }
3927         }
3928       else if (cmp_status->state == COMPOSING_NO)
3929         {
3930           *charbuf++ = c;
3931           char_offset++;
3932         }
3933       else if ((cmp_status->state == COMPOSING_CHAR
3934                 ? cmp_status->nchars
3935                 : cmp_status->ncomps)
3936                >= MAX_COMPOSITION_COMPONENTS)
3937         {
3938           /* Too long composition.  */
3939           MAYBE_FINISH_COMPOSITION ();
3940           *charbuf++ = c;
3941           char_offset++;
3942         }
3943       else
3944         STORE_COMPOSITION_CHAR (c);
3945       continue;
3946
3947     invalid_code:
3948       MAYBE_FINISH_COMPOSITION ();
3949       src = src_base;
3950       consumed_chars = consumed_chars_base;
3951       ONE_MORE_BYTE (c);
3952       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3953       char_offset++;
3954       coding->errors++;
3955       continue;
3956
3957     break_loop:
3958       break;
3959     }
3960
3961  no_more_source:
3962   if (cmp_status->state != COMPOSING_NO)
3963     {
3964       if (coding->mode & CODING_MODE_LAST_BLOCK)
3965         MAYBE_FINISH_COMPOSITION ();
3966       else
3967         {
3968           charbuf -= cmp_status->length;
3969           for (i = 0; i < cmp_status->length; i++)
3970             cmp_status->carryover[i] = charbuf[i];
3971         }
3972     }
3973   else if (last_id != charset_ascii)
3974     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3975   coding->consumed_char += consumed_chars_base;
3976   coding->consumed = src_base - coding->source;
3977   coding->charbuf_used = charbuf - coding->charbuf;
3978 }
3979
3980
3981 /* ISO2022 encoding stuff.  */
3982
3983 /*
3984    Just saying "ISO2022" is not enough for encoding; we have to
3985    specify more details.  In Emacs, each coding system of ISO2022
3986    variant has the following specifications:
3987         1. Initial designation to G0 thru G3.
3988         2. Allows short-form designation?
3989         3. ASCII should be designated to G0 before control characters?
3990         4. ASCII should be designated to G0 at end of line?
3991         5. 7-bit environment or 8-bit environment?
3992         6. Use locking-shift?
3993         7. Use Single-shift?
3994    And the following two are only for Japanese:
3995         8. Use ASCII in place of JIS0201-1976-Roman?
3996         9. Use JISX0208-1983 in place of JISX0208-1978?
3997    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3998    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3999    details.
4000 */
4001
4002 /* Produce codes (an escape sequence) designating CHARSET to graphic
4003    register REG at DST, and increment DST.  A multi-byte 94-charset
4004    designated to register 0 whose <final-char> is '@', 'A', or 'B' gets
4005    the short form (no intermediate char) unless CODING has
4006    CODING_ISO_FLAG_LONG_FORM set.  */
4007 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4008   do {                                                                  \
4009     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4010     const char *intermediate_char_94 = "()*+";  /* indexed by REG */    \
4011     const char *intermediate_char_96 = ",-./";  /* indexed by REG */    \
4012     int revision = -1;                                                  \
4013     /* REVISION stays -1 unless CODING enables revision numbers.  */    \
4014     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4015       revision = CHARSET_ISO_REVISION (charset);                        \
4016     /* Optional prefix: ESC & <'@' + revision>.  */                     \
4017     if (revision >= 0)                                                  \
4018       {                                                                 \
4019         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4020         EMIT_ONE_BYTE ('@' + revision);                                 \
4021       }                                                                 \
4022     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4023     if (CHARSET_DIMENSION (charset) == 1)                               \
4024       {                                                                 \
4025         int b;                                                          \
4026         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4027           b = intermediate_char_94[reg];                                \
4028         else                                                            \
4029           b = intermediate_char_96[reg];                                \
4030         EMIT_ONE_ASCII_BYTE (b);                                        \
4031       }                                                                 \
4032     else                                                                \
4033       {                                                                 \
4034         EMIT_ONE_ASCII_BYTE ('$');                                      \
4035         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4036           {                                                             \
4037             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4038                 || reg != 0                                             \
4039                 || final_char < '@' || final_char > 'B')                \
4040               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4041           }                                                             \
4042         else                                                            \
4043           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4044       }                                                                 \
4045     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4046     /* Record that REG now holds CHARSET.  */                           \
4047     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4048   } while (0)
4049
4050
4051 /* The following two macros produce codes for the ISO2022 single-shift
4052    functions (single-shift-2 and single-shift-3): the escape sequence
4053    ESC N / ESC O when CODING_ISO_FLAG_SEVEN_BITS is set, otherwise the
4054    control byte ISO_CODE_SS2 / ISO_CODE_SS3.  Both mark single-shifting.  */
4055 #define ENCODE_SINGLE_SHIFT_2                                           \
4056   do {                                                                  \
4057     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4058       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4059     else                                                                \
4060       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4061     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4062   } while (0)
4063
4064 /* Single-shift-3: ESC O in a 7-bit environment, else ISO_CODE_SS3.  */
4065 #define ENCODE_SINGLE_SHIFT_3                                           \
4066   do {                                                                  \
4067     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4068       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4069     else                                                                \
4070       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4071     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4072   } while (0)
4073
4074
4075 /* The following four macros produce codes (control character or
4076    escape sequence) for ISO2022 locking-shift functions (shift-in,
4077    shift-out, locking-shift-2, and locking-shift-3).  Each one also
4078    records in CODING which graphic register is invoked to plane 0.  */
4079 #define ENCODE_SHIFT_IN                                 \
4080   do {                                                  \
4081     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4082     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4083   } while (0)
4084
4085 /* Shift-out: emit SO and note that register 1 is invoked to plane 0.  */
4086 #define ENCODE_SHIFT_OUT                                \
4087   do {                                                  \
4088     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4089     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4090   } while (0)
4091
4092 /* Locking-shift-2: emit ESC n; register 2 is now invoked to plane 0.  */
4093 #define ENCODE_LOCKING_SHIFT_2                          \
4094   do {                                                  \
4095     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4096     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4097   } while (0)
4098
4099 /* Locking-shift-3 (LS3) is ESC o; ESC n is LS2.  Invokes G3 to plane 0.  */
4100 #define ENCODE_LOCKING_SHIFT_3                          \
4101   do {                                                  \
4102     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4103     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4104   } while (0)
4105
4106
4107 /* Produce codes for a DIMENSION1 character whose character set is
4108    CHARSET and whose position-code is C1.  Designation and invocation
4109    sequences are also produced in advance if necessary.  CHARSET must be
4110    an lvalue: it may be replaced by JISX0201-Roman (see below).  */
4111 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4112   do {                                                                  \
4113     int id = CHARSET_ID (charset);                                      \
4114     /* Substitute charset_jisx0201_roman for ASCII if so flagged.  */   \
4115     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4116         && id == charset_ascii)                                         \
4117       {                                                                 \
4118         id = charset_jisx0201_roman;                                    \
4119         charset = CHARSET_FROM_ID (id);                                 \
4120       }                                                                 \
4121     /* C1 is parenthesized so that any expression is masked whole.  */  \
4122     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4123       {                                                                 \
4124         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4125           EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
4126         else                                                            \
4127           EMIT_ONE_BYTE ((c1) | 0x80);                                  \
4128         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4129         break;                                                          \
4130       }                                                                 \
4131     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4132       {                                                                 \
4133         EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
4134         break;                                                          \
4135       }                                                                 \
4136     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4137       {                                                                 \
4138         EMIT_ONE_BYTE ((c1) | 0x80);                                    \
4139         break;                                                          \
4140       }                                                                 \
4141     else                                                                \
4142       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4143          must invoke it, or, at first, designate it to some graphic     \
4144          register.  Then repeat the loop to actually produce the        \
4145          character.  */                                                 \
4146       dst = encode_invocation_designation (charset, coding, dst,        \
4147                                            &produced_chars);            \
4148   } while (1)
4149
4150
4151 /* Produce codes for a DIMENSION2 character whose character set is
4152    CHARSET and whose position-codes are C1 and C2.  Designation and
4153    invocation codes are also produced in advance if necessary.  CHARSET
4154    must be an lvalue: it may be replaced by JISX0208-1978 (see below).  */
4155 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4156   do {                                                                  \
4157     int id = CHARSET_ID (charset);                                      \
4158     /* Substitute charset_jisx0208_1978 for JISX0208 if so flagged.  */ \
4159     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4160         && id == charset_jisx0208)                                      \
4161       {                                                                 \
4162         id = charset_jisx0208_1978;                                     \
4163         charset = CHARSET_FROM_ID (id);                                 \
4164       }                                                                 \
4165     /* Each `break' below leaves the loop once the bytes are emitted. */ \
4166     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4167       {                                                                 \
4168         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4169           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4170         else                                                            \
4171           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4172         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4173         break;                                                          \
4174       }                                                                 \
4175     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4176       {                                                                 \
4177         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4178         break;                                                          \
4179       }                                                                 \
4180     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4181       {                                                                 \
4182         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4183         break;                                                          \
4184       }                                                                 \
4185     else                                                                \
4186       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4187          must invoke it, or, at first, designate it to some graphic     \
4188          register.  Then repeat the loop to actually produce the        \
4189          character.  */                                                 \
4190       dst = encode_invocation_designation (charset, coding, dst,        \
4191                                            &produced_chars);            \
4192   } while (1)
4193 /* Produce codes for character C of CHARSET: get its code by ENCODE_CHAR,
4194    then emit it as one position-code or as the pair (code >> 8, low byte).  */
4195 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4196   do {                                                                     \
4197     int code = ENCODE_CHAR ((charset), (c));                               \
4198                                                                            \
4199     if (CHARSET_DIMENSION (charset) == 1)                                  \
4200       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4201     else                                                                   \
4202       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4203   } while (0)
4204
4205
4206 /* Produce designation and invocation codes at a place pointed by DST
4207    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4208    *P_NCHARS is copied into `produced_chars' on entry and stored back on
4209    return.  Return new DST.  */
4210 static unsigned char *
4211 encode_invocation_designation (struct charset *charset,
4212                                struct coding_system *coding,
4213                                unsigned char *dst, int *p_nchars)
4214 {
4215   int multibytep = coding->dst_multibyte; /* NOTE(review): presumably read by EMIT_* -- confirm */
4216   int produced_chars = *p_nchars; /* NOTE(review): presumably updated by EMIT_* -- confirm */
4217   int reg;                      /* graphic register number */
4218   int id = CHARSET_ID (charset);
4219
4220   /* At first, check designations.  */
4221   for (reg = 0; reg < 4; reg++)
4222     if (id == CODING_ISO_DESIGNATION (coding, reg))
4223       break;
4224
4225   if (reg >= 4)
4226     {
4227       /* CHARSET is not yet designated to any graphic registers.  */
4228       /* At first check the requested designation.  */
4229       reg = CODING_ISO_REQUEST (coding, id);
4230       if (reg < 0)
4231         /* Since CHARSET requests no special designation, designate it
4232            to graphic register 0.  */
4233         reg = 0;
4234
4235       ENCODE_DESIGNATION (charset, reg, coding);
4236     }
4237
4238   if (CODING_ISO_INVOCATION (coding, 0) != reg
4239       && CODING_ISO_INVOCATION (coding, 1) != reg)
4240     {
4241       /* Since the graphic register REG is not invoked to any graphic
4242          planes, invoke it to graphic plane 0.  */
4243       switch (reg)
4244         {
4245         case 0:                 /* graphic register 0 */
4246           ENCODE_SHIFT_IN;
4247           break;
4248
4249         case 1:                 /* graphic register 1 */
4250           ENCODE_SHIFT_OUT;
4251           break;
4252
4253         case 2:                 /* graphic register 2 */
4254           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4255             ENCODE_SINGLE_SHIFT_2;
4256           else
4257             ENCODE_LOCKING_SHIFT_2;
4258           break;
4259
4260         case 3:                 /* graphic register 3 */
4261           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4262             ENCODE_SINGLE_SHIFT_3;
4263           else
4264             ENCODE_LOCKING_SHIFT_3;
4265           break;
4266         }
4267     }
4268
4269   *p_nchars = produced_chars;
4270   return dst;
4271 }
4272
4273 /* Produce codes resetting the graphic planes and registers to the
4274    initial state: shift in unless plane 0 invokes register 0, then
4275    re-designate each register that differs from its initial charset.  */
4276 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4277   do {                                                                  \
4278     int reg;                                                            \
4279     struct charset *charset;                                            \
4280     /* Registers with a negative initial charset id are left alone.  */ \
4281     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4282       ENCODE_SHIFT_IN;                                                  \
4283     for (reg = 0; reg < 4; reg++)                                       \
4284       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4285           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4286               != CODING_ISO_INITIAL (coding, reg)))                     \
4287         {                                                               \
4288           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4289           ENCODE_DESIGNATION (charset, reg, coding);                    \
4290         }                                                               \
4291   } while (0)
4292
4293
4294 /* Produce designation sequences of charsets in the line starting at
4295    CHARBUF (scanning no further than CHARBUF_END) to a place pointed by
4296    DST, and return updated DST.
4297    If the current block ends before any end-of-line, we may fail to
4298    find all the necessary designations.  */
4299
4300 static unsigned char *
4301 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4302                            int *charbuf_end, unsigned char *dst)
4303 {
4304   struct charset *charset;
4305   /* Table of charsets to be designated to each graphic register.  */
4306   int r[4];
4307   int c, found = 0, reg;
4308   int produced_chars = 0;
4309   int multibytep = coding->dst_multibyte;
4310   Lisp_Object attrs;
4311   Lisp_Object charset_list;
4312
4313   attrs = CODING_ID_ATTRS (coding->id);
4314   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4315   if (EQ (charset_list, Qiso_2022))
4316     charset_list = Viso_2022_charset_list;
4317
4318   for (reg = 0; reg < 4; reg++)
4319     r[reg] = -1;
4320   /* Scan up to end of line or block for charsets requesting a register.  */
4321   while (charbuf < charbuf_end && found < 4)
4322     {
4323       int id;
4324
4325       c = *charbuf++;
4326       if (c == '\n')
4327         break;
4328       charset = char_charset (c, charset_list, NULL);
4329       id = CHARSET_ID (charset);
4330       reg = CODING_ISO_REQUEST (coding, id);
4331       if (reg >= 0 && r[reg] < 0)  /* first charset seen for REG wins */
4332         {
4333           found++;
4334           r[reg] = id;
4335         }
4336     }
4337
4338   if (found)
4339     {
4340       for (reg = 0; reg < 4; reg++)
4341         if (r[reg] >= 0
4342             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4343           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4344     }
4345
4346   return dst;
4347 }
4348
4349 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4350    Encodes CODING->charbuf into CODING->destination; returns 0 at its end.  */
4351 static int
4352 encode_coding_iso_2022 (struct coding_system *coding)
4353 {
4354   int multibytep = coding->dst_multibyte;
4355   int *charbuf = coding->charbuf;
4356   int *charbuf_end = charbuf + coding->charbuf_used;
4357   unsigned char *dst = coding->destination + coding->produced;
4358   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4359   int safe_room = 16; /* NOTE(review): presumably max bytes per iteration -- confirm */
4360   int bol_designation
4361     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4362        && CODING_ISO_BOL (coding));
4363   int produced_chars = 0;
4364   Lisp_Object attrs, eol_type, charset_list;
4365   int ascii_compatible;
4366   int c;
4367   int preferred_charset_id = -1; /* -1: no preferred charset in effect */
4368
4369   CODING_GET_INFO (coding, attrs, charset_list);
4370   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4371   if (VECTORP (eol_type))
4372     eol_type = Qunix;
4373
4374   setup_iso_safe_charsets (attrs);
4375   /* Charset list may have been changed.  */
4376   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4377   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4378
4379   ascii_compatible
4380     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4381        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4382                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4383
4384   while (charbuf < charbuf_end)
4385     {
4386       ASSURE_DESTINATION (safe_room); /* NOTE(review): may leave the loop early -- confirm */
4387
4388       if (bol_designation)
4389         {
4390           unsigned char *dst_prev = dst;
4391
4392           /* We have to produce designation sequences if any now.  */
4393           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4394           bol_designation = 0;
4395           /* We are sure that designation sequences are all ASCII bytes.  */
4396           produced_chars += dst - dst_prev;
4397         }
4398
4399       c = *charbuf++;
4400
4401       if (c < 0)
4402         {
4403           /* Handle an annotation.  */
4404           switch (*charbuf) /* the entry following C tells the annotation kind */
4405             {
4406             case CODING_ANNOTATE_COMPOSITION_MASK:
4407               /* Not yet implemented.  */
4408               break;
4409             case CODING_ANNOTATE_CHARSET_MASK:
4410               preferred_charset_id = charbuf[2];
4411               if (preferred_charset_id >= 0
4412                   && NILP (Fmemq (make_number (preferred_charset_id),
4413                                   charset_list)))
4414                 preferred_charset_id = -1; /* not in CHARSET_LIST: ignore it */
4415               break;
4416             default:
4417               abort ();
4418             }
4419           charbuf += -c - 1; /* skip the rest: the annotation spans -C entries */
4420           continue;
4421         }
4422
4423       /* Now encode the character C.  */
4424       if (c < 0x20 || c == 0x7F)
4425         {
4426           if (c == '\n'
4427               || (c == '\r' && EQ (eol_type, Qmac)))
4428             {
4429               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4430                 ENCODE_RESET_PLANE_AND_REGISTER ();
4431               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4432                 {
4433                   int i;
4434
4435                   for (i = 0; i < 4; i++)
4436                     CODING_ISO_DESIGNATION (coding, i)
4437                       = CODING_ISO_INITIAL (coding, i);
4438                 }
4439               bol_designation
4440                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4441             }
4442           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4443             ENCODE_RESET_PLANE_AND_REGISTER ();
4444           EMIT_ONE_ASCII_BYTE (c);
4445         }
4446       else if (ASCII_CHAR_P (c))
4447         {
4448           if (ascii_compatible)
4449             EMIT_ONE_ASCII_BYTE (c);
4450           else
4451             {
4452               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4453               ENCODE_ISO_CHARACTER (charset, c);
4454             }
4455         }
4456       else if (CHAR_BYTE8_P (c))
4457         {
4458           c = CHAR_TO_BYTE8 (c); /* emit the byte that C stands for */
4459           EMIT_ONE_BYTE (c);
4460         }
4461       else
4462         {
4463           struct charset *charset;
4464
4465           if (preferred_charset_id >= 0)
4466             {
4467               charset = CHARSET_FROM_ID (preferred_charset_id);
4468               if (! CHAR_CHARSET_P (c, charset))
4469                 charset = char_charset (c, charset_list, NULL);
4470             }
4471           else
4472             charset = char_charset (c, charset_list, NULL);
4473           if (!charset) /* no charset was found for C: substitute a character */
4474             {
4475               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4476                 {
4477                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4478                   charset = CHARSET_FROM_ID (charset_ascii);
4479                 }
4480               else
4481                 {
4482                   c = coding->default_char;
4483                   charset = char_charset (c, charset_list, NULL);
4484                 }
4485             }
4486           ENCODE_ISO_CHARACTER (charset, c);
4487         }
4488     }
4489
4490   if (coding->mode & CODING_MODE_LAST_BLOCK
4491       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4492     {
4493       ASSURE_DESTINATION (safe_room);
4494       ENCODE_RESET_PLANE_AND_REGISTER ();
4495     }
4496   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4497   CODING_ISO_BOL (coding) = bol_designation; /* carried over to the next call */
4498   coding->produced_char += produced_chars;
4499   coding->produced = dst - coding->destination;
4500   return 0;
4501 }
4502
4503 \f
4504 /*** 8. Shift-JIS and BIG5 handlers ***/
4505
4506 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4507    quite widely.  So, for the moment, Emacs supports them in the bare
4508    C code.  But, in the future, they may be supported only by CCL.  */
4509
4510 /* SJIS is a coding system encoding three character sets: ASCII, right
4511    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4512    as is.  A character of charset katakana-jisx0201 is encoded by
4513    "position-code + 0x80".  A character of charset japanese-jisx0208
4514    is encoded in two bytes, but the two position-codes are divided and
4515    shifted so that they fit in the ranges below.
4516
4517    --- CODE RANGE of SJIS ---
4518    (character set)      (range)
4519    ASCII                0x00 .. 0x7F
4520    KATAKANA-JISX0201    0xA0 .. 0xDF
4521    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4522             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4523    -------------------------------
4524
4525 */
4526
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set, and each of its characters is encoded in two bytes.

   --- CODE RANGE of BIG5 ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   Big5 (1st byte)      0xA1 .. 0xFE
        (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
   --------------------------

  */
4539
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in SJIS.

   CATEGORY_MASK_SJIS is always recorded in DETECT_INFO->checked.  If
   a byte that cannot appear in SJIS text is seen, CATEGORY_MASK_SJIS
   is recorded in DETECT_INFO->rejected and 0 is returned.  If the
   source is exhausted without such a byte, the mask of what was found
   (CATEGORY_MASK_SJIS if any non-ASCII SJIS code was seen, else 0) is
   merged into DETECT_INFO->found and 1 is returned.  */

static int
detect_coding_sjis (struct coding_system *coding,
                    struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int found = 0;
  int c;
  Lisp_Object attrs, charset_list;
  int max_first_byte_of_2_byte_code;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* When the charset list has more than three elements, lead bytes up
     to 0xFC are accepted; otherwise only those up to 0xEF.  (The SJIS
     decoder in this file uses a fourth charset, when present, for
     lead bytes above 0xEF.)  */
  max_first_byte_of_2_byte_code
    = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);

  detect_info->checked |= CATEGORY_MASK_SJIS;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      /* Remember where the current code starts so that a truncated
         code can be recognized at `no_more_source'.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0x80)
        continue;
      if ((c >= 0x81 && c <= 0x9F)
          || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
        {
          /* Lead byte of a two-byte code; validate the trail byte.  */
          ONE_MORE_BYTE (c);
          if (c < 0x40 || c == 0x7F || c > 0xFC)
            break;
          found = CATEGORY_MASK_SJIS;
        }
      else if (c >= 0xA0 && c < 0xE0)
        /* Single-byte code in the katakana range.  */
        found = CATEGORY_MASK_SJIS;
      else
        break;
    }
  /* Left the loop via `break': an invalid byte was seen.  */
  detect_info->rejected |= CATEGORY_MASK_SJIS;
  return 0;

  /* NOTE(review): this label is not referenced explicitly in this
     function; presumably ONE_MORE_BYTE (defined elsewhere) jumps here
     when the source is exhausted -- confirm against the macro.  */
 no_more_source:
  /* SRC_BASE < SRC means part of a code had already been read when
     the source ran out.  In the last block that is a truncated code,
     so the category is rejected.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_SJIS;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
4596
4597 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4598    Check if a text is encoded in BIG5.  If it is, return
4599    CATEGORY_MASK_BIG5, else return 0.  */
4600
4601 static int
4602 detect_coding_big5 (struct coding_system *coding,
4603                     struct coding_detection_info *detect_info)
4604 {
4605   const unsigned char *src = coding->source, *src_base;
4606   const unsigned char *src_end = coding->source + coding->src_bytes;
4607   int multibytep = coding->src_multibyte;
4608   int consumed_chars = 0;
4609   int found = 0;
4610   int c;
4611
4612   detect_info->checked |= CATEGORY_MASK_BIG5;
4613   /* A coding system of this category is always ASCII compatible.  */
4614   src += coding->head_ascii;
4615
4616   while (1)
4617     {
4618       src_base = src;
4619       ONE_MORE_BYTE (c);
4620       if (c < 0x80)
4621         continue;
4622       if (c >= 0xA1)
4623         {
4624           ONE_MORE_BYTE (c);
4625           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4626             return 0;
4627           found = CATEGORY_MASK_BIG5;
4628         }
4629       else
4630         break;
4631     }
4632   detect_info->rejected |= CATEGORY_MASK_BIG5;
4633   return 0;
4634
4635  no_more_source:
4636   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4637     {
4638       detect_info->rejected |= CATEGORY_MASK_BIG5;
4639       return 0;
4640     }
4641   detect_info->found |= found;
4642   return 1;
4643 }
4644
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   Decode SJIS text.

   Bytes are read from CODING->source starting at CODING->consumed,
   and the decoded characters are appended to CODING->charbuf starting
   at CODING->charbuf_used.  On return, CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.  Each
   byte that cannot be decoded is stored as a separate element of
   charbuf and counted in CODING->errors.  */

static void
decode_coding_sjis (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  int char_offset = coding->produced_char;
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* A byte read ahead after a CR (only when EOL_DOS), or -1 if none.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* The charset list is taken to be, in order: roman, kana, kanji,
     and optionally a second kanji charset.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Record the start of this code; these values are what gets
         reported as consumed if we stop before finishing it.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left.  If a byte was read ahead after a CR, step
             back over it so that it is not counted as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* On a switch to a different non-ASCII charset, close the
         annotation for the preceding non-ASCII run (if any) and start
         a new run at the current offset.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending code and store just its
         first byte: as -C when ONE_MORE_BYTE yields a negative value,
         otherwise as BYTE8_TO_CHAR of the byte.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached by falling out of the loop when charbuf is full.
     NOTE(review): presumably ONE_MORE_BYTE also jumps here when the
     source is exhausted -- confirm against the macro.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4765
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   Decode BIG5 text.

   Bytes are read from CODING->source starting at CODING->consumed,
   and the decoded characters are appended to CODING->charbuf starting
   at CODING->charbuf_used.  On return, CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.  Each
   byte that cannot be decoded is stored as a separate element of
   charbuf and counted in CODING->errors.  */

static void
decode_coding_big5 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  int char_offset = coding->produced_char;
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* A byte read ahead after a CR (only when EOL_DOS), or -1 if none.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* The charset list is taken to be, in order: roman, big5.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Record the start of this code; these values are what gets
         reported as consumed if we stop before finishing it.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left.  If a byte was read ahead after a CR, step
             back over it so that it is not counted as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* On a switch to a different non-ASCII charset, close the
         annotation for the preceding non-ASCII run (if any) and start
         a new run at the current offset.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending code and store just its
         first byte: as -C when ONE_MORE_BYTE yields a negative value,
         otherwise as BYTE8_TO_CHAR of the byte.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached by falling out of the loop when charbuf is full.
     NOTE(review): presumably ONE_MORE_BYTE also jumps here when the
     source is exhausted -- confirm against the macro.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4861
4862 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4863    This function can encode charsets `ascii', `katakana-jisx0201',
4864    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4865    are sure that all these charsets are registered as official charset
4866    (i.e. do not have extended leading-codes).  Characters of other
4867    charsets are produced without any encoding.  If SJIS_P is 1, encode
4868    SJIS text, else encode BIG5 text.  */
4869
4870 static int
4871 encode_coding_sjis (struct coding_system *coding)
4872 {
4873   int multibytep = coding->dst_multibyte;
4874   int *charbuf = coding->charbuf;
4875   int *charbuf_end = charbuf + coding->charbuf_used;
4876   unsigned char *dst = coding->destination + coding->produced;
4877   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4878   int safe_room = 4;
4879   int produced_chars = 0;
4880   Lisp_Object attrs, charset_list, val;
4881   int ascii_compatible;
4882   struct charset *charset_roman, *charset_kanji, *charset_kana;
4883   struct charset *charset_kanji2;
4884   int c;
4885
4886   CODING_GET_INFO (coding, attrs, charset_list);
4887   val = charset_list;
4888   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4889   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4890   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4891   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4892
4893   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4894
4895   while (charbuf < charbuf_end)
4896     {
4897       ASSURE_DESTINATION (safe_room);
4898       c = *charbuf++;
4899       /* Now encode the character C.  */
4900       if (ASCII_CHAR_P (c) && ascii_compatible)
4901         EMIT_ONE_ASCII_BYTE (c);
4902       else if (CHAR_BYTE8_P (c))
4903         {
4904           c = CHAR_TO_BYTE8 (c);
4905           EMIT_ONE_BYTE (c);
4906         }
4907       else
4908         {
4909           unsigned code;
4910           struct charset *charset = char_charset (c, charset_list, &code);
4911
4912           if (!charset)
4913             {
4914               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4915                 {
4916                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4917                   charset = CHARSET_FROM_ID (charset_ascii);
4918                 }
4919               else
4920                 {
4921                   c = coding->default_char;
4922                   charset = char_charset (c, charset_list, &code);
4923                 }
4924             }
4925           if (code == CHARSET_INVALID_CODE (charset))
4926             abort ();
4927           if (charset == charset_kanji)
4928             {
4929               int c1, c2;
4930               JIS_TO_SJIS (code);
4931               c1 = code >> 8, c2 = code & 0xFF;
4932               EMIT_TWO_BYTES (c1, c2);
4933             }
4934           else if (charset == charset_kana)
4935             EMIT_ONE_BYTE (code | 0x80);
4936           else if (charset_kanji2 && charset == charset_kanji2)
4937             {
4938               int c1, c2;
4939
4940               c1 = code >> 8;
4941               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4942                   || c1 == 0x28
4943                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4944                 {
4945                   JIS_TO_SJIS2 (code);
4946                   c1 = code >> 8, c2 = code & 0xFF;
4947                   EMIT_TWO_BYTES (c1, c2);
4948                 }
4949               else
4950                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4951             }
4952           else
4953             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4954         }
4955     }
4956   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4957   coding->produced_char += produced_chars;
4958   coding->produced = dst - coding->destination;
4959   return 0;
4960 }
4961
4962 static int
4963 encode_coding_big5 (struct coding_system *coding)
4964 {
4965   int multibytep = coding->dst_multibyte;
4966   int *charbuf = coding->charbuf;
4967   int *charbuf_end = charbuf + coding->charbuf_used;
4968   unsigned char *dst = coding->destination + coding->produced;
4969   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4970   int safe_room = 4;
4971   int produced_chars = 0;
4972   Lisp_Object attrs, charset_list, val;
4973   int ascii_compatible;
4974   struct charset *charset_roman, *charset_big5;
4975   int c;
4976
4977   CODING_GET_INFO (coding, attrs, charset_list);
4978   val = charset_list;
4979   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4980   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4981   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4982
4983   while (charbuf < charbuf_end)
4984     {
4985       ASSURE_DESTINATION (safe_room);
4986       c = *charbuf++;
4987       /* Now encode the character C.  */
4988       if (ASCII_CHAR_P (c) && ascii_compatible)
4989         EMIT_ONE_ASCII_BYTE (c);
4990       else if (CHAR_BYTE8_P (c))
4991         {
4992           c = CHAR_TO_BYTE8 (c);
4993           EMIT_ONE_BYTE (c);
4994         }
4995       else
4996         {
4997           unsigned code;
4998           struct charset *charset = char_charset (c, charset_list, &code);
4999
5000           if (! charset)
5001             {
5002               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5003                 {
5004                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5005                   charset = CHARSET_FROM_ID (charset_ascii);
5006                 }
5007               else
5008                 {
5009                   c = coding->default_char;
5010                   charset = char_charset (c, charset_list, &code);
5011                 }
5012             }
5013           if (code == CHARSET_INVALID_CODE (charset))
5014             abort ();
5015           if (charset == charset_big5)
5016             {
5017               int c1, c2;
5018
5019               c1 = code >> 8, c2 = code & 0xFF;
5020               EMIT_TWO_BYTES (c1, c2);
5021             }
5022           else
5023             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5024         }
5025     }
5026   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5027   coding->produced_char += produced_chars;
5028   coding->produced = dst - coding->destination;
5029   return 0;
5030 }
5031
5032 \f
/*** 9. CCL handlers ***/
5034
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in a coding system of which
   encoder/decoder are written in CCL program.

   CATEGORY_MASK_CCL is always recorded in DETECT_INFO->checked.  If
   a byte not marked valid for the CCL category is seen,
   CATEGORY_MASK_CCL is recorded in DETECT_INFO->rejected and 0 is
   returned.  If the source is exhausted without such a byte, the
   mask of what was found is merged into DETECT_INFO->found and 1 is
   returned.  */

static int
detect_coding_ccl (struct coding_system *coding,
                   struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int found = 0;
  unsigned char *valids;
  /* Saved now because CODING is redirected below.  */
  int head_ascii = coding->head_ascii;
  Lisp_Object attrs;

  detect_info->checked |= CATEGORY_MASK_CCL;

  /* From here on CODING refers to the coding system registered for
     the CCL category, not to the argument; SRC and SRC_END still
     refer to the argument's source.  */
  coding = &coding_categories[coding_category_ccl];
  valids = CODING_CCL_VALIDS (coding);
  attrs = CODING_ID_ATTRS (coding->id);
  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
    src += head_ascii;

  while (1)
    {
      int c;

      src_base = src;
      ONE_MORE_BYTE (c);
      /* A zero entry in VALIDS means the byte is not acceptable.  */
      if (c < 0 || ! valids[c])
        break;
      /* An entry greater than 1 counts as positive evidence.  */
      if ((valids[c] > 1))
        found = CATEGORY_MASK_CCL;
    }
  detect_info->rejected |= CATEGORY_MASK_CCL;
  return 0;

  /* NOTE(review): this label is not referenced explicitly in this
     function; presumably ONE_MORE_BYTE (defined elsewhere) jumps here
     when the source is exhausted -- confirm against the macro.  */
 no_more_source:
  detect_info->found |= found;
  return 1;
}
5079
/* Decode text by running the CCL program of CODING.

   Source is read from CODING->source starting at CODING->consumed in
   chunks of at most 1024 elements, and the output of the CCL program
   is appended to CODING->charbuf starting at CODING->charbuf_used.
   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated, and the conversion result is
   recorded according to the final status of the CCL program.  */

static void
decode_coding_ccl (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int consumed_chars = 0;
  int multibytep = coding->src_multibyte;
  struct ccl_program *ccl = &coding->spec.ccl->ccl;
  /* One chunk of input handed to the CCL program.  */
  int source_charbuf[1024];
  /* For multibyte sources, source_byteidx[I] is the byte offset from
     SRC of the I-th element of source_charbuf; one extra slot holds
     the offset just past the last element.  */
  int source_byteidx[1025];
  Lisp_Object attrs, charset_list;

  CODING_GET_INFO (coding, attrs, charset_list);

  while (1)
    {
      const unsigned char *p = src;
      int i = 0;

      if (multibytep)
        {
          while (i < 1024 && p < src_end)
            {
              source_byteidx[i] = p - src;
              source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
            }
          source_byteidx[i] = p - src;
        }
      else
        while (i < 1024 && p < src_end)
          source_charbuf[i++] = *p++;

      /* Tell the CCL program when this chunk reaches the end of the
         last block.  */
      if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
        ccl->last_block = 1;
      ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
                  charset_list);
      charbuf += ccl->produced;
      /* ccl->consumed counts elements of source_charbuf; for a
         multibyte source translate that into a byte offset.  */
      if (multibytep)
        src += source_byteidx[ccl->consumed];
      else
        src += ccl->consumed;
      consumed_chars += ccl->consumed;
      /* Continue only when the whole source has not yet been handed
         over and the program stopped merely for lack of input.  */
      if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
        break;
    }

  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      record_conversion_result (coding, CODING_RESULT_INTERRUPT);
      break;
    default:
      record_conversion_result (coding, CODING_RESULT_SUCCESS);
      break;
    }
  coding->consumed_char += consumed_chars;
  coding->consumed = src - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5148
/* Encode the characters in CODING->charbuf by running the CCL program
   of CODING.

   The output of the CCL program is collected in chunks of at most
   1024 elements, and the low 8 bits of each element are appended to
   CODING->destination starting at CODING->produced.  On return,
   CODING->produced and CODING->produced_char are updated, and the
   conversion result is recorded according to the final status of the
   CCL program.  Always returns 0.  */

static int
encode_coding_ccl (struct coding_system *coding)
{
  struct ccl_program *ccl = &coding->spec.ccl->ccl;
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* One chunk of output from the CCL program.  */
  int destination_charbuf[1024];
  int i, produced_chars = 0;
  Lisp_Object attrs, charset_list;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* Tell the CCL program when all source characters have been
     consumed and this is the last block.  */
  if (coding->consumed_char == coding->src_chars
      && coding->mode & CODING_MODE_LAST_BLOCK)
    ccl->last_block = 1;

  while (charbuf < charbuf_end)
    {
      ccl_driver (ccl, charbuf, destination_charbuf,
                  charbuf_end - charbuf, 1024, charset_list);
      if (multibytep)
        {
          /* NOTE(review): twice the produced count is reserved,
             presumably because EMIT_ONE_BYTE may write two bytes per
             element to a multibyte destination -- confirm against the
             macro.  */
          ASSURE_DESTINATION (ccl->produced * 2);
          for (i = 0; i < ccl->produced; i++)
            EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
        }
      else
        {
          ASSURE_DESTINATION (ccl->produced);
          for (i = 0; i < ccl->produced; i++)
            *dst++ = destination_charbuf[i] & 0xFF;
          produced_chars += ccl->produced;
        }
      charbuf += ccl->consumed;
      if (ccl->status == CCL_STAT_QUIT
          || ccl->status == CCL_STAT_INVALID_CMD)
        break;
    }

  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      record_conversion_result (coding, CODING_RESULT_INTERRUPT);
      break;
    default:
      record_conversion_result (coding, CODING_RESULT_SUCCESS);
      break;
    }

  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5211
5212
5213 \f
5214 /*** 10, 11. no-conversion handlers ***/
5215
5216 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5217
5218 static void
5219 decode_coding_raw_text (struct coding_system *coding)
5220 {
5221   int eol_dos =
5222     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5223
5224   coding->chars_at_source = 1;
5225   coding->consumed_char = coding->src_chars;
5226   coding->consumed = coding->src_bytes;
5227   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5228     {
5229       coding->consumed_char--;
5230       coding->consumed--;
5231       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5232     }
5233   else
5234     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5235 }
5236
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
   Copy the elements of CODING->charbuf to CODING->destination,
   starting at CODING->produced, without any code conversion.

   How each element is written depends on whether the source and the
   destination are multibyte (CODING->src_multibyte and
   CODING->dst_multibyte).  On return, CODING->produced and
   CODING->produced_char are updated and the conversion result is
   recorded as success.  Always returns 0.  */

static int
encode_coding_raw_text (struct coding_system *coding)
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  int produced_chars = 0;
  int c;

  if (multibytep)
    {
      int safe_room = MAX_MULTIBYTE_LENGTH * 2;

      if (coding->src_multibyte)
        /* Multibyte source, multibyte destination: a non-ASCII,
           non-raw-byte character is first converted to its string
           form, and each byte of that form is then emitted
           separately.  */
        while (charbuf < charbuf_end)
          {
            ASSURE_DESTINATION (safe_room);
            c = *charbuf++;
            if (ASCII_CHAR_P (c))
              EMIT_ONE_ASCII_BYTE (c);
            else if (CHAR_BYTE8_P (c))
              {
                c = CHAR_TO_BYTE8 (c);
                EMIT_ONE_BYTE (c);
              }
            else
              {
                unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;

                CHAR_STRING_ADVANCE (c, p1);
                do
                  {
                    EMIT_ONE_BYTE (*p0);
                    p0++;
                  }
                while (p0 < p1);
              }
          }
      else
        /* Unibyte source, multibyte destination: every element is
           emitted as one byte.  */
        while (charbuf < charbuf_end)
          {
            ASSURE_DESTINATION (safe_room);
            c = *charbuf++;
            EMIT_ONE_BYTE (c);
          }
    }
  else
    {
      if (coding->src_multibyte)
        {
          /* Multibyte source, unibyte destination: bytes are stored
             directly, without the EMIT_* macros.  */
          int safe_room = MAX_MULTIBYTE_LENGTH;

          while (charbuf < charbuf_end)
            {
              ASSURE_DESTINATION (safe_room);
              c = *charbuf++;
              if (ASCII_CHAR_P (c))
                *dst++ = c;
              else if (CHAR_BYTE8_P (c))
                *dst++ = CHAR_TO_BYTE8 (c);
              else
                CHAR_STRING_ADVANCE (c, dst);
            }
        }
      else
        {
          /* Unibyte source, unibyte destination: plain copy, one byte
             per element, bounded by both buffers.  */
          ASSURE_DESTINATION (charbuf_end - charbuf);
          while (charbuf < charbuf_end && dst < dst_end)
            *dst++ = *charbuf++;
        }
      /* For a unibyte destination the number of characters produced
         is the number of bytes written by this call.  */
      produced_chars = dst - (coding->destination + coding->produced);
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5316
5317 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5318    Check if a text is encoded in a charset-based coding system.  If it
5319    is, return 1, else return 0.  */
5320
5321 static int
5322 detect_coding_charset (struct coding_system *coding,
5323                        struct coding_detection_info *detect_info)
5324 {
5325   const unsigned char *src = coding->source, *src_base;
5326   const unsigned char *src_end = coding->source + coding->src_bytes;
5327   int multibytep = coding->src_multibyte;
5328   int consumed_chars = 0;
5329   Lisp_Object attrs, valids, name;
5330   int found = 0;
5331   int head_ascii = coding->head_ascii;
5332   int check_latin_extra = 0;
5333
5334   detect_info->checked |= CATEGORY_MASK_CHARSET;
5335
5336   coding = &coding_categories[coding_category_charset];
5337   attrs = CODING_ID_ATTRS (coding->id);
5338   valids = AREF (attrs, coding_attr_charset_valids);
5339   name = CODING_ID_NAME (coding->id);
5340   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5341                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5342       || strncmp (SSDATA (SYMBOL_NAME (name)),
5343                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5344     check_latin_extra = 1;
5345
5346   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5347     src += head_ascii;
5348
5349   while (1)
5350     {
5351       int c;
5352       Lisp_Object val;
5353       struct charset *charset;
5354       int dim, idx;
5355
5356       src_base = src;
5357       ONE_MORE_BYTE (c);
5358       if (c < 0)
5359         continue;
5360       val = AREF (valids, c);
5361       if (NILP (val))
5362         break;
5363       if (c >= 0x80)
5364         {
5365           if (c < 0xA0
5366               && check_latin_extra
5367               && (!VECTORP (Vlatin_extra_code_table)
5368                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5369             break;
5370           found = CATEGORY_MASK_CHARSET;
5371         }
5372       if (INTEGERP (val))
5373         {
5374           charset = CHARSET_FROM_ID (XFASTINT (val));
5375           dim = CHARSET_DIMENSION (charset);
5376           for (idx = 1; idx < dim; idx++)
5377             {
5378               if (src == src_end)
5379                 goto too_short;
5380               ONE_MORE_BYTE (c);
5381               if (c < charset->code_space[(dim - 1 - idx) * 2]
5382                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5383                 break;
5384             }
5385           if (idx < dim)
5386             break;
5387         }
5388       else
5389         {
5390           idx = 1;
5391           for (; CONSP (val); val = XCDR (val))
5392             {
5393               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5394               dim = CHARSET_DIMENSION (charset);
5395               while (idx < dim)
5396                 {
5397                   if (src == src_end)
5398                     goto too_short;
5399                   ONE_MORE_BYTE (c);
5400                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5401                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5402                     break;
5403                   idx++;
5404                 }
5405               if (idx == dim)
5406                 {
5407                   val = Qnil;
5408                   break;
5409                 }
5410             }
5411           if (CONSP (val))
5412             break;
5413         }
5414     }
5415  too_short:
5416   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5417   return 0;
5418
5419  no_more_source:
5420   detect_info->found |= found;
5421   return 1;
5422 }
5423
/* Decode the bytes of CODING->source, starting at offset
   CODING->consumed, with the charset-based decoder and append the
   resulting characters to CODING->charbuf.

   The first byte of each character indexes the vector stored in the
   coding_attr_charset_valids attribute.  The entry found there is
   either a charset ID (integer), a list of charset IDs, or anything
   else, in which case the byte is invalid.  Further bytes are read
   according to the dimension of the candidate charset and the
   accumulated code is decoded with CODING_DECODE_CHAR.

   A byte sequence that cannot be decoded makes the decoder rewind to
   the start of that sequence, store only its first byte (as is when
   ASCII, via BYTE8_TO_CHAR otherwise, or negated when ONE_MORE_BYTE
   yielded a negative value) and increment CODING->errors.

   On return CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated.  */
5424 static void
5425 decode_coding_charset (struct coding_system *coding)
5426 {
5427   const unsigned char *src = coding->source + coding->consumed;
5428   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* Start of the character currently being decoded; SRC is reset
          to it at `invalid_code', and it is what CODING->consumed is
          computed from at the end.  */
5429   const unsigned char *src_base;
5430   int *charbuf = coding->charbuf + coding->charbuf_used;
5431   /* We may produce one charset annotation in one loop and one more at
5432      the end.  */
5433   int *charbuf_end
5434     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
       /* NOTE(review): consumed_chars and multibytep are never modified
          or read by name in the visible statements except at
          `invalid_code'; presumably ONE_MORE_BYTE uses them -- confirm
          against the macro definition.  */
5435   int consumed_chars = 0, consumed_chars_base;
5436   int multibytep = coding->src_multibyte;
5437   Lisp_Object attrs, charset_list, valids;
       /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
          LAST_ID remember where the current run of one non-ASCII
          charset started and which charset it is.  */
5438   int char_offset = coding->produced_char;
5439   int last_offset = char_offset;
5440   int last_id = charset_ascii;
5441   int eol_dos =
5442     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS is nonzero; -1 when no
          such byte is pending.  */
5443   int byte_after_cr = -1;
5444
5445   CODING_GET_INFO (coding, attrs, charset_list);
5446   valids = AREF (attrs, coding_attr_charset_valids);
5447
5448   while (1)
5449     {
5450       int c;
5451       Lisp_Object val;
5452       struct charset *charset;
5453       int dim;
           /* Number of bytes accumulated into CODE so far.  */
5454       int len = 1;
5455       unsigned code;
5456
5457       src_base = src;
5458       consumed_chars_base = consumed_chars;
5459
           /* No more room in the character buffer.  If a byte was read
              ahead after CR, step SRC_BASE back by one so that byte is
              not reported as consumed.  */
5460       if (charbuf >= charbuf_end)
5461         {
5462           if (byte_after_cr >= 0)
5463             src_base--;
5464           break;
5465         }
5466
           /* Take the pending look-ahead byte first, if any; otherwise
              read a fresh byte, and when it is CR under DOS eol also
              read the byte that follows it.  */
5467       if (byte_after_cr >= 0)
5468         {
5469           c = byte_after_cr;
5470           byte_after_cr = -1;
5471         }
5472       else
5473         {
5474           ONE_MORE_BYTE (c);
5475           if (eol_dos && c == '\r')
5476             ONE_MORE_BYTE (byte_after_cr);
5477         }
           /* NOTE(review): a negative C presumably marks a source byte
              sequence that ONE_MORE_BYTE could not read as a single
              byte -- confirm against the macro.  */
5478       if (c < 0)
5479         goto invalid_code;
5480       code = c;
5481
5482       val = AREF (valids, c);
5483       if (! INTEGERP (val) && ! CONSP (val))
5484         goto invalid_code;
5485       if (INTEGERP (val))
5486         {
               /* Exactly one candidate charset: read the remaining
                  bytes of its dimension, most significant first.  */
5487           charset = CHARSET_FROM_ID (XFASTINT (val));
5488           dim = CHARSET_DIMENSION (charset);
5489           while (len < dim)
5490             {
5491               ONE_MORE_BYTE (c);
5492               code = (code << 8) | c;
5493               len++;
5494             }
5495           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5496                               charset, code, c);
5497         }
5498       else
5499         {
5500           /* VAL is a list of charset IDs.  It is assured that the
5501              list is sorted by charset dimensions (smaller one
5502              comes first).  */
               /* Try each candidate in turn, extending CODE only when
                  the next charset needs more bytes, and stop at the
                  first charset for which decoding yields C >= 0.  */
5503           while (CONSP (val))
5504             {
5505               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5506               dim = CHARSET_DIMENSION (charset);
5507               while (len < dim)
5508                 {
5509                   ONE_MORE_BYTE (c);
5510                   code = (code << 8) | c;
5511                   len++;
5512                 }
5513               CODING_DECODE_CHAR (coding, src, src_base,
5514                                   src_end, charset, code, c);
5515               if (c >= 0)
5516                 break;
5517               val = XCDR (val);
5518             }
5519         }
5520       if (c < 0)
5521         goto invalid_code;
           /* The charset changed to a different non-ASCII one: emit the
              annotation for the previous non-ASCII run (if there was
              one) and start a new run here.  */
5522       if (charset->id != charset_ascii
5523           && last_id != charset->id)
5524         {
5525           if (last_id != charset_ascii)
5526             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5527           last_id = charset->id;
5528           last_offset = char_offset;
5529         }
5530
5531       *charbuf++ = c;
5532       char_offset++;
5533       continue;
5534
5535     invalid_code:
           /* Rewind to the start of the offending sequence and emit only
              its first byte, counting it as an error.  */
5536       src = src_base;
5537       consumed_chars = consumed_chars_base;
5538       ONE_MORE_BYTE (c);
5539       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5540       char_offset++;
5541       coding->errors++;
5542     }
5543
       /* NOTE(review): no statement in this function jumps here
          explicitly; presumably ONE_MORE_BYTE does so when the source
          is exhausted -- confirm.  The loop's `break' also falls
          through to this point.  Only bytes up to SRC_BASE are
          reported as consumed.  */
5544  no_more_source:
5545   if (last_id != charset_ascii)
5546     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5547   coding->consumed_char += consumed_chars_base;
5548   coding->consumed = src_base - coding->source;
5549   coding->charbuf_used = charbuf - coding->charbuf;
5550 }
5551
/* Encode the characters held in CODING->charbuf (CODING->charbuf_used
   of them) with the charset-based encoder, writing the bytes at
   CODING->destination + CODING->produced.

   For each character:
     - if the coding system is ASCII compatible and the character is
       ASCII, it is emitted as one byte;
     - if it is an eight-bit character, the corresponding raw byte is
       emitted;
     - otherwise char_charset looks it up in the charset list of the
       coding system and the resulting code is emitted in 1 to 4
       bytes, most significant byte first, according to the dimension
       of the charset found;
     - if no charset is found, one substitute byte is emitted:
       CODING_INHIBIT_CHARACTER_SUBSTITUTION when
       CODING_MODE_SAFE_ENCODING is set in CODING->mode,
       CODING->default_char otherwise.

   Records CODING_RESULT_SUCCESS, updates CODING->produced_char and
   CODING->produced, and always returns 0.  */
5552 static int
5553 encode_coding_charset (struct coding_system *coding)
5554 {
       /* NOTE(review): multibytep, dst_end and produced_chars are not
          modified by name in the visible statements; presumably the
          ASSURE_DESTINATION and EMIT_* macros use them -- confirm
          against the macro definitions.  */
5555   int multibytep = coding->dst_multibyte;
5556   int *charbuf = coding->charbuf;
5557   int *charbuf_end = charbuf + coding->charbuf_used;
5558   unsigned char *dst = coding->destination + coding->produced;
5559   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each character is
          encoded.  */
5560   int safe_room = MAX_MULTIBYTE_LENGTH;
5561   int produced_chars = 0;
5562   Lisp_Object attrs, charset_list;
5563   int ascii_compatible;
5564   int c;
5565
5566   CODING_GET_INFO (coding, attrs, charset_list);
5567   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5568
5569   while (charbuf < charbuf_end)
5570     {
5571       struct charset *charset;
5572       unsigned code;
5573
5574       ASSURE_DESTINATION (safe_room);
5575       c = *charbuf++;
5576       if (ascii_compatible && ASCII_CHAR_P (c))
5577         EMIT_ONE_ASCII_BYTE (c);
5578       else if (CHAR_BYTE8_P (c))
5579         {
5580           c = CHAR_TO_BYTE8 (c);
5581           EMIT_ONE_BYTE (c);
5582         }
5583       else
5584         {
               /* CODE receives the code point of C in the charset that
                  char_charset returns (a null pointer when none of the
                  charsets in CHARSET_LIST contains C).  */
5585           charset = char_charset (c, charset_list, &code);
5586           if (charset)
5587             {
                   /* Split CODE into as many bytes as the charset has
                      dimensions, high-order byte first.  */
5588               if (CHARSET_DIMENSION (charset) == 1)
5589                 EMIT_ONE_BYTE (code);
5590               else if (CHARSET_DIMENSION (charset) == 2)
5591                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5592               else if (CHARSET_DIMENSION (charset) == 3)
5593                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5594               else
5595                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5596                                  (code >> 8) & 0xFF, code & 0xFF);
5597             }
5598           else
5599             {
                   /* Unencodable character: emit a single substitute
                      byte chosen by the encoding mode.  */
5600               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5601                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5602               else
5603                 c = coding->default_char;
5604               EMIT_ONE_BYTE (c);
5605             }
5606         }
5607     }
5608
5609   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5610   coding->produced_char += produced_chars;
5611   coding->produced = dst - coding->destination;
5612   return 0;
5613 }
5614
5615 \f
5616 /*** 10. C library functions ***/
5617
5618 /* Setup coding context CODING from information about CODING_SYSTEM.
5619    If CODING_SYSTEM is nil, `undecided' is assumed.  If
5620    CODING_SYSTEM is invalid, signal an error.  */
5621
/* In detail: store the ID of CODING_SYSTEM in CODING->id, reset
   CODING->mode, head_ascii, common_flags, max_charset_id,
   safe_charsets, default_char and carryover_bytes from the attributes
   of the coding system, then install the detector, decoder and encoder
   functions that match the coding type and initialize the
   type-specific state.  */
5622 void
5623 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5624 {
5625   Lisp_Object attrs;
5626   Lisp_Object eol_type;
5627   Lisp_Object coding_type;
5628   Lisp_Object val;
5629
       /* A nil CODING_SYSTEM is treated as `undecided'.  */
5630   if (NILP (coding_system))
5631     coding_system = Qundecided;
5632
5633   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5634
5635   attrs = CODING_ID_ATTRS (coding->id);
       /* When eol conversion is inhibited, behave as if the eol type
          were unix whatever the coding system says.  */
5636   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5637
5638   coding->mode = 0;
5639   coding->head_ascii = -1;
       /* Base flags derived from the eol type alone: a vector eol type
          requests decoding and detection, any non-unix eol type
          requests decoding and encoding, and unix requests nothing.  */
5640   if (VECTORP (eol_type))
5641     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5642                             | CODING_REQUIRE_DETECTION_MASK);
5643   else if (! EQ (eol_type, Qunix))
5644     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5645                             | CODING_REQUIRE_ENCODING_MASK);
5646   else
5647     coding->common_flags = 0;
       /* A post-read attribute forces decoding, a pre-write attribute
          forces encoding.  */
5648   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5649     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5650   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5651     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5652   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5653     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5654
       /* The safe-charsets attribute is used as a string: its length
          gives the largest charset ID and its data is referenced
          directly (not copied).  */
5655   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5656   coding->max_charset_id = SCHARS (val) - 1;
5657   coding->safe_charsets = SDATA (val);
5658   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5659   coding->carryover_bytes = 0;
5660
5661   coding_type = CODING_ATTR_TYPE (attrs);
5662   if (EQ (coding_type, Qundecided))
5663     {
           /* No detector function; the raw-text decoder and encoder are
              installed and detection is requested.  */
5664       coding->detector = NULL;
5665       coding->decoder = decode_coding_raw_text;
5666       coding->encoder = encode_coding_raw_text;
5667       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5668     }
5669   else if (EQ (coding_type, Qiso_2022))
5670     {
5671       int i;
5672       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5673
5674       /* Invoke graphic register 0 to plane 0.  */
5675       CODING_ISO_INVOCATION (coding, 0) = 0;
5676       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5677       CODING_ISO_INVOCATION (coding, 1)
5678         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5679       /* Setup the initial status of designation.  */
5680       for (i = 0; i < 4; i++)
5681         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5682       /* Not single shifting initially.  */
5683       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5684       /* Beginning of buffer should also be regarded as bol. */
5685       CODING_ISO_BOL (coding) = 1;
5686       coding->detector = detect_coding_iso_2022;
5687       coding->decoder = decode_coding_iso_2022;
5688       coding->encoder = encode_coding_iso_2022;
5689       if (flags & CODING_ISO_FLAG_SAFE)
5690         coding->mode |= CODING_MODE_SAFE_ENCODING;
5691       coding->common_flags
5692         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5693             | CODING_REQUIRE_FLUSHING_MASK);
5694       if (flags & CODING_ISO_FLAG_COMPOSITION)
5695         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5696       if (flags & CODING_ISO_FLAG_DESIGNATION)
5697         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5698       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5699         {
               /* The safe-charsets attribute is re-read after
                  setup_iso_safe_charsets; NOTE(review): presumably that
                  call updates the attribute in ATTRS -- confirm.  */
5700           setup_iso_safe_charsets (attrs);
5701           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5702           coding->max_charset_id = SCHARS (val) - 1;
5703           coding->safe_charsets = SDATA (val);
5704         }
5705       CODING_ISO_FLAGS (coding) = flags;
5706       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5707       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5708       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5709       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5710     }
5711   else if (EQ (coding_type, Qcharset))
5712     {
5713       coding->detector = detect_coding_charset;
5714       coding->decoder = decode_coding_charset;
5715       coding->encoder = encode_coding_charset;
5716       coding->common_flags
5717         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5718     }
5719   else if (EQ (coding_type, Qutf_8))
5720     {
           /* BOM handling: a cons value selects detection, t selects
              "with BOM", anything else selects "without BOM".  */
5721       val = AREF (attrs, coding_attr_utf_bom);
5722       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5723                                    : EQ (val, Qt) ? utf_with_bom
5724                                    : utf_without_bom);
5725       coding->detector = detect_coding_utf_8;
5726       coding->decoder = decode_coding_utf_8;
5727       coding->encoder = encode_coding_utf_8;
5728       coding->common_flags
5729         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5730       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5731         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5732     }
5733   else if (EQ (coding_type, Qutf_16))
5734     {
           /* Same BOM selection as for UTF-8; the endian attribute is
              big-endian only when it is exactly `big'.  */
5735       val = AREF (attrs, coding_attr_utf_bom);
5736       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5737                                     : EQ (val, Qt) ? utf_with_bom
5738                                     : utf_without_bom);
5739       val = AREF (attrs, coding_attr_utf_16_endian);
5740       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5741                                        : utf_16_little_endian);
5742       CODING_UTF_16_SURROGATE (coding) = 0;
5743       coding->detector = detect_coding_utf_16;
5744       coding->decoder = decode_coding_utf_16;
5745       coding->encoder = encode_coding_utf_16;
5746       coding->common_flags
5747         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5748       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5749         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5750     }
5751   else if (EQ (coding_type, Qccl))
5752     {
5753       coding->detector = detect_coding_ccl;
5754       coding->decoder = decode_coding_ccl;
5755       coding->encoder = encode_coding_ccl;
5756       coding->common_flags
5757         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5758             | CODING_REQUIRE_FLUSHING_MASK);
5759     }
5760   else if (EQ (coding_type, Qemacs_mule))
5761     {
5762       coding->detector = detect_coding_emacs_mule;
5763       coding->decoder = decode_coding_emacs_mule;
5764       coding->encoder = encode_coding_emacs_mule;
5765       coding->common_flags
5766         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5767       coding->spec.emacs_mule.full_support = 1;
5768       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5769           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5770         {
5771           Lisp_Object tail, safe_charsets;
5772           int max_charset_id = 0;
5773
               /* Build a fresh safe-charsets string: find the largest ID
                  in Vemacs_mule_charset_list, fill the string with 255,
                  then store 0 at the index of every listed charset.
                  NOTE(review): CODING->safe_charsets ends up pointing
                  into a newly made Lisp string that this function does
                  not store anywhere else -- confirm its lifetime.  */
5774           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5775                tail = XCDR (tail))
5776             if (max_charset_id < XFASTINT (XCAR (tail)))
5777               max_charset_id = XFASTINT (XCAR (tail));
5778           safe_charsets = make_uninit_string (max_charset_id + 1);
5779           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5780           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5781                tail = XCDR (tail))
5782             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5783           coding->max_charset_id = max_charset_id;
5784           coding->safe_charsets = SDATA (safe_charsets);
               /* NOTE(review): full_support was already set to 1 above,
                  so this assignment does not change anything as
                  written.  */
5785           coding->spec.emacs_mule.full_support = 1;
5786         }
5787       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5788       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5789     }
5790   else if (EQ (coding_type, Qshift_jis))
5791     {
5792       coding->detector = detect_coding_sjis;
5793       coding->decoder = decode_coding_sjis;
5794       coding->encoder = encode_coding_sjis;
5795       coding->common_flags
5796         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5797     }
5798   else if (EQ (coding_type, Qbig5))
5799     {
5800       coding->detector = detect_coding_big5;
5801       coding->decoder = decode_coding_big5;
5802       coding->encoder = encode_coding_big5;
5803       coding->common_flags
5804         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5805     }
5806   else                          /* EQ (coding_type, Qraw_text) */
5807     {
           /* Raw text needs conversion only for eol handling: decoding
              whenever the eol type is not unix, encoding only when the
              eol type is additionally not a vector.  */
5808       coding->detector = NULL;
5809       coding->decoder = decode_coding_raw_text;
5810       coding->encoder = encode_coding_raw_text;
5811       if (! EQ (eol_type, Qunix))
5812         {
5813           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5814           if (! VECTORP (eol_type))
5815             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5816         }
5817
5818     }
5819
5820   return;
5821 }
5822
5823 /* Return a list of charsets supported by CODING.  */
5824
5825 Lisp_Object
5826 coding_charset_list (struct coding_system *coding)
5827 {
5828   Lisp_Object attrs, charset_list;
5829
5830   CODING_GET_INFO (coding, attrs, charset_list);
5831   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5832     {
5833       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5834
5835       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5836         charset_list = Viso_2022_charset_list;
5837     }
5838   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5839     {
5840       charset_list = Vemacs_mule_charset_list;
5841     }
5842   return charset_list;
5843 }
5844
5845
5846 /* Return a list of charsets supported by CODING-SYSTEM.  */
5847
5848 Lisp_Object
5849 coding_system_charset_list (Lisp_Object coding_system)
5850 {
5851   int id;
5852   Lisp_Object attrs, charset_list;
5853
5854   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5855   attrs = CODING_ID_ATTRS (id);
5856
5857   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5858     {
5859       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5860
5861       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5862         charset_list = Viso_2022_charset_list;
5863       else
5864         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5865     }
5866   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5867     {
5868       charset_list = Vemacs_mule_charset_list;
5869     }
5870   else
5871     {
5872       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5873     }
5874   return charset_list;
5875 }
5876
5877
5878 /* Return raw-text or one of its subsidiaries that has the same
5879    eol_type as CODING-SYSTEM.  */
5880
5881 Lisp_Object
5882 raw_text_coding_system (Lisp_Object coding_system)
5883 {
5884   Lisp_Object spec, attrs;
5885   Lisp_Object eol_type, raw_text_eol_type;
5886
5887   if (NILP (coding_system))
5888     return Qraw_text;
5889   spec = CODING_SYSTEM_SPEC (coding_system);
5890   attrs = AREF (spec, 0);
5891
5892   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5893     return coding_system;
5894
5895   eol_type = AREF (spec, 2);
5896   if (VECTORP (eol_type))
5897     return Qraw_text;
5898   spec = CODING_SYSTEM_SPEC (Qraw_text);
5899   raw_text_eol_type = AREF (spec, 2);
5900   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5901           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5902           : AREF (raw_text_eol_type, 2));
5903 }
5904
5905
5906 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5907    the subsidiary that has the same eol-spec as PARENT (if it is not
5908    nil and specifies end-of-line format) or the system's setting
5909    (system_eol_type).  */
5910
5911 Lisp_Object
5912 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5913 {
5914   Lisp_Object spec, eol_type;
5915
5916   if (NILP (coding_system))
5917     coding_system = Qraw_text;
5918   spec = CODING_SYSTEM_SPEC (coding_system);
5919   eol_type = AREF (spec, 2);
5920   if (VECTORP (eol_type))
5921     {
5922       Lisp_Object parent_eol_type;
5923
5924       if (! NILP (parent))
5925         {
5926           Lisp_Object parent_spec;
5927
5928           parent_spec = CODING_SYSTEM_SPEC (parent);
5929           parent_eol_type = AREF (parent_spec, 2);
5930           if (VECTORP (parent_eol_type))
5931             parent_eol_type = system_eol_type;
5932         }
5933       else
5934         parent_eol_type = system_eol_type;
5935       if (EQ (parent_eol_type, Qunix))
5936         coding_system = AREF (eol_type, 0);
5937       else if (EQ (parent_eol_type, Qdos))
5938         coding_system = AREF (eol_type, 1);
5939       else if (EQ (parent_eol_type, Qmac))
5940         coding_system = AREF (eol_type, 2);
5941     }
5942   return coding_system;
5943 }
5944
5945
5946 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5947    decided for writing to a process.  If not, complement them, and
5948    return a new coding system.  */
5949
5950 Lisp_Object
5951 complement_process_encoding_system (Lisp_Object coding_system)
5952 {
5953   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5954   Lisp_Object spec, attrs;
5955   int i;
5956
5957   for (i = 0; i < 3; i++)
5958     {
5959       if (i == 1)
5960         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5961       else if (i == 2)
5962         coding_system = preferred_coding_system ();
5963       spec = CODING_SYSTEM_SPEC (coding_system);
5964       if (NILP (spec))
5965         continue;
5966       attrs = AREF (spec, 0);
5967       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5968         coding_base = CODING_ATTR_BASE_NAME (attrs);
5969       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5970         eol_base = coding_system;
5971       if (! NILP (coding_base) && ! NILP (eol_base))
5972         break;
5973     }
5974
5975   if (i > 0)
5976     /* The original CODING_SYSTEM didn't specify text-conversion or
5977        eol-conversion.  Be sure that we return a fully complemented
5978        coding system.  */
5979     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5980   return coding_system;
5981 }
5982
5983
5984 /* Emacs has a mechanism to automatically detect a coding system if it
5985    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5986    it's impossible to distinguish some coding systems accurately
5987    because they use the same range of codes.  So, at first, coding
5988    systems are grouped into the following categories:
5989
5990    o coding-category-emacs-mule
5991
5992         The category for a coding system which has the same code range
5993         as Emacs' internal format.  Assigned the coding-system (Lisp
5994         symbol) `emacs-mule' by default.
5995
5996    o coding-category-sjis
5997
5998         The category for a coding system which has the same code range
5999         as SJIS.  Assigned the coding-system (Lisp
6000         symbol) `japanese-shift-jis' by default.
6001
6002    o coding-category-iso-7
6003
6004         The category for a coding system which has the same code range
6005         as ISO2022 of 7-bit environment.  This doesn't use any locking
6006         shift and single shift functions.  This can encode/decode all
6007         charsets.  Assigned the coding-system (Lisp symbol)
6008         `iso-2022-7bit' by default.
6009
6010    o coding-category-iso-7-tight
6011
6012         Same as coding-category-iso-7 except that this can
6013         encode/decode only the specified charsets.
6014
6015    o coding-category-iso-8-1
6016
6017         The category for a coding system which has the same code range
6018         as ISO2022 of 8-bit environment and graphic plane 1 used only
6019         for DIMENSION1 charset.  This doesn't use any locking shift
6020         and single shift functions.  Assigned the coding-system (Lisp
6021         symbol) `iso-latin-1' by default.
6022
6023    o coding-category-iso-8-2
6024
6025         The category for a coding system which has the same code range
6026         as ISO2022 of 8-bit environment and graphic plane 1 used only
6027         for DIMENSION2 charset.  This doesn't use any locking shift
6028         and single shift functions.  Assigned the coding-system (Lisp
6029         symbol) `japanese-iso-8bit' by default.
6030
6031    o coding-category-iso-7-else
6032
6033         The category for a coding system which has the same code range
6034         as ISO2022 of 7-bit environment but uses locking shift or
6035         single shift functions.  Assigned the coding-system (Lisp
6036         symbol) `iso-2022-7bit-lock' by default.
6037
6038    o coding-category-iso-8-else
6039
6040         The category for a coding system which has the same code range
6041         as ISO2022 of 8-bit environment but uses locking shift or
6042         single shift functions.  Assigned the coding-system (Lisp
6043         symbol) `iso-2022-8bit-ss2' by default.
6044
6045    o coding-category-big5
6046
6047         The category for a coding system which has the same code range
6048         as BIG5.  Assigned the coding-system (Lisp symbol)
6049         `cn-big5' by default.
6050
6051    o coding-category-utf-8
6052
6053         The category for a coding system which has the same code range
6054         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6055         symbol) `utf-8' by default.
6056
6057    o coding-category-utf-16-be
6058
6059         The category for a coding system in which a text has an
6060         Unicode signature (cf. Unicode Standard) in the order of BIG
6061         endian at the head.  Assigned the coding-system (Lisp symbol)
6062         `utf-16-be' by default.
6063
6064    o coding-category-utf-16-le
6065
6066         The category for a coding system in which a text has an
6067         Unicode signature (cf. Unicode Standard) in the order of
6068         LITTLE endian at the head.  Assigned the coding-system (Lisp
6069         symbol) `utf-16-le' by default.
6070
6071    o coding-category-ccl
6072
6073         The category for a coding system of which encoder/decoder is
6074         written in CCL programs.  The default value is nil, i.e., no
6075         coding system is assigned.
6076
6077    o coding-category-binary
6078
6079         The category for a coding system not categorized in any of the
6080         above.  Assigned the coding-system (Lisp symbol)
6081         `no-conversion' by default.
6082
6083    Each of them is a Lisp symbol and the value is an actual
6084    `coding-system's (this is also a Lisp symbol) assigned by a user.
6085    What Emacs does actually is to detect a category of coding system.
6086    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6087    decide only one possible category, it selects a category of the
6088    highest priority.  Priorities of categories are also specified by a
6089    user in a Lisp variable `coding-category-list'.
6090
6091 */
6092
6093 #define EOL_SEEN_NONE   0
6094 #define EOL_SEEN_LF     1
6095 #define EOL_SEEN_CR     2
6096 #define EOL_SEEN_CRLF   4
6097
6098 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6099    SOURCE is encoded.  If CATEGORY is one of
6100    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6101    two-byte, else they are encoded by one-byte.
6102
6103    Return one of EOL_SEEN_XXX.  */
6104
6105 #define MAX_EOL_CHECK_COUNT 3
6106
6107 static int
6108 detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6109             enum coding_category category)
6110 {
6111   const unsigned char *src = source, *src_end = src + src_bytes;
6112   unsigned char c;
6113   int total  = 0;
6114   int eol_seen = EOL_SEEN_NONE;
6115
6116   if ((1 << category) & CATEGORY_MASK_UTF_16)
6117     {
6118       int msb, lsb;
6119
6120       msb = category == (coding_category_utf_16_le
6121                          | coding_category_utf_16_le_nosig);
6122       lsb = 1 - msb;
6123
6124       while (src + 1 < src_end)
6125         {
6126           c = src[lsb];
6127           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6128             {
6129               int this_eol;
6130
6131               if (c == '\n')
6132                 this_eol = EOL_SEEN_LF;
6133               else if (src + 3 >= src_end
6134                        || src[msb + 2] != 0
6135                        || src[lsb + 2] != '\n')
6136                 this_eol = EOL_SEEN_CR;
6137               else
6138                 {
6139                   this_eol = EOL_SEEN_CRLF;
6140                   src += 2;
6141                 }
6142
6143               if (eol_seen == EOL_SEEN_NONE)
6144                 /* This is the first end-of-line.  */
6145                 eol_seen = this_eol;
6146               else if (eol_seen != this_eol)
6147                 {
6148                   /* The found type is different from what found before.
6149                      Allow for stray ^M characters in DOS EOL files.  */
6150                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6151                       || (eol_seen == EOL_SEEN_CRLF
6152                           && this_eol == EOL_SEEN_CR))
6153                     eol_seen = EOL_SEEN_CRLF;
6154                   else
6155                     {
6156                       eol_seen = EOL_SEEN_LF;
6157                       break;
6158                     }
6159                 }
6160               if (++total == MAX_EOL_CHECK_COUNT)
6161                 break;
6162             }
6163           src += 2;
6164         }
6165     }
6166   else
6167     while (src < src_end)
6168       {
6169         c = *src++;
6170         if (c == '\n' || c == '\r')
6171           {
6172             int this_eol;
6173
6174             if (c == '\n')
6175               this_eol = EOL_SEEN_LF;
6176             else if (src >= src_end || *src != '\n')
6177               this_eol = EOL_SEEN_CR;
6178             else
6179               this_eol = EOL_SEEN_CRLF, src++;
6180
6181             if (eol_seen == EOL_SEEN_NONE)
6182               /* This is the first end-of-line.  */
6183               eol_seen = this_eol;
6184             else if (eol_seen != this_eol)
6185               {
6186                 /* The found type is different from what found before.
6187                    Allow for stray ^M characters in DOS EOL files.  */
6188                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6189                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6190                   eol_seen = EOL_SEEN_CRLF;
6191                 else
6192                   {
6193                     eol_seen = EOL_SEEN_LF;
6194                     break;
6195                   }
6196               }
6197             if (++total == MAX_EOL_CHECK_COUNT)
6198               break;
6199           }
6200       }
6201   return eol_seen;
6202 }
6203
6204
6205 static Lisp_Object
6206 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6207 {
6208   Lisp_Object eol_type;
6209
6210   eol_type = CODING_ID_EOL_TYPE (coding->id);
6211   if (eol_seen & EOL_SEEN_LF)
6212     {
6213       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6214       eol_type = Qunix;
6215     }
6216   else if (eol_seen & EOL_SEEN_CRLF)
6217     {
6218       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6219       eol_type = Qdos;
6220     }
6221   else if (eol_seen & EOL_SEEN_CR)
6222     {
6223       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6224       eol_type = Qmac;
6225     }
6226   return eol_type;
6227 }
6228
6229 /* Detect how a text specified in CODING is encoded.  If a coding
6230    system is detected, update fields of CODING by the detected coding
6231    system.  */
6232
6233 void
6234 detect_coding (struct coding_system *coding)
6235 {
6236   const unsigned char *src, *src_end;
6237   int saved_mode = coding->mode;
6238
6239   coding->consumed = coding->consumed_char = 0;
6240   coding->produced = coding->produced_char = 0;
6241   coding_set_source (coding);
6242
6243   src_end = coding->source + coding->src_bytes;
6244   coding->head_ascii = 0;
6245
6246   /* If we have not yet decided the text encoding type, detect it
6247      now.  */
6248   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6249     {
6250       int c, i;
6251       struct coding_detection_info detect_info;
6252       int null_byte_found = 0, eight_bit_found = 0;
6253
6254       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6255       for (src = coding->source; src < src_end; src++)
6256         {
6257           c = *src;
6258           if (c & 0x80)
6259             {
6260               eight_bit_found = 1;
6261               if (null_byte_found)
6262                 break;
6263             }
6264           else if (c < 0x20)
6265             {
6266               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6267                   && ! inhibit_iso_escape_detection
6268                   && ! detect_info.checked)
6269                 {
6270                   if (detect_coding_iso_2022 (coding, &detect_info))
6271                     {
6272                       /* We have scanned the whole data.  */
6273                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6274                         {
6275                           /* We didn't find an 8-bit code.  We may
6276                              have found a null-byte, but it's very
6277                              rare that a binary file conforms to
6278                              ISO-2022.  */
6279                           src = src_end;
6280                           coding->head_ascii = src - coding->source;
6281                         }
6282                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6283                       break;
6284                     }
6285                 }
6286               else if (! c && !inhibit_null_byte_detection)
6287                 {
6288                   null_byte_found = 1;
6289                   if (eight_bit_found)
6290                     break;
6291                 }
6292               if (! eight_bit_found)
6293                 coding->head_ascii++;
6294             }
6295           else if (! eight_bit_found)
6296             coding->head_ascii++;
6297         }
6298
6299       if (null_byte_found || eight_bit_found
6300           || coding->head_ascii < coding->src_bytes
6301           || detect_info.found)
6302         {
6303           enum coding_category category;
6304           struct coding_system *this;
6305
6306           if (coding->head_ascii == coding->src_bytes)
6307             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6308             for (i = 0; i < coding_category_raw_text; i++)
6309               {
6310                 category = coding_priorities[i];
6311                 this = coding_categories + category;
6312                 if (detect_info.found & (1 << category))
6313                   break;
6314               }
6315           else
6316             {
6317               if (null_byte_found)
6318                 {
6319                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6320                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6321                 }
6322               for (i = 0; i < coding_category_raw_text; i++)
6323                 {
6324                   category = coding_priorities[i];
6325                   this = coding_categories + category;
6326                   if (this->id < 0)
6327                     {
6328                       /* No coding system of this category is defined.  */
6329                       detect_info.rejected |= (1 << category);
6330                     }
6331                   else if (category >= coding_category_raw_text)
6332                     continue;
6333                   else if (detect_info.checked & (1 << category))
6334                     {
6335                       if (detect_info.found & (1 << category))
6336                         break;
6337                     }
6338                   else if ((*(this->detector)) (coding, &detect_info)
6339                            && detect_info.found & (1 << category))
6340                     {
6341                       if (category == coding_category_utf_16_auto)
6342                         {
6343                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6344                             category = coding_category_utf_16_le;
6345                           else
6346                             category = coding_category_utf_16_be;
6347                         }
6348                       break;
6349                     }
6350                 }
6351             }
6352
6353           if (i < coding_category_raw_text)
6354             setup_coding_system (CODING_ID_NAME (this->id), coding);
6355           else if (null_byte_found)
6356             setup_coding_system (Qno_conversion, coding);
6357           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6358                    == CATEGORY_MASK_ANY)
6359             setup_coding_system (Qraw_text, coding);
6360           else if (detect_info.rejected)
6361             for (i = 0; i < coding_category_raw_text; i++)
6362               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6363                 {
6364                   this = coding_categories + coding_priorities[i];
6365                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6366                   break;
6367                 }
6368         }
6369     }
6370   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6371            == coding_category_utf_8_auto)
6372     {
6373       Lisp_Object coding_systems;
6374       struct coding_detection_info detect_info;
6375
6376       coding_systems
6377         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6378       detect_info.found = detect_info.rejected = 0;
6379       coding->head_ascii = 0;
6380       if (CONSP (coding_systems)
6381           && detect_coding_utf_8 (coding, &detect_info))
6382         {
6383           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6384             setup_coding_system (XCAR (coding_systems), coding);
6385           else
6386             setup_coding_system (XCDR (coding_systems), coding);
6387         }
6388     }
6389   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6390            == coding_category_utf_16_auto)
6391     {
6392       Lisp_Object coding_systems;
6393       struct coding_detection_info detect_info;
6394
6395       coding_systems
6396         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6397       detect_info.found = detect_info.rejected = 0;
6398       coding->head_ascii = 0;
6399       if (CONSP (coding_systems)
6400           && detect_coding_utf_16 (coding, &detect_info))
6401         {
6402           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6403             setup_coding_system (XCAR (coding_systems), coding);
6404           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6405             setup_coding_system (XCDR (coding_systems), coding);
6406         }
6407     }
6408   coding->mode = saved_mode;
6409 }
6410
6411
6412 static void
6413 decode_eol (struct coding_system *coding)
6414 {
6415   Lisp_Object eol_type;
6416   unsigned char *p, *pbeg, *pend;
6417
6418   eol_type = CODING_ID_EOL_TYPE (coding->id);
6419   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6420     return;
6421
6422   if (NILP (coding->dst_object))
6423     pbeg = coding->destination;
6424   else
6425     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6426   pend = pbeg + coding->produced;
6427
6428   if (VECTORP (eol_type))
6429     {
6430       int eol_seen = EOL_SEEN_NONE;
6431
6432       for (p = pbeg; p < pend; p++)
6433         {
6434           if (*p == '\n')
6435             eol_seen |= EOL_SEEN_LF;
6436           else if (*p == '\r')
6437             {
6438               if (p + 1 < pend && *(p + 1) == '\n')
6439                 {
6440                   eol_seen |= EOL_SEEN_CRLF;
6441                   p++;
6442                 }
6443               else
6444                 eol_seen |= EOL_SEEN_CR;
6445             }
6446         }
6447       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6448       if ((eol_seen & EOL_SEEN_CRLF) != 0
6449           && (eol_seen & EOL_SEEN_CR) != 0
6450           && (eol_seen & EOL_SEEN_LF) == 0)
6451         eol_seen = EOL_SEEN_CRLF;
6452       else if (eol_seen != EOL_SEEN_NONE
6453           && eol_seen != EOL_SEEN_LF
6454           && eol_seen != EOL_SEEN_CRLF
6455           && eol_seen != EOL_SEEN_CR)
6456         eol_seen = EOL_SEEN_LF;
6457       if (eol_seen != EOL_SEEN_NONE)
6458         eol_type = adjust_coding_eol_type (coding, eol_seen);
6459     }
6460
6461   if (EQ (eol_type, Qmac))
6462     {
6463       for (p = pbeg; p < pend; p++)
6464         if (*p == '\r')
6465           *p = '\n';
6466     }
6467   else if (EQ (eol_type, Qdos))
6468     {
6469       int n = 0;
6470
6471       if (NILP (coding->dst_object))
6472         {
6473           /* Start deleting '\r' from the tail to minimize the memory
6474              movement.  */
6475           for (p = pend - 2; p >= pbeg; p--)
6476             if (*p == '\r')
6477               {
6478                 memmove (p, p + 1, pend-- - p - 1);
6479                 n++;
6480               }
6481         }
6482       else
6483         {
6484           int pos_byte = coding->dst_pos_byte;
6485           int pos = coding->dst_pos;
6486           int pos_end = pos + coding->produced_char - 1;
6487
6488           while (pos < pos_end)
6489             {
6490               p = BYTE_POS_ADDR (pos_byte);
6491               if (*p == '\r' && p[1] == '\n')
6492                 {
6493                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6494                   n++;
6495                   pos_end--;
6496                 }
6497               pos++;
6498               if (coding->dst_multibyte)
6499                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6500               else
6501                 pos_byte++;
6502             }
6503         }
6504       coding->produced -= n;
6505       coding->produced_char -= n;
6506     }
6507 }
6508
6509
6510 /* Return a translation table (or list of them) from coding system
6511    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6512    decoding (ENCODEP is zero). */
6513
6514 static Lisp_Object
6515 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6516 {
6517   Lisp_Object standard, translation_table;
6518   Lisp_Object val;
6519
6520   if (NILP (Venable_character_translation))
6521     {
6522       if (max_lookup)
6523         *max_lookup = 0;
6524       return Qnil;
6525     }
6526   if (encodep)
6527     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6528       standard = Vstandard_translation_table_for_encode;
6529   else
6530     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6531       standard = Vstandard_translation_table_for_decode;
6532   if (NILP (translation_table))
6533     translation_table = standard;
6534   else
6535     {
6536       if (SYMBOLP (translation_table))
6537         translation_table = Fget (translation_table, Qtranslation_table);
6538       else if (CONSP (translation_table))
6539         {
6540           translation_table = Fcopy_sequence (translation_table);
6541           for (val = translation_table; CONSP (val); val = XCDR (val))
6542             if (SYMBOLP (XCAR (val)))
6543               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6544         }
6545       if (CHAR_TABLE_P (standard))
6546         {
6547           if (CONSP (translation_table))
6548             translation_table = nconc2 (translation_table,
6549                                         Fcons (standard, Qnil));
6550           else
6551             translation_table = Fcons (translation_table,
6552                                        Fcons (standard, Qnil));
6553         }
6554     }
6555
6556   if (max_lookup)
6557     {
6558       *max_lookup = 1;
6559       if (CHAR_TABLE_P (translation_table)
6560           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6561         {
6562           val = XCHAR_TABLE (translation_table)->extras[1];
6563           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6564             *max_lookup = XFASTINT (val);
6565         }
6566       else if (CONSP (translation_table))
6567         {
6568           Lisp_Object tail;
6569
6570           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6571             if (CHAR_TABLE_P (XCAR (tail))
6572                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6573               {
6574                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6575                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6576                   *max_lookup = XFASTINT (tailval);
6577               }
6578         }
6579     }
6580   return translation_table;
6581 }
6582
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables.

   When an entry is itself a character, C is replaced by it and TRANS
   is reset to Qnil; in a list the lookup then continues with the next
   table.  When an entry is neither nil nor a character, the lookup
   stops with that entry left in TRANS.  TRANS is Qnil if no such
   entry is found.

   C and TRANS must be lvalues; TABLE and C are evaluated more than
   once.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          {                                                     \
            if (! CHAR_TABLE_P (XCAR (tail)))                   \
              continue;                                         \
            trans = CHAR_TABLE_REF (XCAR (tail), c);            \
            if (NILP (trans))                                   \
              continue;                                         \
            if (! CHARACTERP (trans))                           \
              break;                                            \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
  } while (0)
6607
6608
6609 /* Return a translation of character(s) at BUF according to TRANS.
6610    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6611    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6612    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6613    translation is found, and Qnil if not found..
6614    If BUF is too short to lookup characters in FROM, return Qt.  */
6615
6616 static Lisp_Object
6617 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6618 {
6619
6620   if (INTEGERP (trans))
6621     return trans;
6622   for (; CONSP (trans); trans = XCDR (trans))
6623     {
6624       Lisp_Object val = XCAR (trans);
6625       Lisp_Object from = XCAR (val);
6626       int len = ASIZE (from);
6627       int i;
6628
6629       for (i = 0; i < len; i++)
6630         {
6631           if (buf + i == buf_end)
6632             return Qt;
6633           if (XINT (AREF (from, i)) != buf[i])
6634             break;
6635         }
6636       if (i == len)
6637         return val;
6638     }
6639   return Qnil;
6640 }
6641
6642
6643 static int
6644 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6645                int last_block)
6646 {
6647   unsigned char *dst = coding->destination + coding->produced;
6648   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6649   EMACS_INT produced;
6650   EMACS_INT produced_chars = 0;
6651   int carryover = 0;
6652
6653   if (! coding->chars_at_source)
6654     {
6655       /* Source characters are in coding->charbuf.  */
6656       int *buf = coding->charbuf;
6657       int *buf_end = buf + coding->charbuf_used;
6658
6659       if (EQ (coding->src_object, coding->dst_object))
6660         {
6661           coding_set_source (coding);
6662           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6663         }
6664
6665       while (buf < buf_end)
6666         {
6667           int c = *buf, i;
6668
6669           if (c >= 0)
6670             {
6671               int from_nchars = 1, to_nchars = 1;
6672               Lisp_Object trans = Qnil;
6673
6674               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6675               if (! NILP (trans))
6676                 {
6677                   trans = get_translation (trans, buf, buf_end);
6678                   if (INTEGERP (trans))
6679                     c = XINT (trans);
6680                   else if (CONSP (trans))
6681                     {
6682                       from_nchars = ASIZE (XCAR (trans));
6683                       trans = XCDR (trans);
6684                       if (INTEGERP (trans))
6685                         c = XINT (trans);
6686                       else
6687                         {
6688                           to_nchars = ASIZE (trans);
6689                           c = XINT (AREF (trans, 0));
6690                         }
6691                     }
6692                   else if (EQ (trans, Qt) && ! last_block)
6693                     break;
6694                 }
6695
6696               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6697                 {
6698                   dst = alloc_destination (coding,
6699                                            buf_end - buf
6700                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6701                                            dst);
6702                   if (EQ (coding->src_object, coding->dst_object))
6703                     {
6704                       coding_set_source (coding);
6705                       dst_end = (((unsigned char *) coding->source)
6706                                  + coding->consumed);
6707                     }
6708                   else
6709                     dst_end = coding->destination + coding->dst_bytes;
6710                 }
6711
6712               for (i = 0; i < to_nchars; i++)
6713                 {
6714                   if (i > 0)
6715                     c = XINT (AREF (trans, i));
6716                   if (coding->dst_multibyte
6717                       || ! CHAR_BYTE8_P (c))
6718                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6719                   else
6720                     *dst++ = CHAR_TO_BYTE8 (c);
6721                 }
6722               produced_chars += to_nchars;
6723               buf += from_nchars;
6724             }
6725           else
6726             /* This is an annotation datum.  (-C) is the length.  */
6727             buf += -c;
6728         }
6729       carryover = buf_end - buf;
6730     }
6731   else
6732     {
6733       /* Source characters are at coding->source.  */
6734       const unsigned char *src = coding->source;
6735       const unsigned char *src_end = src + coding->consumed;
6736
6737       if (EQ (coding->dst_object, coding->src_object))
6738         dst_end = (unsigned char *) src;
6739       if (coding->src_multibyte != coding->dst_multibyte)
6740         {
6741           if (coding->src_multibyte)
6742             {
6743               int multibytep = 1;
6744               EMACS_INT consumed_chars = 0;
6745
6746               while (1)
6747                 {
6748                   const unsigned char *src_base = src;
6749                   int c;
6750
6751                   ONE_MORE_BYTE (c);
6752                   if (dst == dst_end)
6753                     {
6754                       if (EQ (coding->src_object, coding->dst_object))
6755                         dst_end = (unsigned char *) src;
6756                       if (dst == dst_end)
6757                         {
6758                           EMACS_INT offset = src - coding->source;
6759
6760                           dst = alloc_destination (coding, src_end - src + 1,
6761                                                    dst);
6762                           dst_end = coding->destination + coding->dst_bytes;
6763                           coding_set_source (coding);
6764                           src = coding->source + offset;
6765                           src_end = coding->source + coding->src_bytes;
6766                           if (EQ (coding->src_object, coding->dst_object))
6767                             dst_end = (unsigned char *) src;
6768                         }
6769                     }
6770                   *dst++ = c;
6771                   produced_chars++;
6772                 }
6773             no_more_source:
6774               ;
6775             }
6776           else
6777             while (src < src_end)
6778               {
6779                 int multibytep = 1;
6780                 int c = *src++;
6781
6782                 if (dst >= dst_end - 1)
6783                   {
6784                     if (EQ (coding->src_object, coding->dst_object))
6785                       dst_end = (unsigned char *) src;
6786                     if (dst >= dst_end - 1)
6787                       {
6788                         EMACS_INT offset = src - coding->source;
6789                         EMACS_INT more_bytes;
6790
6791                         if (EQ (coding->src_object, coding->dst_object))
6792                           more_bytes = ((src_end - src) / 2) + 2;
6793                         else
6794                           more_bytes = src_end - src + 2;
6795                         dst = alloc_destination (coding, more_bytes, dst);
6796                         dst_end = coding->destination + coding->dst_bytes;
6797                         coding_set_source (coding);
6798                         src = coding->source + offset;
6799                         src_end = coding->source + coding->src_bytes;
6800                         if (EQ (coding->src_object, coding->dst_object))
6801                           dst_end = (unsigned char *) src;
6802                       }
6803                   }
6804                 EMIT_ONE_BYTE (c);
6805               }
6806         }
6807       else
6808         {
6809           if (!EQ (coding->src_object, coding->dst_object))
6810             {
6811               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6812
6813               if (require > 0)
6814                 {
6815                   EMACS_INT offset = src - coding->source;
6816
6817                   dst = alloc_destination (coding, require, dst);
6818                   coding_set_source (coding);
6819                   src = coding->source + offset;
6820                   src_end = coding->source + coding->src_bytes;
6821                 }
6822             }
6823           produced_chars = coding->consumed_char;
6824           while (src < src_end)
6825             *dst++ = *src++;
6826         }
6827     }
6828
6829   produced = dst - (coding->destination + coding->produced);
6830   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6831     insert_from_gap (produced_chars, produced);
6832   coding->produced += produced;
6833   coding->produced_char += produced_chars;
6834   return carryover;
6835 }
6836
6837 /* Compose text in CODING->object according to the annotation data at
6838    CHARBUF.  CHARBUF is an array:
6839      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6840  */
6841
6842 static INLINE void
6843 produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6844 {
6845   int len;
6846   EMACS_INT to;
6847   enum composition_method method;
6848   Lisp_Object components;
6849
6850   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6851   to = pos + charbuf[2];
6852   method = (enum composition_method) (charbuf[4]);
6853
6854   if (method == COMPOSITION_RELATIVE)
6855     components = Qnil;
6856   else
6857     {
6858       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6859       int i, j;
6860
6861       if (method == COMPOSITION_WITH_RULE)
6862         len = charbuf[2] * 3 - 2;
6863       charbuf += MAX_ANNOTATION_LENGTH;
6864       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6865       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6866         {
6867           if (charbuf[i] >= 0)
6868             args[j] = make_number (charbuf[i]);
6869           else
6870             {
6871               i++;
6872               args[j] = make_number (charbuf[i] % 0x100);
6873             }
6874         }
6875       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6876     }
6877   compose_text (pos, to, components, Qnil, coding->dst_object);
6878 }
6879
6880
6881 /* Put `charset' property on text in CODING->object according to
6882    the annotation data at CHARBUF.  CHARBUF is an array:
6883      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6884  */
6885
6886 static INLINE void
6887 produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6888 {
6889   EMACS_INT from = pos - charbuf[2];
6890   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6891
6892   Fput_text_property (make_number (from), make_number (pos),
6893                       Qcharset, CHARSET_NAME (charset),
6894                       coding->dst_object);
6895 }
6896
6897
/* Number of `int' elements first requested for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record the number of
   elements in CODING->charbuf_size.  CHARBUF_SIZE elements are tried
   first; while alloca yields NULL the size is halved, as long as it
   stays above 1024.  If no allocation succeeds, record
   CODING_RESULT_INSUFFICIENT_MEM and return CODING->result from the
   *calling* function.

   As the memory comes from alloca, this must be expanded directly in
   the function that uses CODING->charbuf, and that function must
   return int.
   NOTE(review): most alloca implementations never return NULL, in
   which case the retry and the failure branch are never taken --
   confirm for the supported platforms.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size;                                                           \
                                                                        \
    coding->charbuf = NULL;                                             \
    for (size = CHARBUF_SIZE; size > 1024; size >>= 1)                  \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
6919
6920
6921 static void
6922 produce_annotation (struct coding_system *coding, EMACS_INT pos)
6923 {
6924   int *charbuf = coding->charbuf;
6925   int *charbuf_end = charbuf + coding->charbuf_used;
6926
6927   if (NILP (coding->dst_object))
6928     return;
6929
6930   while (charbuf < charbuf_end)
6931     {
6932       if (*charbuf >= 0)
6933         pos++, charbuf++;
6934       else
6935         {
6936           int len = -*charbuf;
6937
6938           if (len > 2)
6939             switch (charbuf[1])
6940               {
6941               case CODING_ANNOTATE_COMPOSITION_MASK:
6942                 produce_composition (coding, charbuf, pos);
6943                 break;
6944               case CODING_ANNOTATE_CHARSET_MASK:
6945                 produce_charset (coding, charbuf, pos);
6946                 break;
6947               }
6948           charbuf += len;
6949         }
6950     }
6951 }
6952
6953 /* Decode the data at CODING->src_object into CODING->dst_object.
6954    CODING->src_object is a buffer, a string, or nil.
6955    CODING->dst_object is a buffer.
6956
6957    If CODING->src_object is a buffer, it must be the current buffer.
6958    In this case, if CODING->src_pos is positive, it is a position of
6959    the source text in the buffer, otherwise, the source text is in the
6960    gap area of the buffer, and CODING->src_pos specifies the offset of
6961    the text from GPT (which must be the same as PT).  If this is the
6962    same buffer as CODING->dst_object, CODING->src_pos must be
6963    negative.
6964
6965    If CODING->src_object is a string, CODING->src_pos is an index to
6966    that string.
6967
6968    If CODING->src_object is nil, CODING->source must already point to
6969    the non-relocatable memory area.  In this case, CODING->src_pos is
6970    an offset from CODING->source.
6971
6972    The decoded data is inserted at the current point of the buffer
6973    CODING->dst_object.
6974 */
6975
6976 static int
6977 decode_coding (struct coding_system *coding)
6978 {
6979   Lisp_Object attrs;
6980   Lisp_Object undo_list;
6981   Lisp_Object translation_table;
6982   struct ccl_spec cclspec;
6983   int carryover;
6984   int i;
6985
6986   if (BUFFERP (coding->src_object)
6987       && coding->src_pos > 0
6988       && coding->src_pos < GPT
6989       && coding->src_pos + coding->src_chars > GPT)
6990     move_gap_both (coding->src_pos, coding->src_pos_byte);
6991
6992   undo_list = Qt;
6993   if (BUFFERP (coding->dst_object))
6994     {
6995       if (current_buffer != XBUFFER (coding->dst_object))
6996         set_buffer_internal (XBUFFER (coding->dst_object));
6997       if (GPT != PT)
6998         move_gap_both (PT, PT_BYTE);
6999       undo_list = BVAR (current_buffer, undo_list);
7000       BVAR (current_buffer, undo_list) = Qt;
7001     }
7002
7003   coding->consumed = coding->consumed_char = 0;
7004   coding->produced = coding->produced_char = 0;
7005   coding->chars_at_source = 0;
7006   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7007   coding->errors = 0;
7008
7009   ALLOC_CONVERSION_WORK_AREA (coding);
7010
7011   attrs = CODING_ID_ATTRS (coding->id);
7012   translation_table = get_translation_table (attrs, 0, NULL);
7013
7014   carryover = 0;
7015   if (coding->decoder == decode_coding_ccl)
7016     {
7017       coding->spec.ccl = &cclspec;
7018       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7019     }
7020   do
7021     {
7022       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7023
7024       coding_set_source (coding);
7025       coding->annotated = 0;
7026       coding->charbuf_used = carryover;
7027       (*(coding->decoder)) (coding);
7028       coding_set_destination (coding);
7029       carryover = produce_chars (coding, translation_table, 0);
7030       if (coding->annotated)
7031         produce_annotation (coding, pos);
7032       for (i = 0; i < carryover; i++)
7033         coding->charbuf[i]
7034           = coding->charbuf[coding->charbuf_used - carryover + i];
7035     }
7036   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7037          || (coding->consumed < coding->src_bytes
7038              && (coding->result == CODING_RESULT_SUCCESS
7039                  || coding->result == CODING_RESULT_INVALID_SRC)));
7040
7041   if (carryover > 0)
7042     {
7043       coding_set_destination (coding);
7044       coding->charbuf_used = carryover;
7045       produce_chars (coding, translation_table, 1);
7046     }
7047
7048   coding->carryover_bytes = 0;
7049   if (coding->consumed < coding->src_bytes)
7050     {
7051       int nbytes = coding->src_bytes - coding->consumed;
7052       const unsigned char *src;
7053
7054       coding_set_source (coding);
7055       coding_set_destination (coding);
7056       src = coding->source + coding->consumed;
7057
7058       if (coding->mode & CODING_MODE_LAST_BLOCK)
7059         {
7060           /* Flush out unprocessed data as binary chars.  We are sure
7061              that the number of data is less than the size of
7062              coding->charbuf.  */
7063           coding->charbuf_used = 0;
7064           coding->chars_at_source = 0;
7065
7066           while (nbytes-- > 0)
7067             {
7068               int c = *src++;
7069
7070               if (c & 0x80)
7071                 c = BYTE8_TO_CHAR (c);
7072               coding->charbuf[coding->charbuf_used++] = c;
7073             }
7074           produce_chars (coding, Qnil, 1);
7075         }
7076       else
7077         {
7078           /* Record unprocessed bytes in coding->carryover.  We are
7079              sure that the number of data is less than the size of
7080              coding->carryover.  */
7081           unsigned char *p = coding->carryover;
7082
7083           if (nbytes > sizeof coding->carryover)
7084             nbytes = sizeof coding->carryover;
7085           coding->carryover_bytes = nbytes;
7086           while (nbytes-- > 0)
7087             *p++ = *src++;
7088         }
7089       coding->consumed = coding->src_bytes;
7090     }
7091
7092   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7093       && !inhibit_eol_conversion)
7094     decode_eol (coding);
7095   if (BUFFERP (coding->dst_object))
7096     {
7097       BVAR (current_buffer, undo_list) = undo_list;
7098       record_insert (coding->dst_pos, coding->produced_char);
7099     }
7100   return coding->result;
7101 }
7102
7103
7104 /* Extract an annotation datum from a composition starting at POS and
7105    ending before LIMIT of CODING->src_object (buffer or string), store
7106    the data in BUF, set *STOP to a starting position of the next
7107    composition (if any) or to LIMIT, and return the address of the
7108    next element of BUF.
7109
7110    If such an annotation is not found, set *STOP to a starting
7111    position of a composition after POS (if any) or to LIMIT, and
7112    return BUF.  */
7113
7114 static INLINE int *
7115 handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7116                                struct coding_system *coding, int *buf,
7117                                EMACS_INT *stop)
7118 {
7119   EMACS_INT start, end;
7120   Lisp_Object prop;
7121
7122   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7123       || end > limit)
7124     *stop = limit;
7125   else if (start > pos)
7126     *stop = start;
7127   else
7128     {
7129       if (start == pos)
7130         {
7131           /* We found a composition.  Store the corresponding
7132              annotation data in BUF.  */
7133           int *head = buf;
7134           enum composition_method method = COMPOSITION_METHOD (prop);
7135           int nchars = COMPOSITION_LENGTH (prop);
7136
7137           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7138           if (method != COMPOSITION_RELATIVE)
7139             {
7140               Lisp_Object components;
7141               int len, i, i_byte;
7142
7143               components = COMPOSITION_COMPONENTS (prop);
7144               if (VECTORP (components))
7145                 {
7146                   len = XVECTOR (components)->size;
7147                   for (i = 0; i < len; i++)
7148                     *buf++ = XINT (AREF (components, i));
7149                 }
7150               else if (STRINGP (components))
7151                 {
7152                   len = SCHARS (components);
7153                   i = i_byte = 0;
7154                   while (i < len)
7155                     {
7156                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7157                       buf++;
7158                     }
7159                 }
7160               else if (INTEGERP (components))
7161                 {
7162                   len = 1;
7163                   *buf++ = XINT (components);
7164                 }
7165               else if (CONSP (components))
7166                 {
7167                   for (len = 0; CONSP (components);
7168                        len++, components = XCDR (components))
7169                     *buf++ = XINT (XCAR (components));
7170                 }
7171               else
7172                 abort ();
7173               *head -= len;
7174             }
7175         }
7176
7177       if (find_composition (end, limit, &start, &end, &prop,
7178                             coding->src_object)
7179           && end <= limit)
7180         *stop = start;
7181       else
7182         *stop = limit;
7183     }
7184   return buf;
7185 }
7186
7187
7188 /* Extract an annotation datum from a text property `charset' at POS of
7189    CODING->src_object (buffer of string), store the data in BUF, set
7190    *STOP to the position where the value of `charset' property changes
7191    (limiting by LIMIT), and return the address of the next element of
7192    BUF.
7193
7194    If the property value is nil, set *STOP to the position where the
7195    property value is non-nil (limiting by LIMIT), and return BUF.  */
7196
7197 static INLINE int *
7198 handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7199                            struct coding_system *coding, int *buf,
7200                            EMACS_INT *stop)
7201 {
7202   Lisp_Object val, next;
7203   int id;
7204
7205   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7206   if (! NILP (val) && CHARSETP (val))
7207     id = XINT (CHARSET_SYMBOL_ID (val));
7208   else
7209     id = -1;
7210   ADD_CHARSET_DATA (buf, 0, id);
7211   next = Fnext_single_property_change (make_number (pos), Qcharset,
7212                                        coding->src_object,
7213                                        make_number (limit));
7214   *stop = XINT (next);
7215   return buf;
7216 }
7217
7218
7219 static void
7220 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7221                int max_lookup)
7222 {
7223   int *buf = coding->charbuf;
7224   int *buf_end = coding->charbuf + coding->charbuf_size;
7225   const unsigned char *src = coding->source + coding->consumed;
7226   const unsigned char *src_end = coding->source + coding->src_bytes;
7227   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7228   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7229   int multibytep = coding->src_multibyte;
7230   Lisp_Object eol_type;
7231   int c;
7232   EMACS_INT stop, stop_composition, stop_charset;
7233   int *lookup_buf = NULL;
7234
7235   if (! NILP (translation_table))
7236     lookup_buf = alloca (sizeof (int) * max_lookup);
7237
7238   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7239   if (VECTORP (eol_type))
7240     eol_type = Qunix;
7241
7242   /* Note: composition handling is not yet implemented.  */
7243   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7244
7245   if (NILP (coding->src_object))
7246     stop = stop_composition = stop_charset = end_pos;
7247   else
7248     {
7249       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7250         stop = stop_composition = pos;
7251       else
7252         stop = stop_composition = end_pos;
7253       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7254         stop = stop_charset = pos;
7255       else
7256         stop_charset = end_pos;
7257     }
7258
7259   /* Compensate for CRLF and conversion.  */
7260   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7261   while (buf < buf_end)
7262     {
7263       Lisp_Object trans;
7264
7265       if (pos == stop)
7266         {
7267           if (pos == end_pos)
7268             break;
7269           if (pos == stop_composition)
7270             buf = handle_composition_annotation (pos, end_pos, coding,
7271                                                  buf, &stop_composition);
7272           if (pos == stop_charset)
7273             buf = handle_charset_annotation (pos, end_pos, coding,
7274                                              buf, &stop_charset);
7275           stop = (stop_composition < stop_charset
7276                   ? stop_composition : stop_charset);
7277         }
7278
7279       if (! multibytep)
7280         {
7281           EMACS_INT bytes;
7282
7283           if (coding->encoder == encode_coding_raw_text
7284               || coding->encoder == encode_coding_ccl)
7285             c = *src++, pos++;
7286           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7287             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7288           else
7289             c = BYTE8_TO_CHAR (*src), src++, pos++;
7290         }
7291       else
7292         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7293       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7294         c = '\n';
7295       if (! EQ (eol_type, Qunix))
7296         {
7297           if (c == '\n')
7298             {
7299               if (EQ (eol_type, Qdos))
7300                 *buf++ = '\r';
7301               else
7302                 c = '\r';
7303             }
7304         }
7305
7306       trans = Qnil;
7307       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7308       if (NILP (trans))
7309         *buf++ = c;
7310       else
7311         {
7312           int from_nchars = 1, to_nchars = 1;
7313           int *lookup_buf_end;
7314           const unsigned char *p = src;
7315           int i;
7316
7317           lookup_buf[0] = c;
7318           for (i = 1; i < max_lookup && p < src_end; i++)
7319             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7320           lookup_buf_end = lookup_buf + i;
7321           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7322           if (INTEGERP (trans))
7323             c = XINT (trans);
7324           else if (CONSP (trans))
7325             {
7326               from_nchars = ASIZE (XCAR (trans));
7327               trans = XCDR (trans);
7328               if (INTEGERP (trans))
7329                 c = XINT (trans);
7330               else
7331                 {
7332                   to_nchars = ASIZE (trans);
7333                   if (buf + to_nchars > buf_end)
7334                     break;
7335                   c = XINT (AREF (trans, 0));
7336                 }
7337             }
7338           else
7339             break;
7340           *buf++ = c;
7341           for (i = 1; i < to_nchars; i++)
7342             *buf++ = XINT (AREF (trans, i));
7343           for (i = 1; i < from_nchars; i++, pos++)
7344             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7345         }
7346     }
7347
7348   coding->consumed = src - coding->source;
7349   coding->consumed_char = pos - coding->src_pos;
7350   coding->charbuf_used = buf - coding->charbuf;
7351   coding->chars_at_source = 0;
7352 }
7353
7354
7355 /* Encode the text at CODING->src_object into CODING->dst_object.
7356    CODING->src_object is a buffer or a string.
7357    CODING->dst_object is a buffer or nil.
7358
7359    If CODING->src_object is a buffer, it must be the current buffer.
7360    In this case, if CODING->src_pos is positive, it is a position of
7361    the source text in the buffer, otherwise. the source text is in the
7362    gap area of the buffer, and coding->src_pos specifies the offset of
7363    the text from GPT (which must be the same as PT).  If this is the
7364    same buffer as CODING->dst_object, CODING->src_pos must be
7365    negative and CODING should not have `pre-write-conversion'.
7366
7367    If CODING->src_object is a string, CODING should not have
7368    `pre-write-conversion'.
7369
7370    If CODING->dst_object is a buffer, the encoded data is inserted at
7371    the current point of that buffer.
7372
7373    If CODING->dst_object is nil, the encoded data is placed at the
7374    memory area specified by CODING->destination.  */
7375
7376 static int
7377 encode_coding (struct coding_system *coding)
7378 {
7379   Lisp_Object attrs;
7380   Lisp_Object translation_table;
7381   int max_lookup;
7382   struct ccl_spec cclspec;
7383
7384   attrs = CODING_ID_ATTRS (coding->id);
7385   if (coding->encoder == encode_coding_raw_text)
7386     translation_table = Qnil, max_lookup = 0;
7387   else
7388     translation_table = get_translation_table (attrs, 1, &max_lookup);
7389
7390   if (BUFFERP (coding->dst_object))
7391     {
7392       set_buffer_internal (XBUFFER (coding->dst_object));
7393       coding->dst_multibyte
7394         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7395     }
7396
7397   coding->consumed = coding->consumed_char = 0;
7398   coding->produced = coding->produced_char = 0;
7399   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7400   coding->errors = 0;
7401
7402   ALLOC_CONVERSION_WORK_AREA (coding);
7403
7404   if (coding->encoder == encode_coding_ccl)
7405     {
7406       coding->spec.ccl = &cclspec;
7407       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7408     }
7409   do {
7410     coding_set_source (coding);
7411     consume_chars (coding, translation_table, max_lookup);
7412     coding_set_destination (coding);
7413     (*(coding->encoder)) (coding);
7414   } while (coding->consumed_char < coding->src_chars);
7415
7416   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7417     insert_from_gap (coding->produced_char, coding->produced);
7418
7419   return (coding->result);
7420 }
7421
7422
7423 /* Name (or base name) of work buffer for code conversion.  */
7424 static Lisp_Object Vcode_conversion_workbuf_name;
7425
7426 /* A working buffer used by the top level conversion.  Once it is
7427    created, it is never destroyed.  It has the name
7428    Vcode_conversion_workbuf_name.  The other working buffers are
7429    destroyed after the use is finished, and their names are modified
7430    versions of Vcode_conversion_workbuf_name.  */
7431 static Lisp_Object Vcode_conversion_reused_workbuf;
7432
7433 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
7434 static int reused_workbuf_in_use;
7435
7436
7437 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7438    multibyteness of returning buffer.  */
7439
7440 static Lisp_Object
7441 make_conversion_work_buffer (int multibyte)
7442 {
7443   Lisp_Object name, workbuf;
7444   struct buffer *current;
7445
7446   if (reused_workbuf_in_use++)
7447     {
7448       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7449       workbuf = Fget_buffer_create (name);
7450     }
7451   else
7452     {
7453       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7454         Vcode_conversion_reused_workbuf
7455           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7456       workbuf = Vcode_conversion_reused_workbuf;
7457     }
7458   current = current_buffer;
7459   set_buffer_internal (XBUFFER (workbuf));
7460   /* We can't allow modification hooks to run in the work buffer.  For
7461      instance, directory_files_internal assumes that file decoding
7462      doesn't compile new regexps.  */
7463   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7464   Ferase_buffer ();
7465   BVAR (current_buffer, undo_list) = Qt;
7466   BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
7467   set_buffer_internal (current);
7468   return workbuf;
7469 }
7470
7471
7472 static Lisp_Object
7473 code_conversion_restore (Lisp_Object arg)
7474 {
7475   Lisp_Object current, workbuf;
7476   struct gcpro gcpro1;
7477
7478   GCPRO1 (arg);
7479   current = XCAR (arg);
7480   workbuf = XCDR (arg);
7481   if (! NILP (workbuf))
7482     {
7483       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7484         reused_workbuf_in_use = 0;
7485       else if (! NILP (Fbuffer_live_p (workbuf)))
7486         Fkill_buffer (workbuf);
7487     }
7488   set_buffer_internal (XBUFFER (current));
7489   UNGCPRO;
7490   return Qnil;
7491 }
7492
7493 Lisp_Object
7494 code_conversion_save (int with_work_buf, int multibyte)
7495 {
7496   Lisp_Object workbuf = Qnil;
7497
7498   if (with_work_buf)
7499     workbuf = make_conversion_work_buffer (multibyte);
7500   record_unwind_protect (code_conversion_restore,
7501                          Fcons (Fcurrent_buffer (), workbuf));
7502   return workbuf;
7503 }
7504
7505 int
7506 decode_coding_gap (struct coding_system *coding,
7507                    EMACS_INT chars, EMACS_INT bytes)
7508 {
7509   int count = SPECPDL_INDEX ();
7510   Lisp_Object attrs;
7511
7512   code_conversion_save (0, 0);
7513
7514   coding->src_object = Fcurrent_buffer ();
7515   coding->src_chars = chars;
7516   coding->src_bytes = bytes;
7517   coding->src_pos = -chars;
7518   coding->src_pos_byte = -bytes;
7519   coding->src_multibyte = chars < bytes;
7520   coding->dst_object = coding->src_object;
7521   coding->dst_pos = PT;
7522   coding->dst_pos_byte = PT_BYTE;
7523   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7524
7525   if (CODING_REQUIRE_DETECTION (coding))
7526     detect_coding (coding);
7527
7528   coding->mode |= CODING_MODE_LAST_BLOCK;
7529   current_buffer->text->inhibit_shrinking = 1;
7530   decode_coding (coding);
7531   current_buffer->text->inhibit_shrinking = 0;
7532
7533   attrs = CODING_ID_ATTRS (coding->id);
7534   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7535     {
7536       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7537       Lisp_Object val;
7538
7539       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7540       val = call1 (CODING_ATTR_POST_READ (attrs),
7541                    make_number (coding->produced_char));
7542       CHECK_NATNUM (val);
7543       coding->produced_char += Z - prev_Z;
7544       coding->produced += Z_BYTE - prev_Z_BYTE;
7545     }
7546
7547   unbind_to (count, Qnil);
7548   return coding->result;
7549 }
7550
7551 int
7552 encode_coding_gap (struct coding_system *coding,
7553                    EMACS_INT chars, EMACS_INT bytes)
7554 {
7555   int count = SPECPDL_INDEX ();
7556
7557   code_conversion_save (0, 0);
7558
7559   coding->src_object = Fcurrent_buffer ();
7560   coding->src_chars = chars;
7561   coding->src_bytes = bytes;
7562   coding->src_pos = -chars;
7563   coding->src_pos_byte = -bytes;
7564   coding->src_multibyte = chars < bytes;
7565   coding->dst_object = coding->src_object;
7566   coding->dst_pos = PT;
7567   coding->dst_pos_byte = PT_BYTE;
7568
7569   encode_coding (coding);
7570
7571   unbind_to (count, Qnil);
7572   return coding->result;
7573 }
7574
7575
7576 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7577    SRC_OBJECT into DST_OBJECT by coding context CODING.
7578
7579    SRC_OBJECT is a buffer, a string, or Qnil.
7580
7581    If it is a buffer, the text is at point of the buffer.  FROM and TO
7582    are positions in the buffer.
7583
7584    If it is a string, the text is at the beginning of the string.
7585    FROM and TO are indices to the string.
7586
7587    If it is nil, the text is at coding->source.  FROM and TO are
7588    indices to coding->source.
7589
7590    DST_OBJECT is a buffer, Qt, or Qnil.
7591
7592    If it is a buffer, the decoded text is inserted at point of the
7593    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7594    is deleted.
7595
7596    If it is Qt, a string is made from the decoded text, and
7597    set in CODING->dst_object.
7598
7599    If it is Qnil, the decoded text is stored at CODING->destination.
7600    The caller must allocate CODING->dst_bytes bytes at
7601    CODING->destination by xmalloc.  If the decoded text is longer than
7602    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7603  */
7604
7605 void
7606 decode_coding_object (struct coding_system *coding,
7607                       Lisp_Object src_object,
7608                       EMACS_INT from, EMACS_INT from_byte,
7609                       EMACS_INT to, EMACS_INT to_byte,
7610                       Lisp_Object dst_object)
7611 {
7612   int count = SPECPDL_INDEX ();
7613   unsigned char *destination IF_LINT (= NULL);
7614   EMACS_INT dst_bytes IF_LINT (= 0);
7615   EMACS_INT chars = to - from;
7616   EMACS_INT bytes = to_byte - from_byte;
7617   Lisp_Object attrs;
7618   int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7619   int need_marker_adjustment = 0;
7620   Lisp_Object old_deactivate_mark;
7621
7622   old_deactivate_mark = Vdeactivate_mark;
7623
7624   if (NILP (dst_object))
7625     {
7626       destination = coding->destination;
7627       dst_bytes = coding->dst_bytes;
7628     }
7629
7630   coding->src_object = src_object;
7631   coding->src_chars = chars;
7632   coding->src_bytes = bytes;
7633   coding->src_multibyte = chars < bytes;
7634
7635   if (STRINGP (src_object))
7636     {
7637       coding->src_pos = from;
7638       coding->src_pos_byte = from_byte;
7639     }
7640   else if (BUFFERP (src_object))
7641     {
7642       set_buffer_internal (XBUFFER (src_object));
7643       if (from != GPT)
7644         move_gap_both (from, from_byte);
7645       if (EQ (src_object, dst_object))
7646         {
7647           struct Lisp_Marker *tail;
7648
7649           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7650             {
7651               tail->need_adjustment
7652                 = tail->charpos == (tail->insertion_type ? from : to);
7653               need_marker_adjustment |= tail->need_adjustment;
7654             }
7655           saved_pt = PT, saved_pt_byte = PT_BYTE;
7656           TEMP_SET_PT_BOTH (from, from_byte);
7657           current_buffer->text->inhibit_shrinking = 1;
7658           del_range_both (from, from_byte, to, to_byte, 1);
7659           coding->src_pos = -chars;
7660           coding->src_pos_byte = -bytes;
7661         }
7662       else
7663         {
7664           coding->src_pos = from;
7665           coding->src_pos_byte = from_byte;
7666         }
7667     }
7668
7669   if (CODING_REQUIRE_DETECTION (coding))
7670     detect_coding (coding);
7671   attrs = CODING_ID_ATTRS (coding->id);
7672
7673   if (EQ (dst_object, Qt)
7674       || (! NILP (CODING_ATTR_POST_READ (attrs))
7675           && NILP (dst_object)))
7676     {
7677       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7678       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7679       coding->dst_pos = BEG;
7680       coding->dst_pos_byte = BEG_BYTE;
7681     }
7682   else if (BUFFERP (dst_object))
7683     {
7684       code_conversion_save (0, 0);
7685       coding->dst_object = dst_object;
7686       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7687       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7688       coding->dst_multibyte
7689         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7690     }
7691   else
7692     {
7693       code_conversion_save (0, 0);
7694       coding->dst_object = Qnil;
7695       /* Most callers presume this will return a multibyte result, and they
7696          won't use `binary' or `raw-text' anyway, so let's not worry about
7697          CODING_FOR_UNIBYTE.  */
7698       coding->dst_multibyte = 1;
7699     }
7700
7701   decode_coding (coding);
7702
7703   if (BUFFERP (coding->dst_object))
7704     set_buffer_internal (XBUFFER (coding->dst_object));
7705
7706   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7707     {
7708       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7709       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7710       Lisp_Object val;
7711
7712       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7713       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7714               old_deactivate_mark);
7715       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7716                         make_number (coding->produced_char));
7717       UNGCPRO;
7718       CHECK_NATNUM (val);
7719       coding->produced_char += Z - prev_Z;
7720       coding->produced += Z_BYTE - prev_Z_BYTE;
7721     }
7722
7723   if (EQ (dst_object, Qt))
7724     {
7725       coding->dst_object = Fbuffer_string ();
7726     }
7727   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7728     {
7729       set_buffer_internal (XBUFFER (coding->dst_object));
7730       if (dst_bytes < coding->produced)
7731         {
7732           destination = xrealloc (destination, coding->produced);
7733           if (! destination)
7734             {
7735               record_conversion_result (coding,
7736                                         CODING_RESULT_INSUFFICIENT_MEM);
7737               unbind_to (count, Qnil);
7738               return;
7739             }
7740           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7741             move_gap_both (BEGV, BEGV_BYTE);
7742           memcpy (destination, BEGV_ADDR, coding->produced);
7743           coding->destination = destination;
7744         }
7745     }
7746
7747   if (saved_pt >= 0)
7748     {
7749       /* This is the case of:
7750          (BUFFERP (src_object) && EQ (src_object, dst_object))
7751          As we have moved PT while replacing the original buffer
7752          contents, we must recover it now.  */
7753       set_buffer_internal (XBUFFER (src_object));
7754       current_buffer->text->inhibit_shrinking = 0;
7755       if (saved_pt < from)
7756         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7757       else if (saved_pt < from + chars)
7758         TEMP_SET_PT_BOTH (from, from_byte);
7759       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7760         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7761                           saved_pt_byte + (coding->produced - bytes));
7762       else
7763         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7764                           saved_pt_byte + (coding->produced - bytes));
7765
7766       if (need_marker_adjustment)
7767         {
7768           struct Lisp_Marker *tail;
7769
7770           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7771             if (tail->need_adjustment)
7772               {
7773                 tail->need_adjustment = 0;
7774                 if (tail->insertion_type)
7775                   {
7776                     tail->bytepos = from_byte;
7777                     tail->charpos = from;
7778                   }
7779                 else
7780                   {
7781                     tail->bytepos = from_byte + coding->produced;
7782                     tail->charpos
7783                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7784                          ? tail->bytepos : from + coding->produced_char);
7785                   }
7786               }
7787         }
7788     }
7789
7790   Vdeactivate_mark = old_deactivate_mark;
7791   unbind_to (count, coding->dst_object);
7792 }
7793
7794
7795 void
7796 encode_coding_object (struct coding_system *coding,
7797                       Lisp_Object src_object,
7798                       EMACS_INT from, EMACS_INT from_byte,
7799                       EMACS_INT to, EMACS_INT to_byte,
7800                       Lisp_Object dst_object)
7801 {
7802   int count = SPECPDL_INDEX ();
7803   EMACS_INT chars = to - from;
7804   EMACS_INT bytes = to_byte - from_byte;
7805   Lisp_Object attrs;
7806   int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7807   int need_marker_adjustment = 0;
7808   int kill_src_buffer = 0;
7809   Lisp_Object old_deactivate_mark;
7810
7811   old_deactivate_mark = Vdeactivate_mark;
7812
7813   coding->src_object = src_object;
7814   coding->src_chars = chars;
7815   coding->src_bytes = bytes;
7816   coding->src_multibyte = chars < bytes;
7817
7818   attrs = CODING_ID_ATTRS (coding->id);
7819
7820   if (EQ (src_object, dst_object))
7821     {
7822       struct Lisp_Marker *tail;
7823
7824       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7825         {
7826           tail->need_adjustment
7827             = tail->charpos == (tail->insertion_type ? from : to);
7828           need_marker_adjustment |= tail->need_adjustment;
7829         }
7830     }
7831
7832   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7833     {
7834       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7835       set_buffer_internal (XBUFFER (coding->src_object));
7836       if (STRINGP (src_object))
7837         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7838       else if (BUFFERP (src_object))
7839         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7840       else
7841         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7842
7843       if (EQ (src_object, dst_object))
7844         {
7845           set_buffer_internal (XBUFFER (src_object));
7846           saved_pt = PT, saved_pt_byte = PT_BYTE;
7847           del_range_both (from, from_byte, to, to_byte, 1);
7848           set_buffer_internal (XBUFFER (coding->src_object));
7849         }
7850
7851       {
7852         Lisp_Object args[3];
7853         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7854
7855         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7856                 old_deactivate_mark);
7857         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7858         args[1] = make_number (BEG);
7859         args[2] = make_number (Z);
7860         safe_call (3, args);
7861         UNGCPRO;
7862       }
7863       if (XBUFFER (coding->src_object) != current_buffer)
7864         kill_src_buffer = 1;
7865       coding->src_object = Fcurrent_buffer ();
7866       if (BEG != GPT)
7867         move_gap_both (BEG, BEG_BYTE);
7868       coding->src_chars = Z - BEG;
7869       coding->src_bytes = Z_BYTE - BEG_BYTE;
7870       coding->src_pos = BEG;
7871       coding->src_pos_byte = BEG_BYTE;
7872       coding->src_multibyte = Z < Z_BYTE;
7873     }
7874   else if (STRINGP (src_object))
7875     {
7876       code_conversion_save (0, 0);
7877       coding->src_pos = from;
7878       coding->src_pos_byte = from_byte;
7879     }
7880   else if (BUFFERP (src_object))
7881     {
7882       code_conversion_save (0, 0);
7883       set_buffer_internal (XBUFFER (src_object));
7884       if (EQ (src_object, dst_object))
7885         {
7886           saved_pt = PT, saved_pt_byte = PT_BYTE;
7887           coding->src_object = del_range_1 (from, to, 1, 1);
7888           coding->src_pos = 0;
7889           coding->src_pos_byte = 0;
7890         }
7891       else
7892         {
7893           if (from < GPT && to >= GPT)
7894             move_gap_both (from, from_byte);
7895           coding->src_pos = from;
7896           coding->src_pos_byte = from_byte;
7897         }
7898     }
7899   else
7900     code_conversion_save (0, 0);
7901
7902   if (BUFFERP (dst_object))
7903     {
7904       coding->dst_object = dst_object;
7905       if (EQ (src_object, dst_object))
7906         {
7907           coding->dst_pos = from;
7908           coding->dst_pos_byte = from_byte;
7909         }
7910       else
7911         {
7912           struct buffer *current = current_buffer;
7913
7914           set_buffer_temp (XBUFFER (dst_object));
7915           coding->dst_pos = PT;
7916           coding->dst_pos_byte = PT_BYTE;
7917           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7918           set_buffer_temp (current);
7919         }
7920       coding->dst_multibyte
7921         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7922     }
7923   else if (EQ (dst_object, Qt))
7924     {
7925       coding->dst_object = Qnil;
7926       coding->dst_bytes = coding->src_chars;
7927       if (coding->dst_bytes == 0)
7928         coding->dst_bytes = 1;
7929       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7930       coding->dst_multibyte = 0;
7931     }
7932   else
7933     {
7934       coding->dst_object = Qnil;
7935       coding->dst_multibyte = 0;
7936     }
7937
7938   encode_coding (coding);
7939
7940   if (EQ (dst_object, Qt))
7941     {
7942       if (BUFFERP (coding->dst_object))
7943         coding->dst_object = Fbuffer_string ();
7944       else
7945         {
7946           coding->dst_object
7947             = make_unibyte_string ((char *) coding->destination,
7948                                    coding->produced);
7949           xfree (coding->destination);
7950         }
7951     }
7952
7953   if (saved_pt >= 0)
7954     {
7955       /* This is the case of:
7956          (BUFFERP (src_object) && EQ (src_object, dst_object))
7957          As we have moved PT while replacing the original buffer
7958          contents, we must recover it now.  */
7959       set_buffer_internal (XBUFFER (src_object));
7960       if (saved_pt < from)
7961         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7962       else if (saved_pt < from + chars)
7963         TEMP_SET_PT_BOTH (from, from_byte);
7964       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7965         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7966                           saved_pt_byte + (coding->produced - bytes));
7967       else
7968         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7969                           saved_pt_byte + (coding->produced - bytes));
7970
7971       if (need_marker_adjustment)
7972         {
7973           struct Lisp_Marker *tail;
7974
7975           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7976             if (tail->need_adjustment)
7977               {
7978                 tail->need_adjustment = 0;
7979                 if (tail->insertion_type)
7980                   {
7981                     tail->bytepos = from_byte;
7982                     tail->charpos = from;
7983                   }
7984                 else
7985                   {
7986                     tail->bytepos = from_byte + coding->produced;
7987                     tail->charpos
7988                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7989                          ? tail->bytepos : from + coding->produced_char);
7990                   }
7991               }
7992         }
7993     }
7994
7995   if (kill_src_buffer)
7996     Fkill_buffer (coding->src_object);
7997
7998   Vdeactivate_mark = old_deactivate_mark;
7999   unbind_to (count, Qnil);
8000 }
8001
8002 /* Return the name of the coding system registered for the category
        that comes first in the priority order (coding_priorities[0]).  */
8003 Lisp_Object
8004 preferred_coding_system (void)
8005 {
8006   struct coding_system *top = coding_categories + coding_priorities[0];
8007
8008   return CODING_ID_NAME (top->id);
8009 }
8010
8011 \f
8012 #ifdef emacs
8013 /*** 8. Emacs Lisp library functions ***/
8014
8015 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8016        doc: /* Return t if OBJECT is nil or a coding-system.
8017 See the documentation of `define-coding-system' for information
8018 about coding-system objects.  */)
8019   (Lisp_Object object)
8020 {
8021   /* nil and anything that already has a coding-system id qualify.  */
8022   if (NILP (object) || CODING_SYSTEM_ID (object) >= 0)
8023     return Qt;
8024
8025   /* Otherwise accept only a symbol that still carries a definition
8026      form under the Qcoding_system_define_form property.  */
8027   return ((SYMBOLP (object)
                && ! NILP (Fget (object, Qcoding_system_define_form)))
               ? Qt : Qnil);
8028 }
8029
8030 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8031        Sread_non_nil_coding_system, 1, 1, 0,
8032        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8033   (Lisp_Object prompt)
8034 {
8035   Lisp_Object name;
8036   for (;;)                      /* Leave only once a non-empty name is read.  */
8037     {
8038       name = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
8039                                Qcoding_system_history, Qnil, Qnil);
8040       if (SCHARS (name) != 0)
8041         return Fintern (name, Qnil);
8042     }
8043 }
8044
8045 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8046        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8047 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8048 Ignores case when completing coding systems (all Emacs coding systems
8049 are lower-case).  */)
8050   (Lisp_Object prompt, Lisp_Object default_coding_system)
8051 {
8052   int specpdl_mark = SPECPDL_INDEX ();
8053   Lisp_Object input;
8054
8055   /* Hand the default to the reader as a string: the symbol's name.  */
8056   if (SYMBOLP (default_coding_system))
8057     default_coding_system = SYMBOL_NAME (default_coding_system);
8058   specbind (Qcompletion_ignore_case, Qt);  /* Undone by unbind_to below.  */
8059   input = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
8060                             Qcoding_system_history, default_coding_system, Qnil);
8061   unbind_to (specpdl_mark, Qnil);
8062   return SCHARS (input) != 0 ? Fintern (input, Qnil) : Qnil;
8063 }
8064
8065 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8066        1, 1, 0,
8067        doc: /* Check validity of CODING-SYSTEM.
8068 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8069 It is valid if it is nil or a symbol defined as a coding system by the
8070 function `define-coding-system'.  */)
8071   (Lisp_Object coding_system)
8072 {
8073   Lisp_Object pending = Fget (coding_system, Qcoding_system_define_form);
8074
8075   /* Evaluate a stored definition form after clearing its property.  */
8076   if (! NILP (pending))
8077     {
8078       Fput (coding_system, Qcoding_system_define_form, Qnil);
8079       safe_eval (pending);
8080     }
8081   if (NILP (Fcoding_system_p (coding_system)))
8082     xsignal1 (Qcoding_system_error, coding_system);
8083   return coding_system;
8084 }
8085
8086 \f
8087 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8088    HIGHEST is nonzero, return the coding system of the highest
8089    priority among the detected coding systems.  Otherwise return a
8090    list of detected coding systems sorted by their priorities.  If
8091    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8092    multibyte form but contain only ASCII and eight-bit chars.
8093    Otherwise, the bytes are raw bytes.  SRC_CHARS is the number of
        characters those bytes represent.
8094
8095    CODING-SYSTEM controls the detection as below:
8096
8097    If it is nil, detect both text-format and eol-format.  If the
8098    text-format part of CODING-SYSTEM is already specified
8099    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8100    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8101    detect only text-format.  With HIGHEST nonzero the value is nil
        when no coding system was detected.  */
8102
8103 Lisp_Object
8104 detect_coding_system (const unsigned char *src,
8105                       EMACS_INT src_chars, EMACS_INT src_bytes,
8106                       int highest, int multibytep,
8107                       Lisp_Object coding_system)
8108 {
8109   const unsigned char *src_end = src + src_bytes;
8110   Lisp_Object attrs, eol_type;
8111   Lisp_Object val = Qnil;
8112   struct coding_system coding;
8113   int id;
8114   struct coding_detection_info detect_info;
8115   enum coding_category base_category;
8116   int null_byte_found = 0, eight_bit_found = 0;
8117
       /* A nil CODING-SYSTEM requests full detection, i.e. Qundecided.  */
8118   if (NILP (coding_system))
8119     coding_system = Qundecided;
8120   setup_coding_system (coding_system, &coding);
8121   attrs = CODING_ID_ATTRS (coding.id);
8122   eol_type = CODING_ID_EOL_TYPE (coding.id);
8123   coding_system = CODING_ATTR_BASE_NAME (attrs);
8124
       /* Describe the source bytes in the working coding structure that
          is handed to the detector functions below.  */
8125   coding.source = src;
8126   coding.src_chars = src_chars;
8127   coding.src_bytes = src_bytes;
8128   coding.src_multibyte = multibytep;
8129   coding.consumed = 0;
8130   coding.mode |= CODING_MODE_LAST_BLOCK;
8131   coding.head_ascii = 0;
8132
8133   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8134
8135   /* At first, detect text-format if necessary.  */
8136   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8137   if (base_category == coding_category_undecided)
8138     {
8139       enum coding_category category IF_LINT (= 0);
8140       struct coding_system *this IF_LINT (= NULL);
8141       int c, i;
8142
8143       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8144       for (; src < src_end; src++)
8145         {
8146           c = *src;
8147           if (c & 0x80)
8148             {
8149               eight_bit_found = 1;
                   /* Both a null byte and an eight-bit byte were seen:
                      stop scanning.  */
8150               if (null_byte_found)
8151                 break;
8152             }
8153           else if (c < 0x20)
8154             {
8155               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8156                   && ! inhibit_iso_escape_detection
8157                   && ! detect_info.checked)
8158                 {
8159                   if (detect_coding_iso_2022 (&coding, &detect_info))
8160                     {
8161                       /* We have scanned the whole data.  */
8162                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8163                         {
8164                           /* We didn't find an 8-bit code.  We may
8165                              have found a null-byte, but it's very
8166                              rare that a binary file conform to
8167                              ISO-2022.  */
8168                           src = src_end;
8169                           coding.head_ascii = src - coding.source;
8170                         }
8171                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8172                       break;
8173                     }
8174                 }
8175               else if (! c && !inhibit_null_byte_detection)
8176                 {
8177                   null_byte_found = 1;
                       /* Same early exit as in the eight-bit branch.  */
8178                   if (eight_bit_found)
8179                     break;
8180                 }
                   /* head_ascii grows only until the first eight-bit
                      byte is met.  */
8181               if (! eight_bit_found)
8182                 coding.head_ascii++;
8183             }
8184           else if (! eight_bit_found)
8185             coding.head_ascii++;
8186         }
8187
           /* Category detection is skipped only when the scan met nothing
              noteworthy: no null or eight-bit byte, every byte counted in
              head_ascii, and no category found yet.  */
8188       if (null_byte_found || eight_bit_found
8189           || coding.head_ascii < coding.src_bytes
8190           || detect_info.found)
8191         {
8192           if (coding.head_ascii == coding.src_bytes)
8193             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8194             for (i = 0; i < coding_category_raw_text; i++)
8195               {
8196                 category = coding_priorities[i];
8197                 this = coding_categories + category;
8198                 if (detect_info.found & (1 << category))
8199                   break;
8200               }
8201           else
8202             {
                   /* With a null byte present, mark every category other
                      than the UTF-16 ones as checked and rejected.  */
8203               if (null_byte_found)
8204                 {
8205                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8206                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8207                 }
8208               for (i = 0; i < coding_category_raw_text; i++)
8209                 {
8210                   category = coding_priorities[i];
8211                   this = coding_categories + category;
8212
8213                   if (this->id < 0)
8214                     {
8215                       /* No coding system of this category is defined.  */
8216                       detect_info.rejected |= (1 << category);
8217                     }
8218                   else if (category >= coding_category_raw_text)
8219                     continue;
                       /* This category's `checked' bit is already set; when
                          only the best match is wanted, stop if it was
                          found.  */
8220                   else if (detect_info.checked & (1 << category))
8221                     {
8222                       if (highest
8223                           && (detect_info.found & (1 << category)))
8224                         break;
8225                     }
8226                   else if ((*(this->detector)) (&coding, &detect_info)
8227                            && highest
8228                            && (detect_info.found & (1 << category)))
8229                     {
                           /* Narrow the auto category to the byte order
                              recorded in detect_info.found.  */
8230                       if (category == coding_category_utf_16_auto)
8231                         {
8232                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8233                             category = coding_category_utf_16_le;
8234                           else
8235                             category = coding_category_utf_16_be;
8236                         }
8237                       break;
8238                     }
8239                 }
8240             }
8241         }
8242
           /* Every category rejected, or a null byte seen: report the
              coding system Qno_conversion.  */
8243       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8244           || null_byte_found)
8245         {
8246           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8247           id = CODING_SYSTEM_ID (Qno_conversion);
8248           val = Fcons (make_number (id), Qnil);
8249         }
           /* Nothing rejected and nothing found: report the coding system
              of the undecided category.  */
8250       else if (! detect_info.rejected && ! detect_info.found)
8251         {
8252           detect_info.found = CATEGORY_MASK_ANY;
8253           id = coding_categories[coding_category_undecided].id;
8254           val = Fcons (make_number (id), Qnil);
8255         }
8256       else if (highest)
8257         {
8258           if (detect_info.found)
8259             {
                   /* NOTE(review): uses CATEGORY and THIS exactly as the
                      detection loop above left them.  */
8260               detect_info.found = 1 << category;
8261               val = Fcons (make_number (this->id), Qnil);
8262             }
8263           else
                 /* Nothing positively found: take the highest-priority
                    category that was not rejected.  */
8264             for (i = 0; i < coding_category_raw_text; i++)
8265               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8266                 {
8267                   detect_info.found = 1 << coding_priorities[i];
8268                   id = coding_categories[coding_priorities[i]].id;
8269                   val = Fcons (make_number (id), Qnil);
8270                   break;
8271                 }
8272         }
8273       else
8274         {
               /* Build the full list.  Both loops walk from the lowest to
                  the highest priority and cons onto the front, so VAL ends
                  up in priority order with the found categories ahead of
                  those that were merely not rejected.  */
8275           int mask = detect_info.rejected | detect_info.found;
8276           int found = 0;
8277
8278           for (i = coding_category_raw_text - 1; i >= 0; i--)
8279             {
8280               category = coding_priorities[i];
8281               if (! (mask & (1 << category)))
8282                 {
8283                   found |= 1 << category;
8284                   id = coding_categories[category].id;
8285                   if (id >= 0)
8286                     val = Fcons (make_number (id), val);
8287                 }
8288             }
8289           for (i = coding_category_raw_text - 1; i >= 0; i--)
8290             {
8291               category = coding_priorities[i];
8292               if (detect_info.found & (1 << category))
8293                 {
8294                   id = coding_categories[category].id;
8295                   val = Fcons (make_number (id), val);
8296                 }
8297             }
8298           detect_info.found |= found;
8299         }
8300     }
       /* The base category is utf_8_auto: choose only between the `sig'
          and `nosig' UTF-8 categories.  */
8301   else if (base_category == coding_category_utf_8_auto)
8302     {
8303       if (detect_coding_utf_8 (&coding, &detect_info))
8304         {
8305           struct coding_system *this;
8306
8307           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8308             this = coding_categories + coding_category_utf_8_sig;
8309           else
8310             this = coding_categories + coding_category_utf_8_nosig;
8311           val = Fcons (make_number (this->id), Qnil);
8312         }
8313     }
       /* The base category is utf_16_auto: choose among the LE/BE
          categories, with or without `nosig'.  */
8314   else if (base_category == coding_category_utf_16_auto)
8315     {
8316       if (detect_coding_utf_16 (&coding, &detect_info))
8317         {
8318           struct coding_system *this;
8319
8320           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8321             this = coding_categories + coding_category_utf_16_le;
8322           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8323             this = coding_categories + coding_category_utf_16_be;
8324           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8325             this = coding_categories + coding_category_utf_16_be_nosig;
8326           else
8327             this = coding_categories + coding_category_utf_16_le_nosig;
8328           val = Fcons (make_number (this->id), Qnil);
8329         }
8330     }
       /* Any other base category: VAL is just the given coding system.  */
8331   else
8332     {
8333       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8334       val = Fcons (make_number (coding.id), Qnil);
8335     }
8336
8337   /* Then, detect eol-format if necessary.  */
8338   {
8339     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8340     Lisp_Object tail;
8341
8342     if (VECTORP (eol_type))
8343       {
             /* Some category outside the UTF-16 mask is a candidate.  */
8344         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8345           {
                 /* Data containing a null byte is given LF without
                    scanning it.  */
8346             if (null_byte_found)
8347               normal_eol = EOL_SEEN_LF;
8348             else
8349               normal_eol = detect_eol (coding.source, src_bytes,
8350                                        coding_category_raw_text);
8351           }
8352         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8353                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8354           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8355                                       coding_category_utf_16_be);
8356         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8357                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8358           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8359                                       coding_category_utf_16_le);
8360       }
8361     else
8362       {
             /* The eol format is fixed by CODING-SYSTEM; use it for every
                candidate.  */
8363         if (EQ (eol_type, Qunix))
8364           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8365         else if (EQ (eol_type, Qdos))
8366           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8367         else
8368           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8369       }
8370
         /* Replace each coding-system id in VAL by a coding-system name:
            the element of the eol-type vector matching the eol format that
            was seen, else the name registered for the id.  */
8371     for (tail = val; CONSP (tail); tail = XCDR (tail))
8372       {
8373         enum coding_category category;
8374         int this_eol;
8375
8376         id = XINT (XCAR (tail));
8377         attrs = CODING_ID_ATTRS (id);
8378         category = XINT (CODING_ATTR_CATEGORY (attrs));
8379         eol_type = CODING_ID_EOL_TYPE (id);
8380         if (VECTORP (eol_type))
8381           {
8382             if (category == coding_category_utf_16_be
8383                 || category == coding_category_utf_16_be_nosig)
8384               this_eol = utf_16_be_eol;
8385             else if (category == coding_category_utf_16_le
8386                      || category == coding_category_utf_16_le_nosig)
8387               this_eol = utf_16_le_eol;
8388             else
8389               this_eol = normal_eol;
8390
8391             if (this_eol == EOL_SEEN_LF)
8392               XSETCAR (tail, AREF (eol_type, 0));
8393             else if (this_eol == EOL_SEEN_CRLF)
8394               XSETCAR (tail, AREF (eol_type, 1));
8395             else if (this_eol == EOL_SEEN_CR)
8396               XSETCAR (tail, AREF (eol_type, 2));
8397             else
8398               XSETCAR (tail, CODING_ID_NAME (id));
8399           }
8400         else
8401           XSETCAR (tail, CODING_ID_NAME (id));
8402       }
8403   }
8404
8405   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8406 }
8407
8408
8409 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8410        2, 3, 0,
8411        doc: /* Detect coding system of the text in the region between START and END.
8412 Return a list of possible coding systems ordered by priority.
8413 The coding systems to try and their priorities follows what
8414 the function `coding-system-priority-list' (which see) returns.
8415
8416 If only ASCII characters are found (except for such ISO-2022 control
8417 characters as ESC), it returns a list of single element `undecided'
8418 or its subsidiary coding system according to a detected end-of-line
8419 format.
8420
8421 If optional argument HIGHEST is non-nil, return the coding system of
8422 highest priority.  */)
8423   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8424 {
       /* Buffer positions are kept in EMACS_INT, the type XINT yields and
          detect_coding_system accepts; `int' would truncate them wherever
          EMACS_INT is wider.  */
8425   EMACS_INT from, to;
8426   EMACS_INT from_byte, to_byte;
8427
8428   CHECK_NUMBER_COERCE_MARKER (start);
8429   CHECK_NUMBER_COERCE_MARKER (end);
8430
8431   validate_region (&start, &end);
8432   from = XINT (start), to = XINT (end);
8433   from_byte = CHAR_TO_BYTE (from);
8434   to_byte = CHAR_TO_BYTE (to);
8435
       /* The detector reads one contiguous run of bytes, so move the gap
          to the end of the region when it would otherwise lie inside.  */
8436   if (from < GPT && to >= GPT)
8437     move_gap_both (to, to_byte);
8438
       /* The final Qnil leaves both the text format and the eol format to
          be detected.  */
8439   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8440                                to - from, to_byte - from_byte,
8441                                !NILP (highest),
8442                                !NILP (BVAR (current_buffer
8443                                       , enable_multibyte_characters)),
8444                                Qnil);
8445 }
8446
8447 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8448        1, 2, 0,
8449        doc: /* Detect coding system of the text in STRING.
8450 Return a list of possible coding systems ordered by priority.
8451 The coding systems to try and their priorities follows what
8452 the function `coding-system-priority-list' (which see) returns.
8453
8454 If only ASCII characters are found (except for such ISO-2022 control
8455 characters as ESC), it returns a list of single element `undecided'
8456 or its subsidiary coding system according to a detected end-of-line
8457 format.
8458
8459 If optional argument HIGHEST is non-nil, return the coding system of
8460 highest priority.  */)
8461   (Lisp_Object string, Lisp_Object highest)
8462 {
8463   int want_highest = ! NILP (highest);
8464
8465   CHECK_STRING (string);
8466   /* Scan the string's own bytes.  The final Qnil leaves both the text
8467      format and the eol format to be detected.  */
8468   return detect_coding_system (SDATA (string), SCHARS (string),
                                    SBYTES (string), want_highest,
                                    STRING_MULTIBYTE (string), Qnil);
8469 }
8470
8471 /* Return nonzero if character C, after translation by the translation
        table stored in ATTRS (if any), belongs to one of the charsets in
        the charset list of ATTRS.  */
8472 static INLINE int
8473 char_encodable_p (int c, Lisp_Object attrs)
8474 {
8475   Lisp_Object table = CODING_ATTR_TRANS_TBL (attrs);
8476   Lisp_Object rest;
8477
8478   /* Test the character as mapped by the translation table, if any.  */
8479   if (! NILP (table))
8480     c = translate_char (table, c);
8481
8482   /* C is encodable as soon as one charset of ATTRS contains it.  */
8483   for (rest = CODING_ATTR_CHARSET_LIST (attrs); CONSP (rest);
8484        rest = XCDR (rest))
8485     {
8486       struct charset *cs = CHARSET_FROM_ID (XINT (XCAR (rest)));
8487       if (CHAR_CHARSET_P (c, cs))
8488         return 1;
8489     }
       /* No charset matched; REST is nil here for a proper list.  */
       return ! NILP (rest);
8490 }
8491
8492
8493 /* Return a list of coding systems that safely encode the text between
8494    START and END.  If EXCLUDE is non-nil, it is a list of coding
8495    systems not to check.  The returned list doesn't contain any such
8496    coding systems.  In any case, if the text contains only ASCII or is
8497    unibyte, return t.  START may also be a string, in which case the
        whole string is checked.  A list result always ends with
        Qraw_text and Qno_conversion.  */
8498
8499 DEFUN ("find-coding-systems-region-internal",
8500        Ffind_coding_systems_region_internal,
8501        Sfind_coding_systems_region_internal, 2, 3, 0,
8502        doc: /* Internal use only.  */)
8503   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8504 {
8505   Lisp_Object coding_attrs_list, safe_codings;
8506   EMACS_INT start_byte, end_byte;
8507   const unsigned char *p, *pbeg, *pend;
8508   int c;
8509   Lisp_Object tail, elt, work_table;
8510
8511   if (STRINGP (start))
8512     {
           /* Unibyte, or as many bytes as characters (no multibyte
              sequence): nothing needs checking.  */
8513       if (!STRING_MULTIBYTE (start)
8514           || SCHARS (start) == SBYTES (start))
8515         return Qt;
8516       start_byte = 0;
8517       end_byte = SBYTES (start);
8518     }
8519   else
8520     {
8521       CHECK_NUMBER_COERCE_MARKER (start);
8522       CHECK_NUMBER_COERCE_MARKER (end);
8523       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8524         args_out_of_range (start, end);
8525       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8526         return Qt;
8527       start_byte = CHAR_TO_BYTE (XINT (start));
8528       end_byte = CHAR_TO_BYTE (XINT (end));
           /* As many bytes as characters: same shortcut as for strings.  */
8529       if (XINT (end) - XINT (start) == end_byte - start_byte)
8530         return Qt;
8531
           /* Make the region contiguous by moving the gap to whichever
              end of it is nearer.  */
8532       if (XINT (start) < GPT && XINT (end) > GPT)
8533         {
8534           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8535             move_gap_both (XINT (start), start_byte);
8536           else
8537             move_gap_both (XINT (end), end_byte);
8538         }
8539     }
8540
       /* Collect the attributes of each coding system that is its own base
          name, is not of type Qundecided, and is not in EXCLUDE.  The
          result of get_translation_table is stored in the attributes,
          presumably the slot char_encodable_p reads back through
          CODING_ATTR_TRANS_TBL.  */
8541   coding_attrs_list = Qnil;
8542   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8543     if (NILP (exclude)
8544         || NILP (Fmemq (XCAR (tail), exclude)))
8545       {
8546         Lisp_Object attrs;
8547
8548         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8549         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8550             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8551           {
8552             ASET (attrs, coding_attr_trans_tbl,
8553                   get_translation_table (attrs, 1, NULL));
8554             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8555           }
8556       }
8557
8558   if (STRINGP (start))
8559     p = pbeg = SDATA (start);
8560   else
8561     p = pbeg = BYTE_POS_ADDR (start_byte);
8562   pend = p + (end_byte - start_byte);
8563
       /* Trim ASCII bytes from both ends of the text.  */
8564   while (p < pend && ASCII_BYTE_P (*p)) p++;
8565   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8566
       /* WORK_TABLE remembers the characters that were already tested.  */
8567   work_table = Fmake_char_table (Qnil, Qnil);
8568   while (p < pend)
8569     {
8570       if (ASCII_BYTE_P (*p))
8571         p++;
8572       else
8573         {
8574           c = STRING_CHAR_ADVANCE (p);
8575           if (!NILP (char_table_ref (work_table, c)))
8576             /* This character was already checked.  Ignore it.  */
8577             continue;
8578
               /* Drop from CODING_ATTRS_LIST every entry that cannot
                  encode C.  A cell is deleted in place by copying its
                  successor into it; the last cell, having no successor,
                  is set to nil instead.  */
8579           charset_map_loaded = 0;
8580           for (tail = coding_attrs_list; CONSP (tail);)
8581             {
8582               elt = XCAR (tail);
8583               if (NILP (elt))
8584                 tail = XCDR (tail);
8585               else if (char_encodable_p (c, elt))
8586                 tail = XCDR (tail);
8587               else if (CONSP (XCDR (tail)))
8588                 {
8589                   XSETCAR (tail, XCAR (XCDR (tail)));
8590                   XSETCDR (tail, XCDR (XCDR (tail)));
8591                 }
8592               else
8593                 {
8594                   XSETCAR (tail, Qnil);
8595                   tail = XCDR (tail);
8596                 }
8597             }
               /* NOTE(review): presumably loading a charset map can
                  relocate the text, hence P and PEND are rebuilt from
                  their offsets relative to a freshly fetched PBEG.  */
8598           if (charset_map_loaded)
8599             {
8600               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8601
8602               if (STRINGP (start))
8603                 pbeg = SDATA (start);
8604               else
8605                 pbeg = BYTE_POS_ADDR (start_byte);
8606               p = pbeg + p_offset;
8607               pend = pbeg + pend_offset;
8608             }
8609           char_table_set (work_table, c, Qt);
8610         }
8611     }
8612
       /* Entries that are still non-nil could encode every character.  */
8613   safe_codings = list2 (Qraw_text, Qno_conversion);
8614   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8615     if (! NILP (XCAR (tail)))
8616       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8617
8618   return safe_codings;
8619 }
8620
8621
8622 DEFUN ("unencodable-char-position", Funencodable_char_position,
8623        Sunencodable_char_position, 3, 5, 0,
8624        doc: /*
8625 Return position of first un-encodable character in a region.
8626 START and END specify the region and CODING-SYSTEM specifies the
8627 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8628
8629 If optional 4th argument COUNT is non-nil, it specifies at most how
8630 many un-encodable characters to search.  In this case, the value is a
8631 list of positions.
8632
8633 If optional 5th argument STRING is non-nil, it is a string to search
8634 for un-encodable characters.  In that case, START and END are indexes
8635 to the string.  */)
8636   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8637 {
       /* N, FROM and TO receive XINT values (COUNT and positions), so they
          are EMACS_INT; `int' would truncate them wherever EMACS_INT is
          wider.  */
8638   EMACS_INT n;
8639   struct coding_system coding;
8640   Lisp_Object attrs, charset_list, translation_table;
8641   Lisp_Object positions;
8642   EMACS_INT from, to;
8643   const unsigned char *p, *stop, *pend;
8644   int ascii_compatible;
8645
8646   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8647   attrs = CODING_ID_ATTRS (coding.id);
       /* For a coding system of type Qraw_text nothing is reported.  */
8648   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8649     return Qnil;
8650   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8651   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8652   translation_table = get_translation_table (attrs, 1, NULL);
8653
8654   if (NILP (string))
8655     {
8656       validate_region (&start, &end);
8657       from = XINT (start);
8658       to = XINT (end);
           /* Nothing to report for a unibyte buffer, or for text with as
              many bytes as characters under an ASCII-compatible coding
              system.  */
8659       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8660           || (ascii_compatible
8661               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8662         return Qnil;
8663       p = CHAR_POS_ADDR (from);
8664       pend = CHAR_POS_ADDR (to);
           /* If the gap lies inside the region, scan up to the gap first;
              the loop below then resumes at GAP_END_ADDR.  */
8665       if (from < GPT && to >= GPT)
8666         stop = GPT_ADDR;
8667       else
8668         stop = pend;
8669     }
8670   else
8671     {
8672       CHECK_STRING (string);
8673       CHECK_NATNUM (start);
8674       CHECK_NATNUM (end);
8675       from = XINT (start);
8676       to = XINT (end);
8677       if (from > to
8678           || to > SCHARS (string))
8679         args_out_of_range_3 (string, start, end);
8680       if (! STRING_MULTIBYTE (string))
8681         return Qnil;
8682       p = SDATA (string) + string_char_to_byte (string, from);
8683       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8684       if (ascii_compatible && (to - from) == (pend - p))
8685         return Qnil;
8686     }
8687
       /* Without COUNT, look for a single position.  A COUNT of 0 never
          reaches the `n == 0' exit below, so every position is
          collected.  */
8688   if (NILP (count))
8689     n = 1;
8690   else
8691     {
8692       CHECK_NATNUM (count);
8693       n = XINT (count);
8694     }
8695
8696   positions = Qnil;
8697   while (1)
8698     {
8699       int c;
8700
           /* Skip a run of ASCII bytes; FROM follows P's character
              position.  */
8701       if (ascii_compatible)
8702         while (p < stop && ASCII_BYTE_P (*p))
8703           p++, from++;
           /* End of the current stretch: finish, or continue after the
              gap.  */
8704       if (p >= stop)
8705         {
8706           if (p >= pend)
8707             break;
8708           stop = pend;
8709           p = GAP_END_ADDR;
8710         }
8711
8712       c = STRING_CHAR_ADVANCE (p);
8713       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8714           && ! char_charset (translate_char (translation_table, c),
8715                              charset_list, NULL))
8716         {
8717           positions = Fcons (make_number (from), positions);
8718           n--;
8719           if (n == 0)
8720             break;
8721         }
8722
8723       from++;
8724     }
8725
       /* POSITIONS was built most-recent-first.  */
8726   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8727 }
8728
8729
8730 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8731        Scheck_coding_systems_region, 3, 3, 0,
8732        doc: /* Check if the region is encodable by coding systems.
8733
8734 START and END are buffer positions specifying the region.
8735 CODING-SYSTEM-LIST is a list of coding systems to check.
8736
8737 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8738 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8739 whole region, POS0, POS1, ... are buffer positions where non-encodable
8740 characters are found.
8741
8742 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8743 value is nil.
8744
8745 START may be a string.  In that case, check if the string is
8746 encodable, and the value contains indices to the string instead of
8747 buffer positions.  END is ignored.
8748
8749 If the current buffer (or START if it is a string) is unibyte, the value
8750 is nil.  */)
8751   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8752 {
8753   Lisp_Object list;
8754   EMACS_INT start_byte, end_byte;
       /* POS is a buffer position or string index taken from XINT, so it
          is EMACS_INT; `int' would truncate it wherever EMACS_INT is
          wider.  */
8755   EMACS_INT pos;
8756   const unsigned char *p, *pbeg, *pend;
8757   int c;
8758   Lisp_Object tail, elt, attrs;
8759
8760   if (STRINGP (start))
8761     {
           /* Unibyte, or as many bytes as characters: nothing to check.  */
8762       if (!STRING_MULTIBYTE (start)
8763           || SCHARS (start) == SBYTES (start))
8764         return Qnil;
8765       start_byte = 0;
8766       end_byte = SBYTES (start);
8767       pos = 0;
8768     }
8769   else
8770     {
8771       CHECK_NUMBER_COERCE_MARKER (start);
8772       CHECK_NUMBER_COERCE_MARKER (end);
8773       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8774         args_out_of_range (start, end);
8775       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8776         return Qnil;
8777       start_byte = CHAR_TO_BYTE (XINT (start));
8778       end_byte = CHAR_TO_BYTE (XINT (end));
8779       if (XINT (end) - XINT (start) == end_byte - start_byte)
8780         return Qnil;
8781
           /* Make the region contiguous by moving the gap to whichever
              end of it is nearer.  */
8782       if (XINT (start) < GPT && XINT (end) > GPT)
8783         {
8784           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8785             move_gap_both (XINT (start), start_byte);
8786           else
8787             move_gap_both (XINT (end), end_byte);
8788         }
8789       pos = XINT (start);
8790     }
8791
       /* Build one (CODING-SYSTEM ATTRS) entry per list member; unencodable
          positions are later pushed right after ATTRS.  */
8792   list = Qnil;
8793   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8794     {
8795       elt = XCAR (tail);
           /* A member that is not a defined coding system has no spec
              vector; signal an error rather than index a non-vector.  */
           attrs = CODING_SYSTEM_SPEC (elt);
           if (! VECTORP (attrs))
             xsignal1 (Qcoding_system_error, elt);
8796       attrs = AREF (attrs, 0);
8797       ASET (attrs, coding_attr_trans_tbl,
8798             get_translation_table (attrs, 1, NULL));
8799       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8800     }
8801
8802   if (STRINGP (start))
8803     p = pbeg = SDATA (start);
8804   else
8805     p = pbeg = BYTE_POS_ADDR (start_byte);
8806   pend = p + (end_byte - start_byte);
8807
       /* Trim ASCII bytes from both ends; POS follows P.  */
8808   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8809   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8810
8811   while (p < pend)
8812     {
8813       if (ASCII_BYTE_P (*p))
8814         p++;
8815       else
8816         {
8817           c = STRING_CHAR_ADVANCE (p);
8818
               /* Record POS on the entry of each coding system that
                  cannot encode C.  */
8819           charset_map_loaded = 0;
8820           for (tail = list; CONSP (tail); tail = XCDR (tail))
8821             {
8822               elt = XCDR (XCAR (tail));
8823               if (! char_encodable_p (c, XCAR (elt)))
8824                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8825             }
               /* NOTE(review): presumably loading a charset map can
                  relocate the text, hence P and PEND are rebuilt from
                  their offsets relative to a freshly fetched PBEG.  */
8826           if (charset_map_loaded)
8827             {
8828               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8829
8830               if (STRINGP (start))
8831                 pbeg = SDATA (start);
8832               else
8833                 pbeg = BYTE_POS_ADDR (start_byte);
8834               p = pbeg + p_offset;
8835               pend = pbeg + pend_offset;
8836             }
8837         }
8838       pos++;
8839     }
8840
       /* Keep only the coding systems that recorded at least one position.
          Positions were pushed most-recent-first, hence Fnreverse.  */
8841   tail = list;
8842   list = Qnil;
8843   for (; CONSP (tail); tail = XCDR (tail))
8844     {
8845       elt = XCAR (tail);
8846       if (CONSP (XCDR (XCDR (elt))))
8847         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8848                       list);
8849     }
8850
8851   return list;
8852 }
8853
8854
8855 Lisp_Object
8856 code_convert_region (Lisp_Object start, Lisp_Object end,
8857                      Lisp_Object coding_system, Lisp_Object dst_object,
8858                      int encodep, int norecord)
8859 {
8860   struct coding_system coding;
8861   EMACS_INT from, from_byte, to, to_byte;
8862   Lisp_Object src_object;
8863
8864   CHECK_NUMBER_COERCE_MARKER (start);
8865   CHECK_NUMBER_COERCE_MARKER (end);
8866   if (NILP (coding_system))
8867     coding_system = Qno_conversion;
8868   else
8869     CHECK_CODING_SYSTEM (coding_system);
8870   src_object = Fcurrent_buffer ();
8871   if (NILP (dst_object))
8872     dst_object = src_object;
8873   else if (! EQ (dst_object, Qt))
8874     CHECK_BUFFER (dst_object);
8875
8876   validate_region (&start, &end);
8877   from = XFASTINT (start);
8878   from_byte = CHAR_TO_BYTE (from);
8879   to = XFASTINT (end);
8880   to_byte = CHAR_TO_BYTE (to);
8881
8882   setup_coding_system (coding_system, &coding);
8883   coding.mode |= CODING_MODE_LAST_BLOCK;
8884
8885   if (encodep)
8886     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8887                           dst_object);
8888   else
8889     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8890                           dst_object);
8891   if (! norecord)
8892     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8893
8894   return (BUFFERP (dst_object)
8895           ? make_number (coding.produced_char)
8896           : coding.dst_object);
8897 }
8898
8899
8900 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8901        3, 4, "r\nzCoding system: ",
8902        doc: /* Decode the current region from the specified coding system.
8903 When called from a program, takes four arguments:
8904         START, END, CODING-SYSTEM, and DESTINATION.
8905 START and END are buffer positions.
8906
8907 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8908 If nil, the region between START and END is replaced by the decoded text.
8909 If buffer, the decoded text is inserted in that buffer after point (point
8910 does not move).
8911 In those cases, the length of the decoded text is returned.
8912 If DESTINATION is t, the decoded text is returned.
8913
8914 This function sets `last-coding-system-used' to the precise coding system
8915 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8916 not fully specified.)  */)
8917   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8918 {
8919   return code_convert_region (start, end, coding_system, destination, 0, 0);
8920 }
8921
8922 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8923        3, 4, "r\nzCoding system: ",
8924        doc: /* Encode the current region by specified coding system.
8925 When called from a program, takes four arguments:
8926         START, END, CODING-SYSTEM and DESTINATION.
8927 START and END are buffer positions.
8928
8929 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8930 If nil, the region between START and END is replace by the encoded text.
8931 If buffer, the encoded text is inserted in that buffer after point (point
8932 does not move).
8933 In those cases, the length of the encoded text is returned.
8934 If DESTINATION is t, the encoded text is returned.
8935
8936 This function sets `last-coding-system-used' to the precise coding system
8937 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8938 not fully specified.)  */)
8939   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8940 {
8941   return code_convert_region (start, end, coding_system, destination, 1, 0);
8942 }
8943
8944 Lisp_Object
8945 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8946                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
8947 {
8948   struct coding_system coding;
8949   EMACS_INT chars, bytes;
8950
8951   CHECK_STRING (string);
8952   if (NILP (coding_system))
8953     {
8954       if (! norecord)
8955         Vlast_coding_system_used = Qno_conversion;
8956       if (NILP (dst_object))
8957         return (nocopy ? Fcopy_sequence (string) : string);
8958     }
8959
8960   if (NILP (coding_system))
8961     coding_system = Qno_conversion;
8962   else
8963     CHECK_CODING_SYSTEM (coding_system);
8964   if (NILP (dst_object))
8965     dst_object = Qt;
8966   else if (! EQ (dst_object, Qt))
8967     CHECK_BUFFER (dst_object);
8968
8969   setup_coding_system (coding_system, &coding);
8970   coding.mode |= CODING_MODE_LAST_BLOCK;
8971   chars = SCHARS (string);
8972   bytes = SBYTES (string);
8973   if (encodep)
8974     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8975   else
8976     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8977   if (! norecord)
8978     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8979
8980   return (BUFFERP (dst_object)
8981           ? make_number (coding.produced_char)
8982           : coding.dst_object);
8983 }
8984
8985
8986 /* Encode or decode STRING according to CODING_SYSTEM.
8987    Do not set Vlast_coding_system_used.
8988
8989    This function is called only from macros DECODE_FILE and
8990    ENCODE_FILE, thus we ignore character composition.  */
8991
8992 Lisp_Object
8993 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8994                               int encodep)
8995 {
8996   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8997 }
8998
8999
9000 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9001        2, 4, 0,
9002        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9003
9004 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9005 if the decoding operation is trivial.
9006
9007 Optional fourth arg BUFFER non-nil means that the decoded text is
9008 inserted in that buffer after point (point does not move).  In this
9009 case, the return value is the length of the decoded text.
9010
9011 This function sets `last-coding-system-used' to the precise coding system
9012 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9013 not fully specified.)  */)
9014   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9015 {
9016   return code_convert_string (string, coding_system, buffer,
9017                               0, ! NILP (nocopy), 0);
9018 }
9019
9020 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9021        2, 4, 0,
9022        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9023
9024 Optional third arg NOCOPY non-nil means it is OK to return STRING
9025 itself if the encoding operation is trivial.
9026
9027 Optional fourth arg BUFFER non-nil means that the encoded text is
9028 inserted in that buffer after point (point does not move).  In this
9029 case, the return value is the length of the encoded text.
9030
9031 This function sets `last-coding-system-used' to the precise coding system
9032 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9033 not fully specified.)  */)
9034   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9035 {
9036   return code_convert_string (string, coding_system, buffer,
9037                               1, ! NILP (nocopy), 1);
9038 }
9039
9040 \f
9041 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9042        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9043 Return the corresponding character.  */)
9044   (Lisp_Object code)
9045 {
9046   Lisp_Object spec, attrs, val;
9047   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9048   int c;
9049
9050   CHECK_NATNUM (code);
9051   c = XFASTINT (code);
9052   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9053   attrs = AREF (spec, 0);
9054
9055   if (ASCII_BYTE_P (c)
9056       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9057     return code;
9058
9059   val = CODING_ATTR_CHARSET_LIST (attrs);
9060   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9061   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9062   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9063
9064   if (c <= 0x7F)
9065     charset = charset_roman;
9066   else if (c >= 0xA0 && c < 0xDF)
9067     {
9068       charset = charset_kana;
9069       c -= 0x80;
9070     }
9071   else
9072     {
9073       int c1 = c >> 8, c2 = c & 0xFF;
9074
9075       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9076           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9077         error ("Invalid code: %d", code);
9078       SJIS_TO_JIS (c);
9079       charset = charset_kanji;
9080     }
9081   c = DECODE_CHAR (charset, c);
9082   if (c < 0)
9083     error ("Invalid code: %d", code);
9084   return make_number (c);
9085 }
9086
9087
9088 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9089        doc: /* Encode a Japanese character CH to shift_jis encoding.
9090 Return the corresponding code in SJIS.  */)
9091   (Lisp_Object ch)
9092 {
9093   Lisp_Object spec, attrs, charset_list;
9094   int c;
9095   struct charset *charset;
9096   unsigned code;
9097
9098   CHECK_CHARACTER (ch);
9099   c = XFASTINT (ch);
9100   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9101   attrs = AREF (spec, 0);
9102
9103   if (ASCII_CHAR_P (c)
9104       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9105     return ch;
9106
9107   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9108   charset = char_charset (c, charset_list, &code);
9109   if (code == CHARSET_INVALID_CODE (charset))
9110     error ("Can't encode by shift_jis encoding: %d", c);
9111   JIS_TO_SJIS (code);
9112
9113   return make_number (code);
9114 }
9115
9116 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9117        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9118 Return the corresponding character.  */)
9119   (Lisp_Object code)
9120 {
9121   Lisp_Object spec, attrs, val;
9122   struct charset *charset_roman, *charset_big5, *charset;
9123   int c;
9124
9125   CHECK_NATNUM (code);
9126   c = XFASTINT (code);
9127   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9128   attrs = AREF (spec, 0);
9129
9130   if (ASCII_BYTE_P (c)
9131       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9132     return code;
9133
9134   val = CODING_ATTR_CHARSET_LIST (attrs);
9135   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9136   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9137
9138   if (c <= 0x7F)
9139     charset = charset_roman;
9140   else
9141     {
9142       int b1 = c >> 8, b2 = c & 0x7F;
9143       if (b1 < 0xA1 || b1 > 0xFE
9144           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9145         error ("Invalid code: %d", code);
9146       charset = charset_big5;
9147     }
9148   c = DECODE_CHAR (charset, (unsigned )c);
9149   if (c < 0)
9150     error ("Invalid code: %d", code);
9151   return make_number (c);
9152 }
9153
9154 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9155        doc: /* Encode the Big5 character CH to BIG5 coding system.
9156 Return the corresponding character code in Big5.  */)
9157   (Lisp_Object ch)
9158 {
9159   Lisp_Object spec, attrs, charset_list;
9160   struct charset *charset;
9161   int c;
9162   unsigned code;
9163
9164   CHECK_CHARACTER (ch);
9165   c = XFASTINT (ch);
9166   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9167   attrs = AREF (spec, 0);
9168   if (ASCII_CHAR_P (c)
9169       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9170     return ch;
9171
9172   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9173   charset = char_charset (c, charset_list, &code);
9174   if (code == CHARSET_INVALID_CODE (charset))
9175     error ("Can't encode by Big5 encoding: %d", c);
9176
9177   return make_number (code);
9178 }
9179
9180 \f
9181 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9182        Sset_terminal_coding_system_internal, 1, 2, 0,
9183        doc: /* Internal use only.  */)
9184   (Lisp_Object coding_system, Lisp_Object terminal)
9185 {
9186   struct terminal *term = get_terminal (terminal, 1);
9187   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9188   CHECK_SYMBOL (coding_system);
9189   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9190   /* We had better not send unsafe characters to terminal.  */
9191   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9192   /* Character composition should be disabled.  */
9193   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9194   terminal_coding->src_multibyte = 1;
9195   terminal_coding->dst_multibyte = 0;
9196   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9197     term->charset_list = coding_charset_list (terminal_coding);
9198   else
9199     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9200   return Qnil;
9201 }
9202
9203 DEFUN ("set-safe-terminal-coding-system-internal",
9204        Fset_safe_terminal_coding_system_internal,
9205        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9206        doc: /* Internal use only.  */)
9207   (Lisp_Object coding_system)
9208 {
9209   CHECK_SYMBOL (coding_system);
9210   setup_coding_system (Fcheck_coding_system (coding_system),
9211                        &safe_terminal_coding);
9212   /* Character composition should be disabled.  */
9213   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9214   safe_terminal_coding.src_multibyte = 1;
9215   safe_terminal_coding.dst_multibyte = 0;
9216   return Qnil;
9217 }
9218
9219 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9220        Sterminal_coding_system, 0, 1, 0,
9221        doc: /* Return coding system specified for terminal output on the given terminal.
9222 TERMINAL may be a terminal object, a frame, or nil for the selected
9223 frame's terminal device.  */)
9224   (Lisp_Object terminal)
9225 {
9226   struct coding_system *terminal_coding
9227     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9228   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9229
9230   /* For backward compatibility, return nil if it is `undecided'. */
9231   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9232 }
9233
9234 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9235        Sset_keyboard_coding_system_internal, 1, 2, 0,
9236        doc: /* Internal use only.  */)
9237   (Lisp_Object coding_system, Lisp_Object terminal)
9238 {
9239   struct terminal *t = get_terminal (terminal, 1);
9240   CHECK_SYMBOL (coding_system);
9241   if (NILP (coding_system))
9242     coding_system = Qno_conversion;
9243   else
9244     Fcheck_coding_system (coding_system);
9245   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9246   /* Character composition should be disabled.  */
9247   TERMINAL_KEYBOARD_CODING (t)->common_flags
9248     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9249   return Qnil;
9250 }
9251
9252 DEFUN ("keyboard-coding-system",
9253        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9254        doc: /* Return coding system specified for decoding keyboard input.  */)
9255   (Lisp_Object terminal)
9256 {
9257   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9258                          (get_terminal (terminal, 1))->id);
9259 }
9260
9261 \f
9262 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9263        Sfind_operation_coding_system,  1, MANY, 0,
9264        doc: /* Choose a coding system for an operation based on the target name.
9265 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9266 DECODING-SYSTEM is the coding system to use for decoding
9267 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9268 for encoding (in case OPERATION does encoding).
9269
9270 The first argument OPERATION specifies an I/O primitive:
9271   For file I/O, `insert-file-contents' or `write-region'.
9272   For process I/O, `call-process', `call-process-region', or `start-process'.
9273   For network I/O, `open-network-stream'.
9274
9275 The remaining arguments should be the same arguments that were passed
9276 to the primitive.  Depending on which primitive, one of those arguments
9277 is selected as the TARGET.  For example, if OPERATION does file I/O,
9278 whichever argument specifies the file name is TARGET.
9279
9280 TARGET has a meaning which depends on OPERATION:
9281   For file I/O, TARGET is a file name (except for the special case below).
9282   For process I/O, TARGET is a process name.
9283   For network I/O, TARGET is a service name or a port number.
9284
9285 This function looks up what is specified for TARGET in
9286 `file-coding-system-alist', `process-coding-system-alist',
9287 or `network-coding-system-alist' depending on OPERATION.
9288 They may specify a coding system, a cons of coding systems,
9289 or a function symbol to call.
9290 In the last case, we call the function with one argument,
9291 which is a list of all the arguments given to this function.
9292 If the function can't decide a coding system, it can return
9293 `undecided' so that the normal code-detection is performed.
9294
9295 If OPERATION is `insert-file-contents', the argument corresponding to
9296 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9297 file name to look up, and BUFFER is a buffer that contains the file's
9298 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9299 function to call for FILENAME, that function should examine the
9300 contents of BUFFER instead of reading the file.
9301
9302 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9303   (size_t nargs, Lisp_Object *args)
9304 {
9305   Lisp_Object operation, target_idx, target, val;
9306   register Lisp_Object chain;
9307
9308   if (nargs < 2)
9309     error ("Too few arguments");
9310   operation = args[0];
9311   if (!SYMBOLP (operation)
9312       || !NATNUMP (target_idx = Fget (operation, Qtarget_idx)))
9313     error ("Invalid first argument");
9314   if (nargs < 1 + XFASTINT (target_idx))
9315     error ("Too few arguments for operation: %s",
9316            SDATA (SYMBOL_NAME (operation)));
9317   target = args[XFASTINT (target_idx) + 1];
9318   if (!(STRINGP (target)
9319         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9320             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9321         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9322     error ("Invalid %dth argument", XFASTINT (target_idx) + 1);
9323   if (CONSP (target))
9324     target = XCAR (target);
9325
9326   chain = ((EQ (operation, Qinsert_file_contents)
9327             || EQ (operation, Qwrite_region))
9328            ? Vfile_coding_system_alist
9329            : (EQ (operation, Qopen_network_stream)
9330               ? Vnetwork_coding_system_alist
9331               : Vprocess_coding_system_alist));
9332   if (NILP (chain))
9333     return Qnil;
9334
9335   for (; CONSP (chain); chain = XCDR (chain))
9336     {
9337       Lisp_Object elt;
9338
9339       elt = XCAR (chain);
9340       if (CONSP (elt)
9341           && ((STRINGP (target)
9342                && STRINGP (XCAR (elt))
9343                && fast_string_match (XCAR (elt), target) >= 0)
9344               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9345         {
9346           val = XCDR (elt);
9347           /* Here, if VAL is both a valid coding system and a valid
9348              function symbol, we return VAL as a coding system.  */
9349           if (CONSP (val))
9350             return val;
9351           if (! SYMBOLP (val))
9352             return Qnil;
9353           if (! NILP (Fcoding_system_p (val)))
9354             return Fcons (val, val);
9355           if (! NILP (Ffboundp (val)))
9356             {
9357               /* We use call1 rather than safe_call1
9358                  so as to get bug reports about functions called here
9359                  which don't handle the current interface.  */
9360               val = call1 (val, Flist (nargs, args));
9361               if (CONSP (val))
9362                 return val;
9363               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9364                 return Fcons (val, val);
9365             }
9366           return Qnil;
9367         }
9368     }
9369   return Qnil;
9370 }
9371
9372 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9373        Sset_coding_system_priority, 0, MANY, 0,
9374        doc: /* Assign higher priority to the coding systems given as arguments.
9375 If multiple coding systems belong to the same category,
9376 all but the first one are ignored.
9377
9378 usage: (set-coding-system-priority &rest coding-systems)  */)
9379   (size_t nargs, Lisp_Object *args)
9380 {
9381   size_t i, j;
9382   int changed[coding_category_max];
9383   enum coding_category priorities[coding_category_max];
9384
9385   memset (changed, 0, sizeof changed);
9386
9387   for (i = j = 0; i < nargs; i++)
9388     {
9389       enum coding_category category;
9390       Lisp_Object spec, attrs;
9391
9392       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9393       attrs = AREF (spec, 0);
9394       category = XINT (CODING_ATTR_CATEGORY (attrs));
9395       if (changed[category])
9396         /* Ignore this coding system because a coding system of the
9397            same category already had a higher priority.  */
9398         continue;
9399       changed[category] = 1;
9400       priorities[j++] = category;
9401       if (coding_categories[category].id >= 0
9402           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9403         setup_coding_system (args[i], &coding_categories[category]);
9404       Fset (AREF (Vcoding_category_table, category), args[i]);
9405     }
9406
9407   /* Now we have decided top J priorities.  Reflect the order of the
9408      original priorities to the remaining priorities.  */
9409
9410   for (i = j, j = 0; i < coding_category_max; i++, j++)
9411     {
9412       while (j < coding_category_max
9413              && changed[coding_priorities[j]])
9414         j++;
9415       if (j == coding_category_max)
9416         abort ();
9417       priorities[i] = coding_priorities[j];
9418     }
9419
9420   memcpy (coding_priorities, priorities, sizeof priorities);
9421
9422   /* Update `coding-category-list'.  */
9423   Vcoding_category_list = Qnil;
9424   for (i = coding_category_max; i-- > 0; )
9425     Vcoding_category_list
9426       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9427                Vcoding_category_list);
9428
9429   return Qnil;
9430 }
9431
9432 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9433        Scoding_system_priority_list, 0, 1, 0,
9434        doc: /* Return a list of coding systems ordered by their priorities.
9435 The list contains a subset of coding systems; i.e. coding systems
9436 assigned to each coding category (see `coding-category-list').
9437
9438 HIGHESTP non-nil means just return the highest priority one.  */)
9439   (Lisp_Object highestp)
9440 {
9441   int i;
9442   Lisp_Object val;
9443
9444   for (i = 0, val = Qnil; i < coding_category_max; i++)
9445     {
9446       enum coding_category category = coding_priorities[i];
9447       int id = coding_categories[category].id;
9448       Lisp_Object attrs;
9449
9450       if (id < 0)
9451         continue;
9452       attrs = CODING_ID_ATTRS (id);
9453       if (! NILP (highestp))
9454         return CODING_ATTR_BASE_NAME (attrs);
9455       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9456     }
9457   return Fnreverse (val);
9458 }
9459
9460 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9461
9462 static Lisp_Object
9463 make_subsidiaries (Lisp_Object base)
9464 {
9465   Lisp_Object subsidiaries;
9466   int base_name_len = SBYTES (SYMBOL_NAME (base));
9467   char *buf = (char *) alloca (base_name_len + 6);
9468   int i;
9469
9470   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9471   subsidiaries = Fmake_vector (make_number (3), Qnil);
9472   for (i = 0; i < 3; i++)
9473     {
9474       memcpy (buf + base_name_len, suffixes[i], strlen (suffixes[i]) + 1);
9475       ASET (subsidiaries, i, intern (buf));
9476     }
9477   return subsidiaries;
9478 }
9479
9480
9481 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9482        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9483        doc: /* For internal use only.
9484 usage: (define-coding-system-internal ...)  */)
9485   (size_t nargs, Lisp_Object *args)
9486 {
9487   Lisp_Object name;
9488   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9489   Lisp_Object attrs;            /* Vector of attributes.  */
9490   Lisp_Object eol_type;
9491   Lisp_Object aliases;
9492   Lisp_Object coding_type, charset_list, safe_charsets;
9493   enum coding_category category;
9494   Lisp_Object tail, val;
9495   int max_charset_id = 0;
9496   int i;
9497
9498   if (nargs < coding_arg_max)
9499     goto short_args;
9500
9501   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9502
9503   name = args[coding_arg_name];
9504   CHECK_SYMBOL (name);
9505   CODING_ATTR_BASE_NAME (attrs) = name;
9506
9507   val = args[coding_arg_mnemonic];
9508   if (! STRINGP (val))
9509     CHECK_CHARACTER (val);
9510   CODING_ATTR_MNEMONIC (attrs) = val;
9511
9512   coding_type = args[coding_arg_coding_type];
9513   CHECK_SYMBOL (coding_type);
9514   CODING_ATTR_TYPE (attrs) = coding_type;
9515
9516   charset_list = args[coding_arg_charset_list];
9517   if (SYMBOLP (charset_list))
9518     {
9519       if (EQ (charset_list, Qiso_2022))
9520         {
9521           if (! EQ (coding_type, Qiso_2022))
9522             error ("Invalid charset-list");
9523           charset_list = Viso_2022_charset_list;
9524         }
9525       else if (EQ (charset_list, Qemacs_mule))
9526         {
9527           if (! EQ (coding_type, Qemacs_mule))
9528             error ("Invalid charset-list");
9529           charset_list = Vemacs_mule_charset_list;
9530         }
9531       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9532         if (max_charset_id < XFASTINT (XCAR (tail)))
9533           max_charset_id = XFASTINT (XCAR (tail));
9534     }
9535   else
9536     {
9537       charset_list = Fcopy_sequence (charset_list);
9538       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9539         {
9540           struct charset *charset;
9541
9542           val = XCAR (tail);
9543           CHECK_CHARSET_GET_CHARSET (val, charset);
9544           if (EQ (coding_type, Qiso_2022)
9545               ? CHARSET_ISO_FINAL (charset) < 0
9546               : EQ (coding_type, Qemacs_mule)
9547               ? CHARSET_EMACS_MULE_ID (charset) < 0
9548               : 0)
9549             error ("Can't handle charset `%s'",
9550                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9551
9552           XSETCAR (tail, make_number (charset->id));
9553           if (max_charset_id < charset->id)
9554             max_charset_id = charset->id;
9555         }
9556     }
9557   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9558
9559   safe_charsets = make_uninit_string (max_charset_id + 1);
9560   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9561   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9562     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9563   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9564
9565   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9566
9567   val = args[coding_arg_decode_translation_table];
9568   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9569     CHECK_SYMBOL (val);
9570   CODING_ATTR_DECODE_TBL (attrs) = val;
9571
9572   val = args[coding_arg_encode_translation_table];
9573   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9574     CHECK_SYMBOL (val);
9575   CODING_ATTR_ENCODE_TBL (attrs) = val;
9576
9577   val = args[coding_arg_post_read_conversion];
9578   CHECK_SYMBOL (val);
9579   CODING_ATTR_POST_READ (attrs) = val;
9580
9581   val = args[coding_arg_pre_write_conversion];
9582   CHECK_SYMBOL (val);
9583   CODING_ATTR_PRE_WRITE (attrs) = val;
9584
9585   val = args[coding_arg_default_char];
9586   if (NILP (val))
9587     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9588   else
9589     {
9590       CHECK_CHARACTER (val);
9591       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9592     }
9593
9594   val = args[coding_arg_for_unibyte];
9595   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9596
9597   val = args[coding_arg_plist];
9598   CHECK_LIST (val);
9599   CODING_ATTR_PLIST (attrs) = val;
9600
9601   if (EQ (coding_type, Qcharset))
9602     {
9603       /* Generate a lisp vector of 256 elements.  Each element is nil,
9604          integer, or a list of charset IDs.
9605
9606          If Nth element is nil, the byte code N is invalid in this
9607          coding system.
9608
9609          If Nth element is a number NUM, N is the first byte of a
9610          charset whose ID is NUM.
9611
9612          If Nth element is a list of charset IDs, N is the first byte
9613          of one of them.  The list is sorted by dimensions of the
9614          charsets.  A charset of smaller dimension comes first. */
9615       val = Fmake_vector (make_number (256), Qnil);
9616
9617       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9618         {
9619           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9620           int dim = CHARSET_DIMENSION (charset);
9621           int idx = (dim - 1) * 4;
9622
9623           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9624             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9625
9626           for (i = charset->code_space[idx];
9627                i <= charset->code_space[idx + 1]; i++)
9628             {
9629               Lisp_Object tmp, tmp2;
9630               int dim2;
9631
9632               tmp = AREF (val, i);
9633               if (NILP (tmp))
9634                 tmp = XCAR (tail);
9635               else if (NUMBERP (tmp))
9636                 {
9637                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9638                   if (dim < dim2)
9639                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9640                   else
9641                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9642                 }
9643               else
9644                 {
9645                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9646                     {
9647                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9648                       if (dim < dim2)
9649                         break;
9650                     }
9651                   if (NILP (tmp2))
9652                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9653                   else
9654                     {
9655                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9656                       XSETCAR (tmp2, XCAR (tail));
9657                     }
9658                 }
9659               ASET (val, i, tmp);
9660             }
9661         }
9662       ASET (attrs, coding_attr_charset_valids, val);
9663       category = coding_category_charset;
9664     }
9665   else if (EQ (coding_type, Qccl))
9666     {
9667       Lisp_Object valids;
9668
9669       if (nargs < coding_arg_ccl_max)
9670         goto short_args;
9671
9672       val = args[coding_arg_ccl_decoder];
9673       CHECK_CCL_PROGRAM (val);
9674       if (VECTORP (val))
9675         val = Fcopy_sequence (val);
9676       ASET (attrs, coding_attr_ccl_decoder, val);
9677
9678       val = args[coding_arg_ccl_encoder];
9679       CHECK_CCL_PROGRAM (val);
9680       if (VECTORP (val))
9681         val = Fcopy_sequence (val);
9682       ASET (attrs, coding_attr_ccl_encoder, val);
9683
9684       val = args[coding_arg_ccl_valids];
9685       valids = Fmake_string (make_number (256), make_number (0));
9686       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9687         {
9688           int from, to;
9689
9690           val = Fcar (tail);
9691           if (INTEGERP (val))
9692             {
9693               from = to = XINT (val);
9694               if (from < 0 || from > 255)
9695                 args_out_of_range_3 (val, make_number (0), make_number (255));
9696             }
9697           else
9698             {
9699               CHECK_CONS (val);
9700               CHECK_NATNUM_CAR (val);
9701               CHECK_NATNUM_CDR (val);
9702               from = XINT (XCAR (val));
9703               if (from > 255)
9704                 args_out_of_range_3 (XCAR (val),
9705                                      make_number (0), make_number (255));
9706               to = XINT (XCDR (val));
9707               if (to < from || to > 255)
9708                 args_out_of_range_3 (XCDR (val),
9709                                      XCAR (val), make_number (255));
9710             }
9711           for (i = from; i <= to; i++)
9712             SSET (valids, i, 1);
9713         }
9714       ASET (attrs, coding_attr_ccl_valids, valids);
9715
9716       category = coding_category_ccl;
9717     }
9718   else if (EQ (coding_type, Qutf_16))
9719     {
9720       Lisp_Object bom, endian;
9721
9722       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9723
9724       if (nargs < coding_arg_utf16_max)
9725         goto short_args;
9726
9727       bom = args[coding_arg_utf16_bom];
9728       if (! NILP (bom) && ! EQ (bom, Qt))
9729         {
9730           CHECK_CONS (bom);
9731           val = XCAR (bom);
9732           CHECK_CODING_SYSTEM (val);
9733           val = XCDR (bom);
9734           CHECK_CODING_SYSTEM (val);
9735         }
9736       ASET (attrs, coding_attr_utf_bom, bom);
9737
9738       endian = args[coding_arg_utf16_endian];
9739       CHECK_SYMBOL (endian);
9740       if (NILP (endian))
9741         endian = Qbig;
9742       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9743         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9744       ASET (attrs, coding_attr_utf_16_endian, endian);
9745
9746       category = (CONSP (bom)
9747                   ? coding_category_utf_16_auto
9748                   : NILP (bom)
9749                   ? (EQ (endian, Qbig)
9750                      ? coding_category_utf_16_be_nosig
9751                      : coding_category_utf_16_le_nosig)
9752                   : (EQ (endian, Qbig)
9753                      ? coding_category_utf_16_be
9754                      : coding_category_utf_16_le));
9755     }
9756   else if (EQ (coding_type, Qiso_2022))
9757     {
9758       Lisp_Object initial, reg_usage, request, flags;
9759
9760       if (nargs < coding_arg_iso2022_max)
9761         goto short_args;
9762
9763       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9764       CHECK_VECTOR (initial);
9765       for (i = 0; i < 4; i++)
9766         {
9767           val = Faref (initial, make_number (i));
9768           if (! NILP (val))
9769             {
9770               struct charset *charset;
9771
9772               CHECK_CHARSET_GET_CHARSET (val, charset);
9773               ASET (initial, i, make_number (CHARSET_ID (charset)));
9774               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9775                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9776             }
9777           else
9778             ASET (initial, i, make_number (-1));
9779         }
9780
9781       reg_usage = args[coding_arg_iso2022_reg_usage];
9782       CHECK_CONS (reg_usage);
9783       CHECK_NUMBER_CAR (reg_usage);
9784       CHECK_NUMBER_CDR (reg_usage);
9785
9786       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9787       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9788         {
9789           int id;
9790           Lisp_Object tmp1;
9791
9792           val = Fcar (tail);
9793           CHECK_CONS (val);
9794           tmp1 = XCAR (val);
9795           CHECK_CHARSET_GET_ID (tmp1, id);
9796           CHECK_NATNUM_CDR (val);
9797           if (XINT (XCDR (val)) >= 4)
9798             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9799           XSETCAR (val, make_number (id));
9800         }
9801
9802       flags = args[coding_arg_iso2022_flags];
9803       CHECK_NATNUM (flags);
9804       i = XINT (flags);
9805       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9806         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9807
9808       ASET (attrs, coding_attr_iso_initial, initial);
9809       ASET (attrs, coding_attr_iso_usage, reg_usage);
9810       ASET (attrs, coding_attr_iso_request, request);
9811       ASET (attrs, coding_attr_iso_flags, flags);
9812       setup_iso_safe_charsets (attrs);
9813
9814       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9815         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9816                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9817                     ? coding_category_iso_7_else
9818                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9819                     ? coding_category_iso_7
9820                     : coding_category_iso_7_tight);
9821       else
9822         {
9823           int id = XINT (AREF (initial, 1));
9824
9825           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9826                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9827                        || id < 0)
9828                       ? coding_category_iso_8_else
9829                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9830                       ? coding_category_iso_8_1
9831                       : coding_category_iso_8_2);
9832         }
9833       if (category != coding_category_iso_8_1
9834           && category != coding_category_iso_8_2)
9835         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9836     }
9837   else if (EQ (coding_type, Qemacs_mule))
9838     {
9839       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9840         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9841       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9842       category = coding_category_emacs_mule;
9843     }
9844   else if (EQ (coding_type, Qshift_jis))
9845     {
9846
9847       struct charset *charset;
9848
9849       if (XINT (Flength (charset_list)) != 3
9850           && XINT (Flength (charset_list)) != 4)
9851         error ("There should be three or four charsets");
9852
9853       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9854       if (CHARSET_DIMENSION (charset) != 1)
9855         error ("Dimension of charset %s is not one",
9856                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9857       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9858         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9859
9860       charset_list = XCDR (charset_list);
9861       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9862       if (CHARSET_DIMENSION (charset) != 1)
9863         error ("Dimension of charset %s is not one",
9864                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9865
9866       charset_list = XCDR (charset_list);
9867       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9868       if (CHARSET_DIMENSION (charset) != 2)
9869         error ("Dimension of charset %s is not two",
9870                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9871
9872       charset_list = XCDR (charset_list);
9873       if (! NILP (charset_list))
9874         {
9875           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9876           if (CHARSET_DIMENSION (charset) != 2)
9877             error ("Dimension of charset %s is not two",
9878                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9879         }
9880
9881       category = coding_category_sjis;
9882       Vsjis_coding_system = name;
9883     }
9884   else if (EQ (coding_type, Qbig5))
9885     {
9886       struct charset *charset;
9887
9888       if (XINT (Flength (charset_list)) != 2)
9889         error ("There should be just two charsets");
9890
9891       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9892       if (CHARSET_DIMENSION (charset) != 1)
9893         error ("Dimension of charset %s is not one",
9894                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9895       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9896         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9897
9898       charset_list = XCDR (charset_list);
9899       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9900       if (CHARSET_DIMENSION (charset) != 2)
9901         error ("Dimension of charset %s is not two",
9902                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9903
9904       category = coding_category_big5;
9905       Vbig5_coding_system = name;
9906     }
9907   else if (EQ (coding_type, Qraw_text))
9908     {
9909       category = coding_category_raw_text;
9910       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9911     }
9912   else if (EQ (coding_type, Qutf_8))
9913     {
9914       Lisp_Object bom;
9915
9916       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9917
9918       if (nargs < coding_arg_utf8_max)
9919         goto short_args;
9920
9921       bom = args[coding_arg_utf8_bom];
9922       if (! NILP (bom) && ! EQ (bom, Qt))
9923         {
9924           CHECK_CONS (bom);
9925           val = XCAR (bom);
9926           CHECK_CODING_SYSTEM (val);
9927           val = XCDR (bom);
9928           CHECK_CODING_SYSTEM (val);
9929         }
9930       ASET (attrs, coding_attr_utf_bom, bom);
9931
9932       category = (CONSP (bom) ? coding_category_utf_8_auto
9933                   : NILP (bom) ? coding_category_utf_8_nosig
9934                   : coding_category_utf_8_sig);
9935     }
9936   else if (EQ (coding_type, Qundecided))
9937     category = coding_category_undecided;
9938   else
9939     error ("Invalid coding system type: %s",
9940            SDATA (SYMBOL_NAME (coding_type)));
9941
9942   CODING_ATTR_CATEGORY (attrs) = make_number (category);
9943   CODING_ATTR_PLIST (attrs)
9944     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9945                                 CODING_ATTR_PLIST (attrs)));
9946   CODING_ATTR_PLIST (attrs)
9947     = Fcons (QCascii_compatible_p,
9948              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9949                     CODING_ATTR_PLIST (attrs)));
9950
9951   eol_type = args[coding_arg_eol_type];
9952   if (! NILP (eol_type)
9953       && ! EQ (eol_type, Qunix)
9954       && ! EQ (eol_type, Qdos)
9955       && ! EQ (eol_type, Qmac))
9956     error ("Invalid eol-type");
9957
9958   aliases = Fcons (name, Qnil);
9959
9960   if (NILP (eol_type))
9961     {
9962       eol_type = make_subsidiaries (name);
9963       for (i = 0; i < 3; i++)
9964         {
9965           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9966
9967           this_name = AREF (eol_type, i);
9968           this_aliases = Fcons (this_name, Qnil);
9969           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9970           this_spec = Fmake_vector (make_number (3), attrs);
9971           ASET (this_spec, 1, this_aliases);
9972           ASET (this_spec, 2, this_eol_type);
9973           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9974           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9975           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9976           if (NILP (val))
9977             Vcoding_system_alist
9978               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9979                        Vcoding_system_alist);
9980         }
9981     }
9982
9983   spec_vec = Fmake_vector (make_number (3), attrs);
9984   ASET (spec_vec, 1, aliases);
9985   ASET (spec_vec, 2, eol_type);
9986
9987   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9988   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9989   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9990   if (NILP (val))
9991     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9992                                   Vcoding_system_alist);
9993
9994   {
9995     int id = coding_categories[category].id;
9996
9997     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9998       setup_coding_system (name, &coding_categories[category]);
9999   }
10000
10001   return Qnil;
10002
10003  short_args:
10004   return Fsignal (Qwrong_number_of_arguments,
10005                   Fcons (intern ("define-coding-system-internal"),
10006                          make_number (nargs)));
10007 }
10008
10009
10010 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10011        3, 3, 0,
10012        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10013   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10014 {
10015   Lisp_Object spec, attrs;
10016
10017   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10018   attrs = AREF (spec, 0);
10019   if (EQ (prop, QCmnemonic))
10020     {
10021       if (! STRINGP (val))
10022         CHECK_CHARACTER (val);
10023       CODING_ATTR_MNEMONIC (attrs) = val;
10024     }
10025   else if (EQ (prop, QCdefault_char))
10026     {
10027       if (NILP (val))
10028         val = make_number (' ');
10029       else
10030         CHECK_CHARACTER (val);
10031       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10032     }
10033   else if (EQ (prop, QCdecode_translation_table))
10034     {
10035       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10036         CHECK_SYMBOL (val);
10037       CODING_ATTR_DECODE_TBL (attrs) = val;
10038     }
10039   else if (EQ (prop, QCencode_translation_table))
10040     {
10041       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10042         CHECK_SYMBOL (val);
10043       CODING_ATTR_ENCODE_TBL (attrs) = val;
10044     }
10045   else if (EQ (prop, QCpost_read_conversion))
10046     {
10047       CHECK_SYMBOL (val);
10048       CODING_ATTR_POST_READ (attrs) = val;
10049     }
10050   else if (EQ (prop, QCpre_write_conversion))
10051     {
10052       CHECK_SYMBOL (val);
10053       CODING_ATTR_PRE_WRITE (attrs) = val;
10054     }
10055   else if (EQ (prop, QCascii_compatible_p))
10056     {
10057       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10058     }
10059
10060   CODING_ATTR_PLIST (attrs)
10061     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10062   return val;
10063 }
10064
10065
10066 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10067        Sdefine_coding_system_alias, 2, 2, 0,
10068        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10069   (Lisp_Object alias, Lisp_Object coding_system)
10070 {
10071   Lisp_Object spec, aliases, eol_type, val;
10072
10073   CHECK_SYMBOL (alias);
10074   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10075   aliases = AREF (spec, 1);
10076   /* ALIASES should be a list of length more than zero, and the first
10077      element is a base coding system.  Append ALIAS at the tail of the
10078      list.  */
10079   while (!NILP (XCDR (aliases)))
10080     aliases = XCDR (aliases);
10081   XSETCDR (aliases, Fcons (alias, Qnil));
10082
10083   eol_type = AREF (spec, 2);
10084   if (VECTORP (eol_type))
10085     {
10086       Lisp_Object subsidiaries;
10087       int i;
10088
10089       subsidiaries = make_subsidiaries (alias);
10090       for (i = 0; i < 3; i++)
10091         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10092                                      AREF (eol_type, i));
10093     }
10094
10095   Fputhash (alias, spec, Vcoding_system_hash_table);
10096   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10097   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10098   if (NILP (val))
10099     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10100                                   Vcoding_system_alist);
10101
10102   return Qnil;
10103 }
10104
10105 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10106        1, 1, 0,
10107        doc: /* Return the base of CODING-SYSTEM.
10108 Any alias or subsidiary coding system is not a base coding system.  */)
10109   (Lisp_Object coding_system)
10110 {
10111   Lisp_Object spec, attrs;
10112
10113   if (NILP (coding_system))
10114     return (Qno_conversion);
10115   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10116   attrs = AREF (spec, 0);
10117   return CODING_ATTR_BASE_NAME (attrs);
10118 }
10119
10120 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10121        1, 1, 0,
10122        doc: "Return the property list of CODING-SYSTEM.")
10123   (Lisp_Object coding_system)
10124 {
10125   Lisp_Object spec, attrs;
10126
10127   if (NILP (coding_system))
10128     coding_system = Qno_conversion;
10129   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10130   attrs = AREF (spec, 0);
10131   return CODING_ATTR_PLIST (attrs);
10132 }
10133
10134
10135 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10136        1, 1, 0,
10137        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10138   (Lisp_Object coding_system)
10139 {
10140   Lisp_Object spec;
10141
10142   if (NILP (coding_system))
10143     coding_system = Qno_conversion;
10144   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10145   return AREF (spec, 1);
10146 }
10147
10148 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10149        Scoding_system_eol_type, 1, 1, 0,
10150        doc: /* Return eol-type of CODING-SYSTEM.
10151 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10152
10153 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10154 and CR respectively.
10155
10156 A vector value indicates that a format of end-of-line should be
10157 detected automatically.  Nth element of the vector is the subsidiary
10158 coding system whose eol-type is N.  */)
10159   (Lisp_Object coding_system)
10160 {
10161   Lisp_Object spec, eol_type;
10162   int n;
10163
10164   if (NILP (coding_system))
10165     coding_system = Qno_conversion;
10166   if (! CODING_SYSTEM_P (coding_system))
10167     return Qnil;
10168   spec = CODING_SYSTEM_SPEC (coding_system);
10169   eol_type = AREF (spec, 2);
10170   if (VECTORP (eol_type))
10171     return Fcopy_sequence (eol_type);
10172   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10173   return make_number (n);
10174 }
10175
10176 #endif /* emacs */
10177
10178 \f
/*** 12. Post-amble ***/
10180
10181 void
10182 init_coding_once (void)
10183 {
10184   int i;
10185
10186   for (i = 0; i < coding_category_max; i++)
10187     {
10188       coding_categories[i].id = -1;
10189       coding_priorities[i] = i;
10190     }
10191
10192   /* ISO2022 specific initialize routine.  */
10193   for (i = 0; i < 0x20; i++)
10194     iso_code_class[i] = ISO_control_0;
10195   for (i = 0x21; i < 0x7F; i++)
10196     iso_code_class[i] = ISO_graphic_plane_0;
10197   for (i = 0x80; i < 0xA0; i++)
10198     iso_code_class[i] = ISO_control_1;
10199   for (i = 0xA1; i < 0xFF; i++)
10200     iso_code_class[i] = ISO_graphic_plane_1;
10201   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10202   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10203   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10204   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10205   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10206   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10207   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10208   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10209   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10210
10211   for (i = 0; i < 256; i++)
10212     {
10213       emacs_mule_bytes[i] = 1;
10214     }
10215   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10216   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10217   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10218   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10219 }
10220
10221 #ifdef emacs
10222
10223 void
10224 syms_of_coding (void)
10225 {
10226   staticpro (&Vcoding_system_hash_table);
10227   {
10228     Lisp_Object args[2];
10229     args[0] = QCtest;
10230     args[1] = Qeq;
10231     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10232   }
10233
10234   staticpro (&Vsjis_coding_system);
10235   Vsjis_coding_system = Qnil;
10236
10237   staticpro (&Vbig5_coding_system);
10238   Vbig5_coding_system = Qnil;
10239
10240   staticpro (&Vcode_conversion_reused_workbuf);
10241   Vcode_conversion_reused_workbuf = Qnil;
10242
10243   staticpro (&Vcode_conversion_workbuf_name);
10244   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10245
10246   reused_workbuf_in_use = 0;
10247
10248   DEFSYM (Qcharset, "charset");
10249   DEFSYM (Qtarget_idx, "target-idx");
10250   DEFSYM (Qcoding_system_history, "coding-system-history");
10251   Fset (Qcoding_system_history, Qnil);
10252
10253   /* Target FILENAME is the first argument.  */
10254   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10255   /* Target FILENAME is the third argument.  */
10256   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10257
10258   DEFSYM (Qcall_process, "call-process");
10259   /* Target PROGRAM is the first argument.  */
10260   Fput (Qcall_process, Qtarget_idx, make_number (0));
10261
10262   DEFSYM (Qcall_process_region, "call-process-region");
10263   /* Target PROGRAM is the third argument.  */
10264   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10265
10266   DEFSYM (Qstart_process, "start-process");
10267   /* Target PROGRAM is the third argument.  */
10268   Fput (Qstart_process, Qtarget_idx, make_number (2));
10269
10270   DEFSYM (Qopen_network_stream, "open-network-stream");
10271   /* Target SERVICE is the fourth argument.  */
10272   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10273
10274   DEFSYM (Qcoding_system, "coding-system");
10275   DEFSYM (Qcoding_aliases, "coding-aliases");
10276
10277   DEFSYM (Qeol_type, "eol-type");
10278   DEFSYM (Qunix, "unix");
10279   DEFSYM (Qdos, "dos");
10280
10281   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10282   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10283   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10284   DEFSYM (Qdefault_char, "default-char");
10285   DEFSYM (Qundecided, "undecided");
10286   DEFSYM (Qno_conversion, "no-conversion");
10287   DEFSYM (Qraw_text, "raw-text");
10288
10289   DEFSYM (Qiso_2022, "iso-2022");
10290
10291   DEFSYM (Qutf_8, "utf-8");
10292   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10293
10294   DEFSYM (Qutf_16, "utf-16");
10295   DEFSYM (Qbig, "big");
10296   DEFSYM (Qlittle, "little");
10297
10298   DEFSYM (Qshift_jis, "shift-jis");
10299   DEFSYM (Qbig5, "big5");
10300
10301   DEFSYM (Qcoding_system_p, "coding-system-p");
10302
10303   DEFSYM (Qcoding_system_error, "coding-system-error");
10304   Fput (Qcoding_system_error, Qerror_conditions,
10305         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10306   Fput (Qcoding_system_error, Qerror_message,
10307         make_pure_c_string ("Invalid coding system"));
10308
10309   /* Intern this now in case it isn't already done.
10310      Setting this variable twice is harmless.
10311      But don't staticpro it here--that is done in alloc.c.  */
10312   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10313
10314   DEFSYM (Qtranslation_table, "translation-table");
10315   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10316   DEFSYM (Qtranslation_table_id, "translation-table-id");
10317   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10318   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10319
10320   DEFSYM (Qvalid_codes, "valid-codes");
10321
10322   DEFSYM (Qemacs_mule, "emacs-mule");
10323
10324   DEFSYM (QCcategory, ":category");
10325   DEFSYM (QCmnemonic, ":mnemonic");
10326   DEFSYM (QCdefault_char, ":default-char");
10327   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10328   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10329   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10330   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10331   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10332
10333   Vcoding_category_table
10334     = Fmake_vector (make_number (coding_category_max), Qnil);
10335   staticpro (&Vcoding_category_table);
10336   /* Followings are target of code detection.  */
10337   ASET (Vcoding_category_table, coding_category_iso_7,
10338         intern_c_string ("coding-category-iso-7"));
10339   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10340         intern_c_string ("coding-category-iso-7-tight"));
10341   ASET (Vcoding_category_table, coding_category_iso_8_1,
10342         intern_c_string ("coding-category-iso-8-1"));
10343   ASET (Vcoding_category_table, coding_category_iso_8_2,
10344         intern_c_string ("coding-category-iso-8-2"));
10345   ASET (Vcoding_category_table, coding_category_iso_7_else,
10346         intern_c_string ("coding-category-iso-7-else"));
10347   ASET (Vcoding_category_table, coding_category_iso_8_else,
10348         intern_c_string ("coding-category-iso-8-else"));
10349   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10350         intern_c_string ("coding-category-utf-8-auto"));
10351   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10352         intern_c_string ("coding-category-utf-8"));
10353   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10354         intern_c_string ("coding-category-utf-8-sig"));
10355   ASET (Vcoding_category_table, coding_category_utf_16_be,
10356         intern_c_string ("coding-category-utf-16-be"));
10357   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10358         intern_c_string ("coding-category-utf-16-auto"));
10359   ASET (Vcoding_category_table, coding_category_utf_16_le,
10360         intern_c_string ("coding-category-utf-16-le"));
10361   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10362         intern_c_string ("coding-category-utf-16-be-nosig"));
10363   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10364         intern_c_string ("coding-category-utf-16-le-nosig"));
10365   ASET (Vcoding_category_table, coding_category_charset,
10366         intern_c_string ("coding-category-charset"));
10367   ASET (Vcoding_category_table, coding_category_sjis,
10368         intern_c_string ("coding-category-sjis"));
10369   ASET (Vcoding_category_table, coding_category_big5,
10370         intern_c_string ("coding-category-big5"));
10371   ASET (Vcoding_category_table, coding_category_ccl,
10372         intern_c_string ("coding-category-ccl"));
10373   ASET (Vcoding_category_table, coding_category_emacs_mule,
10374         intern_c_string ("coding-category-emacs-mule"));
10375   /* Followings are NOT target of code detection.  */
10376   ASET (Vcoding_category_table, coding_category_raw_text,
10377         intern_c_string ("coding-category-raw-text"));
10378   ASET (Vcoding_category_table, coding_category_undecided,
10379         intern_c_string ("coding-category-undecided"));
10380
10381   DEFSYM (Qinsufficient_source, "insufficient-source");
10382   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10383   DEFSYM (Qinvalid_source, "invalid-source");
10384   DEFSYM (Qinterrupted, "interrupted");
10385   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10386   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10387
10388   defsubr (&Scoding_system_p);
10389   defsubr (&Sread_coding_system);
10390   defsubr (&Sread_non_nil_coding_system);
10391   defsubr (&Scheck_coding_system);
10392   defsubr (&Sdetect_coding_region);
10393   defsubr (&Sdetect_coding_string);
10394   defsubr (&Sfind_coding_systems_region_internal);
10395   defsubr (&Sunencodable_char_position);
10396   defsubr (&Scheck_coding_systems_region);
10397   defsubr (&Sdecode_coding_region);
10398   defsubr (&Sencode_coding_region);
10399   defsubr (&Sdecode_coding_string);
10400   defsubr (&Sencode_coding_string);
10401   defsubr (&Sdecode_sjis_char);
10402   defsubr (&Sencode_sjis_char);
10403   defsubr (&Sdecode_big5_char);
10404   defsubr (&Sencode_big5_char);
10405   defsubr (&Sset_terminal_coding_system_internal);
10406   defsubr (&Sset_safe_terminal_coding_system_internal);
10407   defsubr (&Sterminal_coding_system);
10408   defsubr (&Sset_keyboard_coding_system_internal);
10409   defsubr (&Skeyboard_coding_system);
10410   defsubr (&Sfind_operation_coding_system);
10411   defsubr (&Sset_coding_system_priority);
10412   defsubr (&Sdefine_coding_system_internal);
10413   defsubr (&Sdefine_coding_system_alias);
10414   defsubr (&Scoding_system_put);
10415   defsubr (&Scoding_system_base);
10416   defsubr (&Scoding_system_plist);
10417   defsubr (&Scoding_system_aliases);
10418   defsubr (&Scoding_system_eol_type);
10419   defsubr (&Scoding_system_priority_list);
10420
10421   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10422                doc: /* List of coding systems.
10423
10424 Do not alter the value of this variable manually.  This variable should be
10425 updated by the functions `define-coding-system' and
10426 `define-coding-system-alias'.  */);
10427   Vcoding_system_list = Qnil;
10428
10429   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10430                doc: /* Alist of coding system names.
10431 Each element is a one-element list containing a coding system name.
10432 This variable is given to `completing-read' as COLLECTION argument.
10433
10434 Do not alter the value of this variable manually.  This variable should be
10435 updated by the functions `make-coding-system' and
10436 `define-coding-system-alias'.  */);
10437   Vcoding_system_alist = Qnil;
10438
10439   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10440                doc: /* List of coding-categories (symbols) ordered by priority.
10441
10442 On detecting a coding system, Emacs tries code detection algorithms
10443 associated with each coding-category one by one in this order.  When
10444 one algorithm agrees with a byte sequence of source text, the coding
10445 system bound to the corresponding coding-category is selected.
10446
10447 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10448   {
10449     int i;
10450
10451     Vcoding_category_list = Qnil;
10452     for (i = coding_category_max - 1; i >= 0; i--)
10453       Vcoding_category_list
10454         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10455                  Vcoding_category_list);
10456   }
10457
10458   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10459                doc: /* Specify the coding system for read operations.
10460 It is useful to bind this variable with `let', but do not set it globally.
10461 If the value is a coding system, it is used for decoding on read operation.
10462 If not, an appropriate element is used from one of the coding system alists.
10463 There are three such tables: `file-coding-system-alist',
10464 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10465   Vcoding_system_for_read = Qnil;
10466
10467   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10468                doc: /* Specify the coding system for write operations.
10469 Programs bind this variable with `let', but you should not set it globally.
10470 If the value is a coding system, it is used for encoding of output,
10471 when writing it to a file and when sending it to a file or subprocess.
10472
10473 If this does not specify a coding system, an appropriate element
10474 is used from one of the coding system alists.
10475 There are three such tables: `file-coding-system-alist',
10476 `process-coding-system-alist', and `network-coding-system-alist'.
10477 For output to files, if the above procedure does not specify a coding system,
10478 the value of `buffer-file-coding-system' is used.  */);
10479   Vcoding_system_for_write = Qnil;
10480
10481   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10482                doc: /*
10483 Coding system used in the latest file or process I/O.  */);
10484   Vlast_coding_system_used = Qnil;
10485
10486   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10487                doc: /*
10488 Error status of the last code conversion.
10489
10490 When an error was detected in the last code conversion, this variable
10491 is set to one of the following symbols.
10492   `insufficient-source'
10493   `inconsistent-eol'
10494   `invalid-source'
10495   `interrupted'
10496   `insufficient-memory'
10497 When no error was detected, the value doesn't change.  So, to check
10498 the error status of a code conversion by this variable, you must
10499 explicitly set this variable to nil before performing code
10500 conversion.  */);
10501   Vlast_code_conversion_error = Qnil;
10502
10503   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10504                doc: /*
10505 *Non-nil means always inhibit code conversion of end-of-line format.
10506 See info node `Coding Systems' and info node `Text and Binary' concerning
10507 such conversion.  */);
10508   inhibit_eol_conversion = 0;
10509
10510   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10511                doc: /*
10512 Non-nil means process buffer inherits coding system of process output.
10513 Bind it to t if the process output is to be treated as if it were a file
10514 read from some filesystem.  */);
10515   inherit_process_coding_system = 0;
10516
10517   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10518                doc: /*
10519 Alist to decide a coding system to use for a file I/O operation.
10520 The format is ((PATTERN . VAL) ...),
10521 where PATTERN is a regular expression matching a file name,
10522 VAL is a coding system, a cons of coding systems, or a function symbol.
10523 If VAL is a coding system, it is used for both decoding and encoding
10524 the file contents.
10525 If VAL is a cons of coding systems, the car part is used for decoding,
10526 and the cdr part is used for encoding.
10527 If VAL is a function symbol, the function must return a coding system
10528 or a cons of coding systems which are used as above.  The function is
10529 called with an argument that is a list of the arguments with which
10530 `find-operation-coding-system' was called.  If the function can't decide
10531 a coding system, it can return `undecided' so that the normal
10532 code-detection is performed.
10533
10534 See also the function `find-operation-coding-system'
10535 and the variable `auto-coding-alist'.  */);
10536   Vfile_coding_system_alist = Qnil;
10537
10538   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10539                doc: /*
10540 Alist to decide a coding system to use for a process I/O operation.
10541 The format is ((PATTERN . VAL) ...),
10542 where PATTERN is a regular expression matching a program name,
10543 VAL is a coding system, a cons of coding systems, or a function symbol.
10544 If VAL is a coding system, it is used for both decoding what is received
10545 from the program and encoding what is sent to the program.
10546 If VAL is a cons of coding systems, the car part is used for decoding,
10547 and the cdr part is used for encoding.
10548 If VAL is a function symbol, the function must return a coding system
10549 or a cons of coding systems which are used as above.
10550
10551 See also the function `find-operation-coding-system'.  */);
10552   Vprocess_coding_system_alist = Qnil;
10553
10554   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10555                doc: /*
10556 Alist to decide a coding system to use for a network I/O operation.
10557 The format is ((PATTERN . VAL) ...),
10558 where PATTERN is a regular expression matching a network service name
10559 or is a port number to connect to,
10560 VAL is a coding system, a cons of coding systems, or a function symbol.
10561 If VAL is a coding system, it is used for both decoding what is received
10562 from the network stream and encoding what is sent to the network stream.
10563 If VAL is a cons of coding systems, the car part is used for decoding,
10564 and the cdr part is used for encoding.
10565 If VAL is a function symbol, the function must return a coding system
10566 or a cons of coding systems which are used as above.
10567
10568 See also the function `find-operation-coding-system'.  */);
10569   Vnetwork_coding_system_alist = Qnil;
10570
10571   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10572                doc: /* Coding system to use with system messages.
10573 Also used for decoding keyboard input on X Window system.  */);
10574   Vlocale_coding_system = Qnil;
10575
10576   /* The eol mnemonics are reset in startup.el system-dependently.  */
10577   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10578                doc: /*
10579 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10580   eol_mnemonic_unix = make_pure_c_string (":");
10581
10582   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10583                doc: /*
10584 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10585   eol_mnemonic_dos = make_pure_c_string ("\\");
10586
10587   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10588                doc: /*
10589 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10590   eol_mnemonic_mac = make_pure_c_string ("/");
10591
10592   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10593                doc: /*
10594 *String displayed in mode line when end-of-line format is not yet determined.  */);
10595   eol_mnemonic_undecided = make_pure_c_string (":");
10596
10597   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10598                doc: /*
10599 *Non-nil enables character translation while encoding and decoding.  */);
10600   Venable_character_translation = Qt;
10601
10602   DEFVAR_LISP ("standard-translation-table-for-decode",
10603                Vstandard_translation_table_for_decode,
10604                doc: /* Table for translating characters while decoding.  */);
10605   Vstandard_translation_table_for_decode = Qnil;
10606
10607   DEFVAR_LISP ("standard-translation-table-for-encode",
10608                Vstandard_translation_table_for_encode,
10609                doc: /* Table for translating characters while encoding.  */);
10610   Vstandard_translation_table_for_encode = Qnil;
10611
10612   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10613                doc: /* Alist of charsets vs revision numbers.
10614 While encoding, if a charset (car part of an element) is found,
10615 designate it with the escape sequence identifying revision (cdr part
10616 of the element).  */);
10617   Vcharset_revision_table = Qnil;
10618
10619   DEFVAR_LISP ("default-process-coding-system",
10620                Vdefault_process_coding_system,
10621                doc: /* Cons of coding systems used for process I/O by default.
10622 The car part is used for decoding a process output,
10623 the cdr part is used for encoding a text to be sent to a process.  */);
10624   Vdefault_process_coding_system = Qnil;
10625
10626   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10627                doc: /*
10628 Table of extra Latin codes in the range 128..159 (inclusive).
10629 This is a vector of length 256.
10630 If Nth element is non-nil, the existence of code N in a file
10631 \(or output of subprocess) doesn't prevent it from being detected as
10632 a coding system of ISO 2022 variant which has a flag
10633 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10634 or reading output of a subprocess.
10635 Only 128th through 159th elements have a meaning.  */);
10636   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10637
10638   DEFVAR_LISP ("select-safe-coding-system-function",
10639                Vselect_safe_coding_system_function,
10640                doc: /*
10641 Function to call to select safe coding system for encoding a text.
10642
10643 If set, this function is called to force a user to select a proper
10644 coding system which can encode the text in the case that a default
10645 coding system used in each operation can't encode the text.  The
10646 function should take care that the buffer is not modified while
10647 the coding system is being selected.
10648
10649 The default value is `select-safe-coding-system' (which see).  */);
10650   Vselect_safe_coding_system_function = Qnil;
10651
10652   DEFVAR_BOOL ("coding-system-require-warning",
10653                coding_system_require_warning,
10654                doc: /* Internal use only.
10655 If non-nil, on writing a file, `select-safe-coding-system-function' is
10656 called even if `coding-system-for-write' is non-nil.  The command
10657 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10658   coding_system_require_warning = 0;
10659
10660
10661   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10662                inhibit_iso_escape_detection,
10663                doc: /*
10664 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10665
10666 When Emacs reads text, it tries to detect how the text is encoded.
10667 This code detection is sensitive to escape sequences.  If Emacs sees
10668 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10669 of the ISO2022 encodings, and decodes text by the corresponding coding
10670 system (e.g. `iso-2022-7bit').
10671
10672 However, there may be a case that you want to read escape sequences in
10673 a file as is.  In such a case, you can set this variable to non-nil.
10674 Then the code detection will ignore any escape sequences, and no text is
10675 detected as encoded in some ISO-2022 encoding.  The result is that all
10676 escape sequences become visible in a buffer.
10677
10678 The default value is nil, and it is strongly recommended not to change
10679 it.  That is because many Emacs Lisp source files that contain
10680 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10681 in Emacs's distribution, and they won't be decoded correctly on
10682 reading if you suppress escape sequence detection.
10683
10684 The other way to read escape sequences in a file without decoding is
10685 to explicitly specify some coding system that doesn't use ISO-2022
10686 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
10687   inhibit_iso_escape_detection = 0;
10688
10689   DEFVAR_BOOL ("inhibit-null-byte-detection",
10690                inhibit_null_byte_detection,
10691                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10692 By default, Emacs treats it as binary data, and does not attempt to
10693 decode it.  The effect is as if you specified `no-conversion' for
10694 reading that text.
10695
10696 Set this to non-nil when a regular text happens to include null bytes.
10697 Examples are Index nodes of Info files and null-byte delimited output
10698 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10699 decode text as usual.  */);
10700   inhibit_null_byte_detection = 0;
10701
10702   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10703                doc: /* Char table for translating self-inserting characters.
10704 This is applied to the result of input methods, not their input.
10705 See also `keyboard-translate-table'.
10706
10707 Use of this variable for character code unification was rendered
10708 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10709 internal character representation.  */);
10710     Vtranslation_table_for_input = Qnil;
10711
10712   {
10713     Lisp_Object args[coding_arg_max];
10714     Lisp_Object plist[16];
10715     int i;
10716
10717     for (i = 0; i < coding_arg_max; i++)
10718       args[i] = Qnil;
10719
10720     plist[0] = intern_c_string (":name");
10721     plist[1] = args[coding_arg_name] = Qno_conversion;
10722     plist[2] = intern_c_string (":mnemonic");
10723     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10724     plist[4] = intern_c_string (":coding-type");
10725     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10726     plist[6] = intern_c_string (":ascii-compatible-p");
10727     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10728     plist[8] = intern_c_string (":default-char");
10729     plist[9] = args[coding_arg_default_char] = make_number (0);
10730     plist[10] = intern_c_string (":for-unibyte");
10731     plist[11] = args[coding_arg_for_unibyte] = Qt;
10732     plist[12] = intern_c_string (":docstring");
10733     plist[13] = make_pure_c_string ("Do no conversion.\n\
10734 \n\
10735 When you visit a file with this coding, the file is read into a\n\
10736 unibyte buffer as is, thus each byte of a file is treated as a\n\
10737 character.");
10738     plist[14] = intern_c_string (":eol-type");
10739     plist[15] = args[coding_arg_eol_type] = Qunix;
10740     args[coding_arg_plist] = Flist (16, plist);
10741     Fdefine_coding_system_internal (coding_arg_max, args);
10742
10743     plist[1] = args[coding_arg_name] = Qundecided;
10744     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10745     plist[5] = args[coding_arg_coding_type] = Qundecided;
10746     /* This is already set.
10747        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10748     plist[8] = intern_c_string (":charset-list");
10749     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10750     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10751     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10752     plist[15] = args[coding_arg_eol_type] = Qnil;
10753     args[coding_arg_plist] = Flist (16, plist);
10754     Fdefine_coding_system_internal (coding_arg_max, args);
10755   }
10756
10757   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10758
10759   {
10760     int i;
10761
10762     for (i = 0; i < coding_category_max; i++)
10763       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10764   }
10765 #if defined (DOS_NT)
10766   system_eol_type = Qdos;
10767 #else
10768   system_eol_type = Qunix;
10769 #endif
10770   staticpro (&system_eol_type);
10771 }
10772
10773 char *
10774 emacs_strerror (int error_number)
10775 {
10776   char *str;
10777
10778   synchronize_system_messages_locale ();
10779   str = strerror (error_number);
10780
10781   if (! NILP (Vlocale_coding_system))
10782     {
10783       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10784                                                       Vlocale_coding_system,
10785                                                       0);
10786       str = SSDATA (dec);
10787     }
10788
10789   return str;
10790 }
10791
10792 #endif /* emacs */