src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2014 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from coding system symbol to attributes vector is done by looking up
   62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
  193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  254   DST_BYTES zero means that source area and destination area are
  255   overlapped, which means that we can produce encoded text only until
  256   it reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    followings.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 5 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 497
 498 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 499
 500 /* A character to be produced on output if encoding of the original
 501    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 502 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 503
 504 /* UTF-8 section */
 505 #define CODING_UTF_8_BOM(coding)        \
 506   ((coding)->spec.utf_8_bom)
 507
 508 /* UTF-16 section */
 509 #define CODING_UTF_16_BOM(coding)       \
 510   ((coding)->spec.utf_16.bom)
 511
 512 #define CODING_UTF_16_ENDIAN(coding)    \
 513   ((coding)->spec.utf_16.endian)
 514
 515 #define CODING_UTF_16_SURROGATE(coding) \
 516   ((coding)->spec.utf_16.surrogate)
 517
 518
 519 /* CCL section */
 520 #define CODING_CCL_DECODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 522 #define CODING_CCL_ENCODER(coding)      \
 523   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 524 #define CODING_CCL_VALIDS(coding)                                          \
 525   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 526
 527 /* Index for each coding category in `coding_categories' */
 528
 529 enum coding_category
 530   {
 531     coding_category_iso_7,
 532     coding_category_iso_7_tight,
 533     coding_category_iso_8_1,
 534     coding_category_iso_8_2,
 535     coding_category_iso_7_else,
 536     coding_category_iso_8_else,
 537     coding_category_utf_8_auto,
 538     coding_category_utf_8_nosig,
 539     coding_category_utf_8_sig,
 540     coding_category_utf_16_auto,
 541     coding_category_utf_16_be,
 542     coding_category_utf_16_le,
 543     coding_category_utf_16_be_nosig,
 544     coding_category_utf_16_le_nosig,
 545     coding_category_charset,
 546     coding_category_sjis,
 547     coding_category_big5,
 548     coding_category_ccl,
 549     coding_category_emacs_mule,
 550     /* All above are targets of code detection.  */
 551     coding_category_raw_text,
 552     coding_category_undecided,
 553     coding_category_max
 554   };
 555
 556 /* Definitions of flag bits used in detect_coding_XXXX.  */
 557 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 558 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 559 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 560 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 561 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 562 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 563 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 564 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 565 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 566 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 567 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 568 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 569 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 570 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 571 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 572 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 573 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 574 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 575 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 576 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 577
 578 /* This value is returned if detect_coding_mask () find nothing other
 579    than ASCII characters.  */
 580 #define CATEGORY_MASK_ANY               \
 581   (CATEGORY_MASK_ISO_7                  \
 582    | CATEGORY_MASK_ISO_7_TIGHT          \
 583    | CATEGORY_MASK_ISO_8_1              \
 584    | CATEGORY_MASK_ISO_8_2              \
 585    | CATEGORY_MASK_ISO_7_ELSE           \
 586    | CATEGORY_MASK_ISO_8_ELSE           \
 587    | CATEGORY_MASK_UTF_8_AUTO           \
 588    | CATEGORY_MASK_UTF_8_NOSIG          \
 589    | CATEGORY_MASK_UTF_8_SIG            \
 590    | CATEGORY_MASK_UTF_16_AUTO          \
 591    | CATEGORY_MASK_UTF_16_BE            \
 592    | CATEGORY_MASK_UTF_16_LE            \
 593    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 594    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 595    | CATEGORY_MASK_CHARSET              \
 596    | CATEGORY_MASK_SJIS                 \
 597    | CATEGORY_MASK_BIG5                 \
 598    | CATEGORY_MASK_CCL                  \
 599    | CATEGORY_MASK_EMACS_MULE)
 600
 601
 602 #define CATEGORY_MASK_ISO_7BIT \
 603   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 604
 605 #define CATEGORY_MASK_ISO_8BIT \
 606   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 607
 608 #define CATEGORY_MASK_ISO_ELSE \
 609   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 610
 611 #define CATEGORY_MASK_ISO_ESCAPE        \
 612   (CATEGORY_MASK_ISO_7                  \
 613    | CATEGORY_MASK_ISO_7_TIGHT          \
 614    | CATEGORY_MASK_ISO_7_ELSE           \
 615    | CATEGORY_MASK_ISO_8_ELSE)
 616
 617 #define CATEGORY_MASK_ISO       \
 618   (  CATEGORY_MASK_ISO_7BIT     \
 619      | CATEGORY_MASK_ISO_8BIT   \
 620      | CATEGORY_MASK_ISO_ELSE)
 621
 622 #define CATEGORY_MASK_UTF_16            \
 623   (CATEGORY_MASK_UTF_16_AUTO            \
 624    | CATEGORY_MASK_UTF_16_BE            \
 625    | CATEGORY_MASK_UTF_16_LE            \
 626    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 627    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 628
 629 #define CATEGORY_MASK_UTF_8     \
 630   (CATEGORY_MASK_UTF_8_AUTO     \
 631    | CATEGORY_MASK_UTF_8_NOSIG  \
 632    | CATEGORY_MASK_UTF_8_SIG)
 633
  634 /* Table of coding categories (Lisp symbols).  This variable is for
  635    internal use only.  */
  636 static Lisp_Object Vcoding_category_table;
  637
  638 /* Coding categories ordered by priority; coding_category_max slots.  */
  639 static enum coding_category coding_priorities[coding_category_max];
  640
  641 /* Nth element is a coding context (a struct coding_system) for the
  642    coding system bound to the Nth coding category.  */
  643 static struct coding_system coding_categories[coding_category_max];
 644
 645 /*** Commonly used macros and functions ***/
 646
 647 #ifndef min
 648 #define min(a, b) ((a) < (b) ? (a) : (b))
 649 #endif
 650 #ifndef max
 651 #define max(a, b) ((a) > (b) ? (a) : (b))
 652 #endif
 653
 654 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 655
 656 static int
 657 encode_inhibit_flag (Lisp_Object flag)
 658 {
 659   return NILP (flag) ? -1 : EQ (flag, Qt);
 660 }
 661
 662 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 663    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 664
 665 static bool
 666 inhibit_flag (int encoded_flag, bool var)
 667 {
 668   return 0 < encoded_flag + var;
 669 }
 670
 671 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 672   do {                                                  \
 673     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 674     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 675   } while (0)
 676
 677 static void
 678 CHECK_NATNUM_CAR (Lisp_Object x)
 679 {
 680   Lisp_Object tmp = XCAR (x);
 681   CHECK_NATNUM (tmp);
 682   XSETCAR (x, tmp);
 683 }
 684
 685 static void
 686 CHECK_NATNUM_CDR (Lisp_Object x)
 687 {
 688   Lisp_Object tmp = XCDR (x);
 689   CHECK_NATNUM (tmp);
 690   XSETCDR (x, tmp);
 691 }
 692
 693
 694 /* Safely get one byte from the source text pointed by SRC which ends
 695    at SRC_END, and set C to that byte.  If there are not enough bytes
 696    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 697    and a multibyte character is found at SRC, set C to the
 698    negative value of the character code.  The caller should declare
 699    and set these variables appropriately in advance:
 700         src, src_end, multibytep */
 701
 702 #define ONE_MORE_BYTE(c)                                \
 703   do {                                                  \
 704     if (src == src_end)                                 \
 705       {                                                 \
 706         if (src_base < src)                             \
 707           record_conversion_result                      \
 708             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 709         goto no_more_source;                            \
 710       }                                                 \
 711     c = *src++;                                         \
 712     if (multibytep && (c & 0x80))                       \
 713       {                                                 \
 714         if ((c & 0xFE) == 0xC0)                         \
 715           c = ((c & 1) << 6) | *src++;                  \
 716         else                                            \
 717           {                                             \
 718             src--;                                      \
 719             c = - string_char (src, &src, NULL);        \
 720             record_conversion_result                    \
 721               (coding, CODING_RESULT_INVALID_SRC);      \
 722           }                                             \
 723       }                                                 \
 724     consumed_chars++;                                   \
 725   } while (0)
 726
 727 /* Safely get two bytes from the source text pointed by SRC which ends
 728    at SRC_END, and set C1 and C2 to those bytes while skipping the
 729    heading multibyte characters.  If there are not enough bytes in the
 730    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 731    a multibyte character is found for C2, set C2 to the negative value
 732    of the character code.  The caller should declare and set these
 733    variables appropriately in advance:
 734         src, src_end, multibytep
 735    It is intended that this macro is used in detect_coding_utf_16.  */
 736
 737 #define TWO_MORE_BYTES(c1, c2)                          \
 738   do {                                                  \
 739     do {                                                \
 740       if (src == src_end)                               \
 741         goto no_more_source;                            \
 742       c1 = *src++;                                      \
 743       if (multibytep && (c1 & 0x80))                    \
 744         {                                               \
 745           if ((c1 & 0xFE) == 0xC0)                      \
 746             c1 = ((c1 & 1) << 6) | *src++;              \
 747           else                                          \
 748             {                                           \
 749               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 750               c1 = -1;                                  \
 751             }                                           \
 752         }                                               \
 753     } while (c1 < 0);                                   \
 754     if (src == src_end)                                 \
 755       goto no_more_source;                              \
 756     c2 = *src++;                                        \
 757     if (multibytep && (c2 & 0x80))                      \
 758       {                                                 \
 759         if ((c2 & 0xFE) == 0xC0)                        \
 760           c2 = ((c2 & 1) << 6) | *src++;                \
 761         else                                            \
 762           c2 = -1;                                      \
 763       }                                                 \
 764   } while (0)
 765
 766
 767 /* Store a byte C in the place pointed by DST and increment DST to the
 768    next free point, and increment PRODUCED_CHARS.  The caller should
 769    assure that C is 0..127, and declare and set the variable `dst'
 770    appropriately in advance.
 771 */
 772
 773
 774 #define EMIT_ONE_ASCII_BYTE(c)  \
 775   do {                          \
 776     produced_chars++;           \
 777     *dst++ = (c);               \
 778   } while (0)
 779
 780
 781 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 782
 783 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 784   do {                                  \
 785     produced_chars += 2;                \
 786     *dst++ = (c1), *dst++ = (c2);       \
 787   } while (0)
 788
 789
 790 /* Store a byte C in the place pointed by DST and increment DST to the
 791    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 792    store in an appropriate multibyte form.  The caller should
 793    declare and set the variables `dst' and `multibytep' appropriately
 794    in advance.  */
 795
 796 #define EMIT_ONE_BYTE(c)                \
 797   do {                                  \
 798     produced_chars++;                   \
 799     if (multibytep)                     \
 800       {                                 \
 801         unsigned ch = (c);              \
 802         if (ch >= 0x80)                 \
 803           ch = BYTE8_TO_CHAR (ch);      \
 804         CHAR_STRING_ADVANCE (ch, dst);  \
 805       }                                 \
 806     else                                \
 807       *dst++ = (c);                     \
 808   } while (0)
 809
 810
 811 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 812
 813 #define EMIT_TWO_BYTES(c1, c2)          \
 814   do {                                  \
 815     produced_chars += 2;                \
 816     if (multibytep)                     \
 817       {                                 \
 818         unsigned ch;                    \
 819                                         \
 820         ch = (c1);                      \
 821         if (ch >= 0x80)                 \
 822           ch = BYTE8_TO_CHAR (ch);      \
 823         CHAR_STRING_ADVANCE (ch, dst);  \
 824         ch = (c2);                      \
 825         if (ch >= 0x80)                 \
 826           ch = BYTE8_TO_CHAR (ch);      \
 827         CHAR_STRING_ADVANCE (ch, dst);  \
 828       }                                 \
 829     else                                \
 830       {                                 \
 831         *dst++ = (c1);                  \
 832         *dst++ = (c2);                  \
 833       }                                 \
 834   } while (0)
 835
 836
 837 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 838   do {                                  \
 839     EMIT_ONE_BYTE (c1);                 \
 840     EMIT_TWO_BYTES (c2, c3);            \
 841   } while (0)
 842
 843
 844 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 845   do {                                          \
 846     EMIT_TWO_BYTES (c1, c2);                    \
 847     EMIT_TWO_BYTES (c3, c4);                    \
 848   } while (0)
 849
 850
 851 static void
 852 record_conversion_result (struct coding_system *coding,
 853                           enum coding_result_code result)
 854 {
 855   coding->result = result;
 856   switch (result)
 857     {
 858     case CODING_RESULT_INSUFFICIENT_SRC:
 859       Vlast_code_conversion_error = Qinsufficient_source;
 860       break;
 861     case CODING_RESULT_INVALID_SRC:
 862       Vlast_code_conversion_error = Qinvalid_source;
 863       break;
 864     case CODING_RESULT_INTERRUPT:
 865       Vlast_code_conversion_error = Qinterrupted;
 866       break;
 867     case CODING_RESULT_INSUFFICIENT_DST:
 868       /* Don't record this error in Vlast_code_conversion_error
 869          because it happens just temporarily and is resolved when the
 870          whole conversion is finished.  */
 871       break;
 872     case CODING_RESULT_SUCCESS:
 873       break;
 874     default:
 875       Vlast_code_conversion_error = intern ("Unknown error");
 876     }
 877 }
 878
 879 /* These wrapper macros are used to preserve validity of pointers into
 880    buffer text across calls to decode_char, encode_char, etc, which
 881    could cause relocation of buffers if it loads a charset map,
 882    because loading a charset map allocates large structures.  */
 883
 884 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 885   do {                                                                       \
 886     ptrdiff_t offset;                                                        \
 887                                                                              \
 888     charset_map_loaded = 0;                                                  \
 889     c = DECODE_CHAR (charset, code);                                         \
 890     if (charset_map_loaded                                                   \
 891         && (offset = coding_change_source (coding)))                         \
 892       {                                                                      \
 893         src += offset;                                                       \
 894         src_base += offset;                                                  \
 895         src_end += offset;                                                   \
 896       }                                                                      \
 897   } while (0)
 898
 899 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 900   do {                                                                  \
 901     ptrdiff_t offset;                                                   \
 902                                                                         \
 903     charset_map_loaded = 0;                                             \
 904     code = ENCODE_CHAR (charset, c);                                    \
 905     if (charset_map_loaded                                              \
 906         && (offset = coding_change_destination (coding)))               \
 907       {                                                                 \
 908         dst += offset;                                                  \
 909         dst_end += offset;                                              \
 910       }                                                                 \
 911   } while (0)
 912
 913 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 914   do {                                                                  \
 915     ptrdiff_t offset;                                                   \
 916                                                                         \
 917     charset_map_loaded = 0;                                             \
 918     charset = char_charset (c, charset_list, code_return);              \
 919     if (charset_map_loaded                                              \
 920         && (offset = coding_change_destination (coding)))               \
 921       {                                                                 \
 922         dst += offset;                                                  \
 923         dst_end += offset;                                              \
 924       }                                                                 \
 925   } while (0)
 926
 927 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 928   do {                                                                  \
 929     ptrdiff_t offset;                                                   \
 930                                                                         \
 931     charset_map_loaded = 0;                                             \
 932     result = CHAR_CHARSET_P (c, charset);                               \
 933     if (charset_map_loaded                                              \
 934         && (offset = coding_change_destination (coding)))               \
 935       {                                                                 \
 936         dst += offset;                                                  \
 937         dst_end += offset;                                              \
 938       }                                                                 \
 939   } while (0)
 940
 941
 942 /* If there are at least BYTES length of room at dst, allocate memory
 943    for coding->destination and update dst and dst_end.  We don't have
 944    to take care of coding->source which will be relocated.  It is
 945    handled by calling coding_set_source in encode_coding.  */
 946
 947 #define ASSURE_DESTINATION(bytes)                               \
 948   do {                                                          \
 949     if (dst + (bytes) >= dst_end)                               \
 950       {                                                         \
 951         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 952                                                                 \
 953         dst = alloc_destination (coding, more_bytes, dst);      \
 954         dst_end = coding->destination + coding->dst_bytes;      \
 955       }                                                         \
 956   } while (0)
 957
 958
 959 /* Store multibyte form of the character C in P, and advance P to the
 960    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 961    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 962    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 963
 964 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 965
 966 /* Return the character code of character whose multibyte form is at
 967    P, and advance P to the end of the multibyte form.  This used to be
 968    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 969    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 970
 971 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 972
 973 /* Set coding->source from coding->src_object.  */
 974
 975 static void
 976 coding_set_source (struct coding_system *coding)
 977 {
 978   if (BUFFERP (coding->src_object))
 979     {
 980       struct buffer *buf = XBUFFER (coding->src_object);
 981
 982       if (coding->src_pos < 0)
 983         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 984       else
 985         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 986     }
 987   else if (STRINGP (coding->src_object))
 988     {
 989       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 990     }
 991   else
 992     {
 993       /* Otherwise, the source is C string and is never relocated
 994          automatically.  Thus we don't have to update anything.  */
 995     }
 996 }
 997
 998
 999 /* Set coding->source from coding->src_object, and return how many
1000    bytes coding->source was changed.  */
1001
1002 static ptrdiff_t
1003 coding_change_source (struct coding_system *coding)
1004 {
1005   const unsigned char *orig = coding->source;
1006   coding_set_source (coding);
1007   return coding->source - orig;
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object.  */
1012
1013 static void
1014 coding_set_destination (struct coding_system *coding)
1015 {
1016   if (BUFFERP (coding->dst_object))
1017     {
1018       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1019         {
1020           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1021           coding->dst_bytes = (GAP_END_ADDR
1022                                - (coding->src_bytes - coding->consumed)
1023                                - coding->destination);
1024         }
1025       else
1026         {
1027           /* We are sure that coding->dst_pos_byte is before the gap
1028              of the buffer. */
1029           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1030                                  + coding->dst_pos_byte - BEG_BYTE);
1031           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1032                                - coding->destination);
1033         }
1034     }
1035   else
1036     {
1037       /* Otherwise, the destination is C string and is never relocated
1038          automatically.  Thus we don't have to update anything.  */
1039     }
1040 }
1041
1042
1043 /* Set coding->destination from coding->dst_object, and return how
1044    many bytes coding->destination was changed.  */
1045
1046 static ptrdiff_t
1047 coding_change_destination (struct coding_system *coding)
1048 {
1049   const unsigned char *orig = coding->destination;
1050   coding_set_destination (coding);
1051   return coding->destination - orig;
1052 }
1053
1054
1055 static void
1056 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1057 {
1058   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1059     string_overflow ();
1060   coding->destination = xrealloc (coding->destination,
1061                                   coding->dst_bytes + bytes);
1062   coding->dst_bytes += bytes;
1063 }
1064
1065 static void
1066 coding_alloc_by_making_gap (struct coding_system *coding,
1067                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1068 {
1069   if (EQ (coding->src_object, coding->dst_object))
1070     {
1071       /* The gap may contain the produced data at the head and not-yet
1072          consumed data at the tail.  To preserve those data, we at
1073          first make the gap size to zero, then increase the gap
1074          size.  */
1075       ptrdiff_t add = GAP_SIZE;
1076
1077       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1078       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1079       make_gap (bytes);
1080       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1081       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1082     }
1083   else
1084     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1085 }
1086
1087
1088 static unsigned char *
1089 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1090                    unsigned char *dst)
1091 {
1092   ptrdiff_t offset = dst - coding->destination;
1093
1094   if (BUFFERP (coding->dst_object))
1095     {
1096       struct buffer *buf = XBUFFER (coding->dst_object);
1097
1098       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1099     }
1100   else
1101     coding_alloc_by_realloc (coding, nbytes);
1102   coding_set_destination (coding);
1103   dst = coding->destination + offset;
1104   return dst;
1105 }
1106
1107 /** Macros for annotations.  */
1108
1109 /* An annotation data is stored in the array coding->charbuf in this
1110    format:
1111      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1112    LENGTH is the number of elements in the annotation.
1113    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1114    NCHARS is the number of characters in the text annotated.
1115
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1120      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1121
1122    NBYTES is the number of bytes specified in the header part of
1123    old-style emacs-mule encoding, or 0 for the other kind of
1124    composition.
1125
1126    METHOD is one of enum composition_method.
1127
1128    Optional COMPOSITION-COMPONENTS are characters and composition
1129    rules.
1130
1131    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1132    follows.
1133
1134    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1135    recover from an invalid annotation, and should be skipped by
1136    produce_annotation.  */
1137
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three leading elements of an annotation at BUF, namely
   -LEN, MASK and NCHARS, advancing BUF past them, and set
   coding->annotated.  The caller declares and sets up `coding'
   beforehand.

   The expansion deliberately does not end in a semicolon: the
   semicolon written after the invocation completes the statement, so
   the macro also works as the body of an unbraced `if ... else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1148
/* Store a composition annotation at BUF and advance BUF past it: the
   common header written by ADD_ANNOTATION_DATA with a length of 5 and
   CODING_ANNOTATE_COMPOSITION_MASK, followed by NBYTES and METHOD.
   Every argument is parenthesized in the expansion so that arbitrary
   expressions can be passed safely.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1155
1156
/* Store a charset annotation at BUF and advance BUF past it: the
   common header written by ADD_ANNOTATION_DATA with a length of 4 and
   CODING_ANNOTATE_CHARSET_MASK, followed by the charset ID.  Every
   argument is parenthesized in the expansion so that arbitrary
   expressions can be passed safely.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1162
1163
/* Bit flags accumulated in coding->eol_seen, one per end-of-line
   format; EOL_SEEN_NONE is the empty set.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1170
1171 \f
1172 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1173
1174
1175
1176 \f
1177 /*** 3. UTF-8 ***/
1178
1179 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1180    Return true if a text is encoded in UTF-8.  */
1181
/* Classify the octet C by its high bits.  */

/* C is below 0x80, i.e. a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the form 10xxxxxx: a trailing octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the form 110xxxxx: leads a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the form 1110xxxx: leads a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the form 11110xxx: leads a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the form 111110xx: leads a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 byte order mark (U+FEFF).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1192
1193 /* Unlike the other detect_coding_XXX, this function counts number of
1194    characters and check EOL format.  */
1195
1196 static bool
1197 detect_coding_utf_8 (struct coding_system *coding,
1198                      struct coding_detection_info *detect_info)
1199 {
1200   const unsigned char *src = coding->source, *src_base;
1201   const unsigned char *src_end = coding->source + coding->src_bytes;
1202   bool multibytep = coding->src_multibyte;
1203   ptrdiff_t consumed_chars = 0;
1204   bool bom_found = 0;
1205   ptrdiff_t nchars = coding->head_ascii;
1206   int eol_seen = coding->eol_seen;
1207
1208   detect_info->checked |= CATEGORY_MASK_UTF_8;
1209   /* A coding system of this category is always ASCII compatible.  */
1210   src += nchars;
1211
1212   if (src == coding->source     /* BOM should be at the head.  */
1213       && src + 3 < src_end      /* BOM is 3-byte long.  */
1214       && src[0] == UTF_8_BOM_1
1215       && src[1] == UTF_8_BOM_2
1216       && src[2] == UTF_8_BOM_3)
1217     {
1218       bom_found = 1;
1219       src += 3;
1220       nchars++;
1221     }
1222
1223   while (1)
1224     {
1225       int c, c1, c2, c3, c4;
1226
1227       src_base = src;
1228       ONE_MORE_BYTE (c);
1229       if (c < 0 || UTF_8_1_OCTET_P (c))
1230         {
1231           nchars++;
1232           if (c == '\r')
1233             {
1234               if (src < src_end && *src == '\n')
1235                 {
1236                   eol_seen |= EOL_SEEN_CRLF;
1237                   src++;
1238                   nchars++;
1239                 }
1240               else
1241                 eol_seen |= EOL_SEEN_CR;
1242             }
1243           else if (c == '\n')
1244             eol_seen |= EOL_SEEN_LF;
1245           continue;
1246         }
1247       ONE_MORE_BYTE (c1);
1248       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1249         break;
1250       if (UTF_8_2_OCTET_LEADING_P (c))
1251         {
1252           nchars++;
1253           continue;
1254         }
1255       ONE_MORE_BYTE (c2);
1256       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1257         break;
1258       if (UTF_8_3_OCTET_LEADING_P (c))
1259         {
1260           nchars++;
1261           continue;
1262         }
1263       ONE_MORE_BYTE (c3);
1264       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1265         break;
1266       if (UTF_8_4_OCTET_LEADING_P (c))
1267         {
1268           nchars++;
1269           continue;
1270         }
1271       ONE_MORE_BYTE (c4);
1272       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1273         break;
1274       if (UTF_8_5_OCTET_LEADING_P (c))
1275         {
1276           nchars++;
1277           continue;
1278         }
1279       break;
1280     }
1281   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1282   return 0;
1283
1284  no_more_source:
1285   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1286     {
1287       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1288       return 0;
1289     }
1290   if (bom_found)
1291     {
1292       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1293       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1294     }
1295   else
1296     {
1297       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1298       if (nchars < src_end - coding->source)
1299         /* The found characters are less than source bytes, which
1300            means that we found a valid non-ASCII characters.  */
1301         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1302     }
1303   coding->detected_utf8_bytes = src_base - coding->source;
1304   coding->detected_utf8_chars = nchars;
1305   return 1;
1306 }
1307
1308
1309 static void
1310 decode_coding_utf_8 (struct coding_system *coding)
1311 {
1312   const unsigned char *src = coding->source + coding->consumed;
1313   const unsigned char *src_end = coding->source + coding->src_bytes;
1314   const unsigned char *src_base;
1315   int *charbuf = coding->charbuf + coding->charbuf_used;
1316   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1317   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1318   bool multibytep = coding->src_multibyte;
1319   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1320   bool eol_dos
1321     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1322   int byte_after_cr = -1;
1323
1324   if (bom != utf_without_bom)
1325     {
1326       int c1, c2, c3;
1327
1328       src_base = src;
1329       ONE_MORE_BYTE (c1);
1330       if (! UTF_8_3_OCTET_LEADING_P (c1))
1331         src = src_base;
1332       else
1333         {
1334           ONE_MORE_BYTE (c2);
1335           if (! UTF_8_EXTRA_OCTET_P (c2))
1336             src = src_base;
1337           else
1338             {
1339               ONE_MORE_BYTE (c3);
1340               if (! UTF_8_EXTRA_OCTET_P (c3))
1341                 src = src_base;
1342               else
1343                 {
1344                   if ((c1 != UTF_8_BOM_1)
1345                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1346                     src = src_base;
1347                   else
1348                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1349                 }
1350             }
1351         }
1352     }
1353   CODING_UTF_8_BOM (coding) = utf_without_bom;
1354
1355   while (1)
1356     {
1357       int c, c1, c2, c3, c4, c5;
1358
1359       src_base = src;
1360       consumed_chars_base = consumed_chars;
1361
1362       if (charbuf >= charbuf_end)
1363         {
1364           if (byte_after_cr >= 0)
1365             src_base--;
1366           break;
1367         }
1368
1369       /* In the simple case, rapidly handle ordinary characters */
1370       if (multibytep && ! eol_dos
1371           && charbuf < charbuf_end - 6 && src < src_end - 6)
1372         {
1373           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1374             {
1375               c1 = *src;
1376               if (c1 & 0x80)
1377                 break;
1378               src++;
1379               consumed_chars++;
1380               *charbuf++ = c1;
1381
1382               c1 = *src;
1383               if (c1 & 0x80)
1384                 break;
1385               src++;
1386               consumed_chars++;
1387               *charbuf++ = c1;
1388
1389               c1 = *src;
1390               if (c1 & 0x80)
1391                 break;
1392               src++;
1393               consumed_chars++;
1394               *charbuf++ = c1;
1395
1396               c1 = *src;
1397               if (c1 & 0x80)
1398                 break;
1399               src++;
1400               consumed_chars++;
1401               *charbuf++ = c1;
1402             }
1403           /* If we handled at least one character, restart the main loop.  */
1404           if (src != src_base)
1405             continue;
1406         }
1407
1408       if (byte_after_cr >= 0)
1409         c1 = byte_after_cr, byte_after_cr = -1;
1410       else
1411         ONE_MORE_BYTE (c1);
1412       if (c1 < 0)
1413         {
1414           c = - c1;
1415         }
1416       else if (UTF_8_1_OCTET_P (c1))
1417         {
1418           if (eol_dos && c1 == '\r')
1419             ONE_MORE_BYTE (byte_after_cr);
1420           c = c1;
1421         }
1422       else
1423         {
1424           ONE_MORE_BYTE (c2);
1425           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1426             goto invalid_code;
1427           if (UTF_8_2_OCTET_LEADING_P (c1))
1428             {
1429               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1430               /* Reject overlong sequences here and below.  Encoders
1431                  producing them are incorrect, they can be misleading,
1432                  and they mess up read/write invariance.  */
1433               if (c < 128)
1434                 goto invalid_code;
1435             }
1436           else
1437             {
1438               ONE_MORE_BYTE (c3);
1439               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1440                 goto invalid_code;
1441               if (UTF_8_3_OCTET_LEADING_P (c1))
1442                 {
1443                   c = (((c1 & 0xF) << 12)
1444                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1445                   if (c < 0x800
1446                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1447                     goto invalid_code;
1448                 }
1449               else
1450                 {
1451                   ONE_MORE_BYTE (c4);
1452                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1453                     goto invalid_code;
1454                   if (UTF_8_4_OCTET_LEADING_P (c1))
1455                     {
1456                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1457                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1458                     if (c < 0x10000)
1459                       goto invalid_code;
1460                     }
1461                   else
1462                     {
1463                       ONE_MORE_BYTE (c5);
1464                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1465                         goto invalid_code;
1466                       if (UTF_8_5_OCTET_LEADING_P (c1))
1467                         {
1468                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1469                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1470                                | (c5 & 0x3F));
1471                           if ((c > MAX_CHAR) || (c < 0x200000))
1472                             goto invalid_code;
1473                         }
1474                       else
1475                         goto invalid_code;
1476                     }
1477                 }
1478             }
1479         }
1480
1481       *charbuf++ = c;
1482       continue;
1483
1484     invalid_code:
1485       src = src_base;
1486       consumed_chars = consumed_chars_base;
1487       ONE_MORE_BYTE (c);
1488       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1489       coding->errors++;
1490     }
1491
1492  no_more_source:
1493   coding->consumed_char += consumed_chars_base;
1494   coding->consumed = src_base - coding->source;
1495   coding->charbuf_used = charbuf - coding->charbuf;
1496 }
1497
1498
1499 static bool
1500 encode_coding_utf_8 (struct coding_system *coding)
1501 {
1502   bool multibytep = coding->dst_multibyte;
1503   int *charbuf = coding->charbuf;
1504   int *charbuf_end = charbuf + coding->charbuf_used;
1505   unsigned char *dst = coding->destination + coding->produced;
1506   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1507   ptrdiff_t produced_chars = 0;
1508   int c;
1509
1510   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1511     {
1512       ASSURE_DESTINATION (3);
1513       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1514       CODING_UTF_8_BOM (coding) = utf_without_bom;
1515     }
1516
1517   if (multibytep)
1518     {
1519       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1520
1521       while (charbuf < charbuf_end)
1522         {
1523           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1524
1525           ASSURE_DESTINATION (safe_room);
1526           c = *charbuf++;
1527           if (CHAR_BYTE8_P (c))
1528             {
1529               c = CHAR_TO_BYTE8 (c);
1530               EMIT_ONE_BYTE (c);
1531             }
1532           else
1533             {
1534               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1535               for (p = str; p < pend; p++)
1536                 EMIT_ONE_BYTE (*p);
1537             }
1538         }
1539     }
1540   else
1541     {
1542       int safe_room = MAX_MULTIBYTE_LENGTH;
1543
1544       while (charbuf < charbuf_end)
1545         {
1546           ASSURE_DESTINATION (safe_room);
1547           c = *charbuf++;
1548           if (CHAR_BYTE8_P (c))
1549             *dst++ = CHAR_TO_BYTE8 (c);
1550           else
1551             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1552         }
1553       produced_chars = dst - (coding->destination + coding->produced);
1554     }
1555   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1556   coding->produced_char += produced_chars;
1557   coding->produced = dst - coding->destination;
1558   return 0;
1559 }
1560
1561
1562 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1563    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1564
/* True if VAL, masked with 0xFC00, equals 0xD800, i.e. if its low 16
   bits lie in 0xD800..0xDBFF (the high surrogates).  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if VAL, masked with 0xFC00, equals 0xDC00, i.e. if its low 16
   bits lie in 0xDC00..0xDFFF (the low surrogates).  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1570
1571
     /* Detector for the UTF-16 category.  Marks CATEGORY_MASK_UTF_16 as
        checked in DETECT_INFO and classifies the source of CODING:

        - In the last block with an odd coding->src_chars, every UTF-16
          category is rejected.
        - A leading FF FE records the LE and AUTO categories as found; a
          leading FE FF records BE and AUTO.  In both cases the opposite
          signed variant and both NOSIG variants are rejected.
        - Otherwise AUTO, BE and LE are rejected and the byte pairs are
          scanned: 128 or more distinct values among the first bytes of
          the pairs reject BE_NOSIG, and 128 or more distinct values
          among the second bytes reject LE_NOSIG.

        Returns 1 only through the `no_more_source' label, which is
        reached by falling out of a signature match and (presumably) by
        TWO_MORE_BYTES when the source is exhausted; that macro is
        defined outside this chunk, so confirm there.  Returns 0
        otherwise.  */
1572 static bool
1573 detect_coding_utf_16 (struct coding_system *coding,
1574                       struct coding_detection_info *detect_info)
1575 {
       /* src, src_end and multibytep are not referenced again in the
          visible code; presumably they are consumed by TWO_MORE_BYTES
          (TODO confirm against the macro definition).  */
1576   const unsigned char *src = coding->source;
1577   const unsigned char *src_end = coding->source + coding->src_bytes;
1578   bool multibytep = coding->src_multibyte;
1579   int c1, c2;
1580
1581   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* An odd src_chars in the final block rejects UTF-16 outright.  */
1582   if (coding->mode & CODING_MODE_LAST_BLOCK
1583       && (coding->src_chars & 1))
1584     {
1585       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1586       return 0;
1587     }
1588
1589   TWO_MORE_BYTES (c1, c2);
       /* FF FE: little-endian signature.  */
1590   if ((c1 == 0xFF) && (c2 == 0xFE))
1591     {
1592       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1593                              | CATEGORY_MASK_UTF_16_AUTO);
1594       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1595                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1596                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1597     }
       /* FE FF: big-endian signature.  */
1598   else if ((c1 == 0xFE) && (c2 == 0xFF))
1599     {
1600       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1601                              | CATEGORY_MASK_UTF_16_AUTO);
1602       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1603                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1604                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1605     }
       /* A negative C2 rejects every UTF-16 category.  NOTE(review):
          presumably this is how TWO_MORE_BYTES reports an element that
          is not a plain byte -- confirm against the macro.  */
1606   else if (c2 < 0)
1607     {
1608       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1609       return 0;
1610     }
1611   else
1612     {
1613       /* We check the dispersion of Eth and Oth bytes where E is even and
1614          O is odd.  If both are high, we assume binary data.*/
           /* e[] and o[] flag the byte values already seen as the first
              and second byte of a pair; the counters start at 1 because
              the pair read above is recorded immediately.  */
1615       unsigned char e[256], o[256];
1616       unsigned e_num = 1, o_num = 1;
1617
1618       memset (e, 0, 256);
1619       memset (o, 0, 256);
1620       e[c1] = 1;
1621       o[c2] = 1;
1622
           /* No signature was seen, so only the NOSIG variants remain
              candidates.  */
1623       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1624                                 |CATEGORY_MASK_UTF_16_BE
1625                                 | CATEGORY_MASK_UTF_16_LE);
1626
           /* Keep scanning until every UTF-16 category is rejected.  */
1627       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1628              != CATEGORY_MASK_UTF_16)
1629         {
1630           TWO_MORE_BYTES (c1, c2);
1631           if (c2 < 0)
1632             break;
1633           if (! e[c1])
1634             {
1635               e[c1] = 1;
1636               e_num++;
1637               if (e_num >= 128)
1638                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1639             }
1640           if (! o[c2])
1641             {
1642               o[c2] = 1;
1643               o_num++;
1644               if (o_num >= 128)
1645                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1646             }
1647         }
1648       return 0;
1649     }
1650
       /* No visible goto targets this label; see the note on
          TWO_MORE_BYTES in the function comment.  */
1651  no_more_source:
1652   return 1;
1653 }
1654
     /* Decode UTF-16 source bytes of CODING into code points stored in
        coding->charbuf.

        Reading starts at coding->source + coding->consumed and writing at
        coding->charbuf + coding->charbuf_used.  On exit coding->consumed,
        coding->consumed_char and coding->charbuf_used are updated;
        consumption is reported only up to SRC_BASE, the start of the last
        unit whose processing began.

        State kept in CODING across calls: the BOM mode (switched to
        utf_without_bom once the start of the text has been handled) and
        a pending high surrogate (CODING_UTF_16_SURROGATE).

        NOTE(review): ONE_MORE_BYTE is defined outside this chunk.  The
        code below relies on it to advance SRC, to deliver a negative
        value for some inputs, and presumably to jump to `no_more_source'
        when the source is exhausted -- confirm against its definition.  */
1655 static void
1656 decode_coding_utf_16 (struct coding_system *coding)
1657 {
1658   const unsigned char *src = coding->source + coding->consumed;
1659   const unsigned char *src_end = coding->source + coding->src_bytes;
1660   const unsigned char *src_base;
1661   int *charbuf = coding->charbuf + coding->charbuf_used;
1662   /* We may produce at most 3 chars in one loop iteration, so the loop
          below stops unless at least 3 slots remain free.  */
1663   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1664   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1665   bool multibytep = coding->src_multibyte;
1666   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1667   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
       /* Non-zero while a high surrogate awaits its low surrogate.  */
1668   int surrogate = CODING_UTF_16_SURROGATE (coding);
1669   bool eol_dos
1670     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes read ahead after a CR, or -1 when none are held.  */
1671   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1672
1673   if (bom == utf_with_bom)
1674     {
1675       int c, c1, c2;
1676
1677       src_base = src;
1678       ONE_MORE_BYTE (c1);
1679       ONE_MORE_BYTE (c2);
           /* C is always assembled high-byte-first here, so a
              little-endian signature (FF FE) reads as 0xFFFE.  */
1680       c = (c1 << 8) | c2;
1681
1682       if (endian == utf_16_big_endian
1683           ? c != 0xFEFF : c != 0xFFFE)
1684         {
1685           /* The first two bytes are not BOM.  Treat them as bytes
1686              for a normal character.  */
1687           src = src_base;
1688           coding->errors++;
1689         }
1690       CODING_UTF_16_BOM (coding) = utf_without_bom;
1691     }
1692   else if (bom == utf_detect_bom)
1693     {
1694       /* We have already tried to detect BOM and failed in
1695          detect_coding.  */
1696       CODING_UTF_16_BOM (coding) = utf_without_bom;
1697     }
1698
1699   while (1)
1700     {
1701       int c, c1, c2;
1702
           /* Remember where this unit starts; the exit code reports
              consumption only up to this point.  */
1703       src_base = src;
1704       consumed_chars_base = consumed_chars;
1705
1706       if (charbuf >= charbuf_end)
1707         {
               /* The two bytes read ahead after a CR have not been
                  decoded yet; give them back.  */
1708           if (byte_after_cr1 >= 0)
1709             src_base -= 2;
1710           break;
1711         }
1712
           /* Take the first byte from the CR look-ahead if one is held.  */
1713       if (byte_after_cr1 >= 0)
1714         c1 = byte_after_cr1, byte_after_cr1 = -1;
1715       else
1716         ONE_MORE_BYTE (c1);
           /* A negative value is stored negated as one element, outside
              the 16-bit pairing.  */
1717       if (c1 < 0)
1718         {
1719           *charbuf++ = -c1;
1720           continue;
1721         }
1722       if (byte_after_cr2 >= 0)
1723         c2 = byte_after_cr2, byte_after_cr2 = -1;
1724       else
1725         ONE_MORE_BYTE (c2);
           /* The pair is broken: keep C1 as a lone byte (ASCII as-is,
              otherwise as a raw-byte character) followed by the negated
              second value.  */
1726       if (c2 < 0)
1727         {
1728           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1729           *charbuf++ = -c2;
1730           continue;
1731         }
1732       c = (endian == utf_16_big_endian
1733            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1734
1735       if (surrogate)
1736         {
1737           if (! UTF_16_LOW_SURROGATE_P (c))
1738             {
                   /* Unpaired high surrogate: emit its two bytes, in
                      source byte order, as separate elements and count
                      an error.  */
1739               if (endian == utf_16_big_endian)
1740                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1741               else
1742                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1743               *charbuf++ = c1;
1744               *charbuf++ = c2;
1745               coding->errors++;
                   /* The current unit either becomes the new pending high
                      surrogate or is emitted as the third element.  */
1746               if (UTF_16_HIGH_SURROGATE_P (c))
1747                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1748               else
1749                 *charbuf++ = c;
1750             }
1751           else
1752             {
                   /* Valid pair: combine into a code point at or above
                      0x10000 and clear the pending state.  */
1753               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1754               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1755               *charbuf++ = 0x10000 + c;
1756             }
1757         }
1758       else
1759         {
1760           if (UTF_16_HIGH_SURROGATE_P (c))
1761             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1762           else
1763             {
                   /* With DOS end-of-line handling, read the unit after a
                      CR before storing the CR.  Presumably, if the source
                      ends here, ONE_MORE_BYTE leaves through
                      `no_more_source' with SRC_BASE still at the CR, so
                      the CR is decoded again on the next call (confirm
                      against the macro).  */
1764               if (eol_dos && c == '\r')
1765                 {
1766                   ONE_MORE_BYTE (byte_after_cr1);
1767                   ONE_MORE_BYTE (byte_after_cr2);
1768                 }
1769               *charbuf++ = c;
1770             }
1771         }
1772     }
1773
       /* Also reached by falling out of the loop when CHARBUF is full.  */
1774  no_more_source:
1775   coding->consumed_char += consumed_chars_base;
1776   coding->consumed = src_base - coding->source;
1777   coding->charbuf_used = charbuf - coding->charbuf;
1778 }
1779
     /* Encode the code points in coding->charbuf[0 .. charbuf_used) as
        UTF-16 into coding->destination, starting at offset
        coding->produced.

        Unless the BOM mode is utf_without_bom, a byte order mark in the
        configured endianness is emitted first and the mode is switched to
        utf_without_bom.  Code points above MAX_UNICODE_CHAR are replaced
        by coding->default_char; code points below 0x10000 take two bytes
        and the others a four-byte surrogate pair.

        Records CODING_RESULT_SUCCESS, updates coding->produced and
        coding->produced_char, and returns 0.

        NOTE(review): ASSURE_DESTINATION, EMIT_TWO_BYTES and
        EMIT_FOUR_BYTES are defined outside this chunk.  DST_END,
        MULTIBYTEP and PRODUCED_CHARS are not touched directly below, so
        presumably those macros use them -- confirm.  */
1780 static bool
1781 encode_coding_utf_16 (struct coding_system *coding)
1782 {
1783   bool multibytep = coding->dst_multibyte;
1784   int *charbuf = coding->charbuf;
1785   int *charbuf_end = charbuf + coding->charbuf_used;
1786   unsigned char *dst = coding->destination + coding->produced;
1787   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each emission.  */
1788   int safe_room = 8;
1789   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1790   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1791   ptrdiff_t produced_chars = 0;
1792   int c;
1793
1794   if (bom != utf_without_bom)
1795     {
1796       ASSURE_DESTINATION (safe_room);
1797       if (big_endian)
1798         EMIT_TWO_BYTES (0xFE, 0xFF);
1799       else
1800         EMIT_TWO_BYTES (0xFF, 0xFE);
           /* The BOM mode is cleared so that later calls skip this
              block.  */
1801       CODING_UTF_16_BOM (coding) = utf_without_bom;
1802     }
1803
1804   while (charbuf < charbuf_end)
1805     {
1806       ASSURE_DESTINATION (safe_room);
1807       c = *charbuf++;
1808       if (c > MAX_UNICODE_CHAR)
1809         c = coding->default_char;
1810
1811       if (c < 0x10000)
1812         {
1813           if (big_endian)
1814             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1815           else
1816             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1817         }
1818       else
1819         {
1820           int c1, c2;
1821
               /* Split into a surrogate pair: C1 carries the bits above
                  the low ten, C2 the low ten bits.  */
1822           c -= 0x10000;
1823           c1 = (c >> 10) + 0xD800;
1824           c2 = (c & 0x3FF) + 0xDC00;
1825           if (big_endian)
1826             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1827           else
1828             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1829         }
1830     }
1831   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1832   coding->produced = dst - coding->destination;
1833   coding->produced_char += produced_chars;
1834   return 0;
1835 }
1836
1837 \f
1838 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1839
1840 /* Emacs' internal format for representation of multiple character
1841    sets is a kind of multi-byte encoding, i.e. characters are
1842    represented by variable-length sequences of one-byte codes.
1843
1844    ASCII characters and control characters (e.g. `tab', `newline') are
1845    represented by one-byte sequences which are their ASCII codes, in
1846    the range 0x00 through 0x7F.
1847
1848    8-bit characters of the range 0x80..0x9F are represented by
1849    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1850    code + 0x20).
1851
1852    8-bit characters of the range 0xA0..0xFF are represented by
1853    one-byte sequences which are their 8-bit code.
1854
1855    The other characters are represented by a sequence of `base
1856    leading-code', optional `extended leading-code', and one or two
1857    `position-code's.  The length of the sequence is determined by the
1858    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1859    whereas extended leading-code and position-code take the range 0xA0
1860    through 0xFF.  See `charset.h' for more details about leading-code
1861    and position-code.
1862
1863    --- CODE RANGE of Emacs' internal format ---
1864    character set        range
1865    -------------        -----
1866    ascii                0x00..0x7F
1867    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1868    eight-bit-graphic    0xA0..0xBF
1869    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1870    ---------------------------------------------
1871
1872    As this is the internal character representation, the format is
1873    usually not used externally (i.e. in a file or in a data sent to a
1874    process).  But, it is possible to have a text externally in this
1875    format (i.e. by encoding by the coding system `emacs-mule').
1876
1877    In that case, a sequence of one-byte codes has a slightly different
1878    form.
1879
1880    At first, all characters in eight-bit-control are represented by
1881    one-byte sequences which are their 8-bit code.
1882
1883    Next, character composition data are represented by the byte
1884    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1885    where,
1886         METHOD is 0xF2 plus one of composition method (enum
1887         composition_method),
1888
1889         BYTES is 0xA0 plus a byte length of this composition data,
1890
1891         CHARS is 0xA0 plus a number of characters composed by this
1892         data,
1893
1894         COMPONENTs are characters of multibyte form or composition
1895         rules encoded by two-byte of ASCII codes.
1896
1897    In addition, for backward compatibility, the following formats are
1898    also recognized as composition data on decoding.
1899
1900    0x80 MSEQ ...
1901    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1902
1903    Here,
1904         MSEQ is a multibyte form but in one of these special formats:
1905           ASCII: 0xA0 ASCII_CODE+0x80,
1906           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1907         RULE is a one byte code of the range 0xA0..0xF0 that
1908         represents a composition rule.
1909   */
1910
     /* Table indexed by a byte value.  The readers in this file treat the
        entry as the total byte length of the emacs-mule sequence that
        starts with that byte: detect_coding_emacs_mule expects (entry - 1)
        further bytes, and emacs_mule_char aborts on any value other than
        1, 2, 3 or 4.  The table is filled in outside this chunk.  */
1911 char emacs_mule_bytes[256];
1912
1913
1914 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1915    Return true if a text is encoded in 'emacs-mule'.

        Marks CATEGORY_MASK_EMACS_MULE as checked in DETECT_INFO, skips
        the first coding->head_ascii bytes, and then validates the rest
        sequence by sequence.  The category is rejected (return 0) on ESC,
        SI or SO, on a composition run of 4 bytes or fewer, on a
        multibyte sequence with too few trailing bytes (each must be
        >= 0xA0), or when the source ends inside a sequence in the last
        block.  Otherwise, at the end of the source, the category is
        added to DETECT_INFO->found if at least one composition or
        multibyte sequence was seen, and 1 is returned.

        NOTE(review): ONE_MORE_BYTE is defined outside this chunk;
        presumably it jumps to `no_more_source' at the end of the source
        and uses MULTIBYTEP and CONSUMED_CHARS -- confirm.  */
1916
1917 static bool
1918 detect_coding_emacs_mule (struct coding_system *coding,
1919                           struct coding_detection_info *detect_info)
1920 {
1921   const unsigned char *src = coding->source, *src_base;
1922   const unsigned char *src_end = coding->source + coding->src_bytes;
1923   bool multibytep = coding->src_multibyte;
1924   ptrdiff_t consumed_chars = 0;
1925   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once positive evidence is
          seen.  */
1926   int found = 0;
1927
1928   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1929   /* A coding system of this category is always ASCII compatible.  */
1930   src += coding->head_ascii;
1931
1932   while (1)
1933     {
           /* Start of the current sequence; compared with SRC at
              `no_more_source' to detect a truncated sequence.  */
1934       src_base = src;
1935       ONE_MORE_BYTE (c);
1936       if (c < 0)
1937         continue;
1938       if (c == 0x80)
1939         {
1940           /* Perhaps the start of composite character.  We simply skip
1941              it because analyzing it is too heavy for detecting.  But,
1942              at least, we check that the composite character
1943              consists of more than 4 bytes.  */
1944           const unsigned char *src_start;
1945
1946         repeat:
1947           src_start = src;
               /* Skip the run of bytes >= 0xA0; C ends up holding the
                  first byte below 0xA0 after the run.  */
1948           do
1949             {
1950               ONE_MORE_BYTE (c);
1951             }
1952           while (c >= 0xA0);
1953
1954           if (src - src_start <= 4)
1955             break;
1956           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition follows immediately.  */
1957           if (c == 0x80)
1958             goto repeat;
1959         }
1960
           /* C is classified here whether it was read at the top of the
              loop or is the byte that ended a composition run.  */
1961       if (c < 0x80)
1962         {
1963           if (c < 0x20
1964               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1965             break;
1966         }
1967       else
1968         {
               /* Number of bytes expected after the leading byte.  */
1969           int more_bytes = emacs_mule_bytes[c] - 1;
1970
1971           while (more_bytes > 0)
1972             {
1973               ONE_MORE_BYTE (c);
1974               if (c < 0xA0)
1975                 {
1976                   src--;        /* Unread the last byte.  */
1977                   break;
1978                 }
1979               more_bytes--;
1980             }
               /* Non-zero here means the sequence was cut short.  */
1981           if (more_bytes != 0)
1982             break;
1983           found = CATEGORY_MASK_EMACS_MULE;
1984         }
1985     }
       /* Every `break' out of the loop above means invalid data.  */
1986   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1987   return 0;
1988
1989  no_more_source:
       /* The source ended in the middle of a sequence; in the last block
          nothing more can arrive to complete it.  */
1990   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1991     {
1992       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1993       return 0;
1994     }
1995   detect_info->found |= found;
1996   return 1;
1997 }
1998
1999
2000 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2001    character.  If CMP_STATUS indicates that we must expect MSEQ or
2002    RULE described above, decode it and return the negative value of
2003    the decoded character or rule.  If an invalid byte is found, return
2004    -1.  If SRC is too short, return -2.

        On success *NBYTES is set to the number of source bytes consumed
        and *NCHARS to the local CONSUMED_CHARS counter.  *ID is set to
        the charset id of the character when ID is non-null, except on the
        path that returns a RULE byte, which leaves *ID untouched.  On the
        -1 and -2 returns none of the output parameters are written.

        NOTE(review): ONE_MORE_BYTE and CODING_DECODE_CHAR are defined
        outside this chunk.  Presumably ONE_MORE_BYTE jumps to
        `no_more_source' when SRC reaches SRC_END and maintains
        CONSUMED_CHARS -- confirm.  */
2005
2006 static int
2007 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2008                  int *nbytes, int *nchars, int *id,
2009                  struct composition_status *cmp_status)
2010 {
2011   const unsigned char *src_end = coding->source + coding->src_bytes;
2012   const unsigned char *src_base = src;
2013   bool multibytep = coding->src_multibyte;
2014   int charset_ID;
2015   unsigned code;
2016   int c;
2017   ptrdiff_t consumed_chars = 0;
       /* Set when the sequence was read in the old-style MSEQ form, in
          which case the result is returned negated.  */
2018   bool mseq_found = 0;
2019
2020   ONE_MORE_BYTE (c);
2021   if (c < 0)
2022     {
           /* A negative value is returned negated, tagged with the
              charset of table entry 0.  */
2023       c = -c;
2024       charset_ID = emacs_mule_charset[0];
2025     }
2026   else
2027     {
2028       if (c >= 0xA0)
2029         {
               /* A leading byte >= 0xA0 is accepted only inside an
                  old-form composition.  */
2030           if (cmp_status->state != COMPOSING_NO
2031               && cmp_status->old_form)
2032             {
2033               if (cmp_status->state == COMPOSING_CHAR)
2034                 {
                       /* MSEQ: 0xA0 introduces an ASCII code stored with
                          0x80 added; any other byte is a leading code
                          stored with 0x20 added.  */
2035                   if (c == 0xA0)
2036                     {
2037                       ONE_MORE_BYTE (c);
2038                       c -= 0x80;
2039                       if (c < 0)
2040                         goto invalid_code;
2041                     }
2042                   else
2043                     c -= 0x20;
2044                   mseq_found = 1;
2045                 }
2046               else
2047                 {
                       /* Any other composing state: hand the byte back
                          negated, as a RULE.  */
2048                   *nbytes = src - src_base;
2049                   *nchars = consumed_chars;
2050                   return -c;
2051                 }
2052             }
2053           else
2054             goto invalid_code;
2055         }
2056
           /* Dispatch on the sequence length recorded for the leading
              byte.  */
2057       switch (emacs_mule_bytes[c])
2058         {
2059         case 2:
               /* Leading byte plus one position byte.  */
2060           if ((charset_ID = emacs_mule_charset[c]) < 0)
2061             goto invalid_code;
2062           ONE_MORE_BYTE (c);
2063           if (c < 0xA0)
2064             goto invalid_code;
2065           code = c & 0x7F;
2066           break;
2067
2068         case 3:
2069           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2070               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2071             {
                   /* Private leading byte: the next byte selects the
                      charset and one position byte follows.  */
2072               ONE_MORE_BYTE (c);
2073               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2074                 goto invalid_code;
2075               ONE_MORE_BYTE (c);
2076               if (c < 0xA0)
2077                 goto invalid_code;
2078               code = c & 0x7F;
2079             }
2080           else
2081             {
                   /* Leading byte plus two position bytes.  */
2082               if ((charset_ID = emacs_mule_charset[c]) < 0)
2083                 goto invalid_code;
2084               ONE_MORE_BYTE (c);
2085               if (c < 0xA0)
2086                 goto invalid_code;
2087               code = (c & 0x7F) << 8;
2088               ONE_MORE_BYTE (c);
2089               if (c < 0xA0)
2090                 goto invalid_code;
2091               code |= c & 0x7F;
2092             }
2093           break;
2094
2095         case 4:
               /* The byte after the leading byte selects the charset and
                  two position bytes follow.  */
2096           ONE_MORE_BYTE (c);
2097           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2098             goto invalid_code;
2099           ONE_MORE_BYTE (c);
2100           if (c < 0xA0)
2101             goto invalid_code;
2102           code = (c & 0x7F) << 8;
2103           ONE_MORE_BYTE (c);
2104           if (c < 0xA0)
2105             goto invalid_code;
2106           code |= c & 0x7F;
2107           break;
2108
2109         case 1:
               /* Single byte: ASCII or a raw eight-bit byte.  */
2110           code = c;
2111           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2112           break;
2113
2114         default:
2115           emacs_abort ();
2116         }
           /* The macro stores its result in C; a negative result is
              treated as an invalid code.  */
2117       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2118                           CHARSET_FROM_ID (charset_ID), code, c);
2119       if (c < 0)
2120         goto invalid_code;
2121     }
2122   *nbytes = src - src_base;
2123   *nchars = consumed_chars;
2124   if (id)
2125     *id = charset_ID;
2126   return (mseq_found ? -c : c);
2127
2128  no_more_source:
2129   return -2;
2130
2131  invalid_code:
2132   return -1;
2133 }
2134
2135
2136 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2137
2138 /* Handle these composition sequence ('|': the end of header elements,
2139    BYTES and CHARS >= 0xA0):
2140
2141    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2142    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2143    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2144
2145    and these old form:
2146
2147    (4) relative composition: 0x80 | MSEQ ... MSEQ
2148    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2149
2150    When the starter 0x80 and the following header elements are found,
2151    this annotation header is produced.
2152
2153         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2154
2155    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2156    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2157
2158    Then, upon reading the following elements, these codes are produced
2159    until the composition end is found:
2160
2161    (1) CHAR ... CHAR
2162    (2) ALT ... ALT CHAR ... CHAR
2163    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2164    (4) CHAR ... CHAR
2165    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2166
2167    When the composition end is found, LENGTH and NCHARS in the
2168    annotation header is updated as below:
2169
2170    (1) LENGTH: unchanged, NCHARS: unchanged
2171    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2172    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2173    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2174    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2175
2176    If an error is found while composing, the annotation header is
2177    changed to the original composition header (plus filler -1s) as
2178    below:
2179
2180    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2181    (5)          [ 0x80 0xFF -1 -1 -1 ]
2182
2183    and the sequence [ -2 DECODED-RULE ] is changed to the original
2184    byte sequence as below:
2185         o the original byte sequence is B: [ B -1 ]
2186         o the original byte sequence is B1 B2: [ B1 B2 ]
2187
2188    Most of the routines are implemented by macros because many
2189    variables and labels in the caller decode_coding_emacs_mule must be
2190    accessible, and they are usually called just once (thus they don't
2191    increase the size of the compiled object).  */
2192
2193 /* Decode a composition rule represented by C as a component of
2194    composition sequence of Emacs 20 style.  Set RULE to the decoded
2195    rule.

        C is modified in place: 0xA0 is subtracted and the result must lie
        in 0..80, otherwise control jumps to the caller's `invalid_code'
        label.  The value is split into the quotient and remainder by 9,
        each of which is replaced by 10 when it equals 4, and the pair is
        packed with COMPOSITION_ENCODE_RULE.  */
2196
2197 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2198   do {                                                  \
2199     int gref, nref;                                     \
2200                                                         \
2201     c -= 0xA0;                                          \
2202     if (c < 0 || c >= 81)                               \
2203       goto invalid_code;                                \
2204     gref = c / 9, nref = c % 9;                         \
2205     if (gref == 4) gref = 10;                           \
2206     if (nref == 4) nref = 10;                           \
2207     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2208   } while (0)
2209
2210
2211 /* Decode a composition rule represented by C and the following byte
2212    at SRC as a component of composition sequence of Emacs 21 style.
2213    Set RULE to the decoded rule.

        Each of the two bytes has 0x20 subtracted and must then lie in
        0..80, otherwise control jumps to the caller's `invalid_code'
        label.  The second byte is fetched with ONE_MORE_BYTE, which
        overwrites C.  The two values are packed with
        COMPOSITION_ENCODE_RULE.  */
2214
2215 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2216   do {                                                  \
2217     int gref, nref;                                     \
2218                                                         \
2219     gref = c - 0x20;                                    \
2220     if (gref < 0 || gref >= 81)                         \
2221       goto invalid_code;                                \
2222     ONE_MORE_BYTE (c);                                  \
2223     nref = c - 0x20;                                    \
2224     if (nref < 0 || nref >= 81)                         \
2225       goto invalid_code;                                \
2226     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2227   } while (0)
2228
2229
2230 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2231    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2232    byte length of this composition information, CHARS is the number of
2233    characters composed by this composition.

        As used here, the caller's C already holds the method byte on
        entry; the BYTES and CHARS bytes are then fetched with
        ONE_MORE_BYTE.  Control jumps to the caller's `invalid_code' label
        when the BYTES value is negative or decodes to less than 3, when
        the method is COMPOSITION_RELATIVE and BYTES does not decode to
        exactly 4, or when CHARS decodes outside
        1..MAX_COMPOSITION_COMPONENTS-1.

        On success the caller's CMP_STATUS is initialized (new form,
        method, state, length, nchars, and ncomps = BYTES - 4) and
        ADD_COMPOSITION_DATA is invoked on the caller's CHARBUF.  */
2234
2235 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2236   do {                                                                  \
2237     enum composition_method method = c - 0xF2;                          \
2238     int nbytes, nchars;                                                 \
2239                                                                         \
2240     ONE_MORE_BYTE (c);                                                  \
2241     if (c < 0)                                                          \
2242       goto invalid_code;                                                \
2243     nbytes = c - 0xA0;                                                  \
2244     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2245       goto invalid_code;                                                \
2246     ONE_MORE_BYTE (c);                                                  \
2247     nchars = c - 0xA0;                                                  \
2248     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2249       goto invalid_code;                                                \
2250     cmp_status->old_form = 0;                                           \
2251     cmp_status->method = method;                                        \
2252     if (method == COMPOSITION_RELATIVE)                                 \
2253       cmp_status->state = COMPOSING_CHAR;                               \
2254     else                                                                \
2255       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2256     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2257     cmp_status->nchars = nchars;                                        \
2258     cmp_status->ncomps = nbytes - 4;                                    \
2259     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2260   } while (0)
2261
2262
2263 /* Start of Emacs 20 style format for relative composition.

        Marks the caller's CMP_STATUS as an old-form COMPOSITION_RELATIVE
        composition in state COMPOSING_CHAR with zero characters and
        components so far, and invokes ADD_COMPOSITION_DATA on the
        caller's CHARBUF with both counts given as 0.  */
2264
2265 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2266   do {                                                          \
2267     cmp_status->old_form = 1;                                   \
2268     cmp_status->method = COMPOSITION_RELATIVE;                  \
2269     cmp_status->state = COMPOSING_CHAR;                         \
2270     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2271     cmp_status->nchars = cmp_status->ncomps = 0;                \
2272     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2273   } while (0)
2274
2275
2276 /* Start of Emacs 20 style format for rule-base composition.

        Identical to DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that
        the recorded method is COMPOSITION_WITH_RULE.  */
2277
2278 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2279   do {                                                          \
2280     cmp_status->old_form = 1;                                   \
2281     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2282     cmp_status->state = COMPOSING_CHAR;                         \
2283     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2284     cmp_status->nchars = cmp_status->ncomps = 0;                \
2285     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2286   } while (0)
2287
2288
     /* Dispatch on the byte that follows a composition starter.  The byte
        is fetched into the caller's C with ONE_MORE_BYTE:

        - a byte whose offset from 0xF2 is a composition method between
          COMPOSITION_RELATIVE and COMPOSITION_WITH_RULE_ALTCHARS starts
          an Emacs 21 style composition;
        - any other byte in 0xA0..0xBF starts an Emacs 20 style relative
          composition, and SRC is rewound so that the byte is read again
          as the first component;
        - 0xFF starts an Emacs 20 style rule-base composition;
        - anything else, including a negative value, jumps to the
          caller's `invalid_code' label.

        Uses the caller's SRC, C, CMP_STATUS and CHARBUF.  */
2289 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2290   do {                                                  \
2291     const unsigned char *current_src = src;             \
2292                                                         \
2293     ONE_MORE_BYTE (c);                                  \
2294     if (c < 0)                                          \
2295       goto invalid_code;                                \
2296     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2297         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2298       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2299     else if (c < 0xA0)                                  \
2300       goto invalid_code;                                \
2301     else if (c < 0xC0)                                  \
2302       {                                                 \
2303         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2304         /* Re-read C as a composition component.  */    \
2305         src = current_src;                              \
2306       }                                                 \
2307     else if (c == 0xFF)                                 \
2308       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2309     else                                                \
2310       goto invalid_code;                                \
2311   } while (0)
2312
     /* Close the composition described by the caller's CMP_STATUS.  The
        annotation header is located CMP_STATUS->length elements before
        the caller's CHARBUF.  For an old-form composition, header
        element 2 is set to the number of characters read.  For a
        new-form composition whose method is greater than
        COMPOSITION_RELATIVE, header element 0 is set to header element 2
        minus CMP_STATUS->length.  In every case the state becomes
        COMPOSING_NO.  */
2313 #define EMACS_MULE_COMPOSITION_END()                            \
2314   do {                                                          \
2315     int idx = - cmp_status->length;                             \
2316                                                                 \
2317     if (cmp_status->old_form)                                   \
2318       charbuf[idx + 2] = cmp_status->nchars;                    \
2319     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2320       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2321     cmp_status->state = COMPOSING_NO;                           \
2322   } while (0)
2323
2324
     /* Terminate the composition described by CMP_STATUS.  The annotation
        header of the composition is located CMP_STATUS->length elements
        before CHARBUF.

        For an old-form composition that has read at least one character,
        the header is kept and its element 2 is set to that character
        count.  Otherwise the header elements are overwritten with
        raw-byte characters for the original starter bytes followed by
        filler values.

        Sets CMP_STATUS->state to COMPOSING_NO and returns the number of
        characters the caller should add to its character offset (see
        EMACS_MULE_MAYBE_FINISH_COMPOSITION).  */
2325 static int
2326 emacs_mule_finish_composition (int *charbuf,
2327                                struct composition_status *cmp_status)
2328 {
       /* Negative offset from CHARBUF back to the first header element.  */
2329   int idx = - cmp_status->length;
2330   int new_chars;
2331
2332   if (cmp_status->old_form && cmp_status->nchars > 0)
2333     {
2334       charbuf[idx + 2] = cmp_status->nchars;
2335       new_chars = 0;
2336       if (cmp_status->method == COMPOSITION_WITH_RULE
2337           && cmp_status->state == COMPOSING_CHAR)
2338         {
2339           /* The last rule was invalid.  */
               /* Replace the last two elements with the rule's original
                  byte (the stored value plus 0xA0) and a -1 filler.  */
2340           int rule = charbuf[-1] + 0xA0;
2341
2342           charbuf[-2] = BYTE8_TO_CHAR (rule);
2343           charbuf[-1] = -1;
2344           new_chars = 1;
2345         }
2346     }
2347   else
2348     {
           /* Turn the header back into the original starter bytes.  */
2349       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2350
2351       if (cmp_status->method == COMPOSITION_WITH_RULE)
2352         {
               /* NOTE(review): two raw-byte characters (0x80 and 0xFF)
                  are stored here but only 1 is counted in new_chars,
                  unlike the branch below, which counts all 4 -- confirm
                  this is intended.  */
2353           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2354           charbuf[idx++] = -3;
2355           charbuf[idx++] = 0;
2356           new_chars = 1;
2357         }
2358       else
2359         {
               /* IDX now points at header element 1, so these read header
                  elements 2 and 3 before they are overwritten below.  */
2360           int nchars = charbuf[idx + 1] + 0xA0;
2361           int nbytes = charbuf[idx + 2] + 0xA0;
2362
2363           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2364           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2365           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2366           charbuf[idx++] = -1;
2367           new_chars = 4;
2368         }
2369     }
2370   cmp_status->state = COMPOSING_NO;
2371   return new_chars;
2372 }
2373
     /* If the caller's CMP_STATUS shows a composition in progress, finish
        it with emacs_mule_finish_composition and add the returned count
        to the caller's CHAR_OFFSET.  Uses the caller's CHARBUF,
        CMP_STATUS and CHAR_OFFSET.  */
2374 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2375   do {                                                                    \
2376     if (cmp_status->state != COMPOSING_NO)                                \
2377       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2378   } while (0)
2379
2380
     /* Decode the emacs-mule bytes of CODING that start at
        CODING->source + CODING->consumed, appending the resulting
        characters and annotations as ints to CODING->charbuf from index
        CODING->charbuf_used on.

        A charset annotation is added (ADD_CHARSET_DATA) for each run of
        characters decoded with the same non-ASCII charset.  The state of
        a composition sequence is kept in
        CODING->spec.emacs_mule.cmp_status; a composition still unfinished
        when the source runs out is saved in its `carryover' array and put
        back by the next call, unless CODING_MODE_LAST_BLOCK is set.  A
        sequence that cannot be decoded contributes its first byte alone
        (as is if ASCII, otherwise through BYTE8_TO_CHAR) and increments
        CODING->errors.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used have been updated.  */
2381 static void
2382 decode_coding_emacs_mule (struct coding_system *coding)
2383 {
2384   const unsigned char *src = coding->source + coding->consumed;
2385   const unsigned char *src_end = coding->source + coding->src_bytes;
2386   const unsigned char *src_base;
2387   int *charbuf = coding->charbuf + coding->charbuf_used;
2388   /* We may produce two annotations (charset and composition) in one
2389      loop and one more charset annotation at the end.  */
2390   int *charbuf_end
2391     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2392       /* We can produce up to 2 characters in a loop.  */
2393       - 1;
2394   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2395   bool multibytep = coding->src_multibyte;
2396   ptrdiff_t char_offset = coding->produced_char;
2397   ptrdiff_t last_offset = char_offset;
2398   int last_id = charset_ascii;
2399   bool eol_dos
2400     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2401   int byte_after_cr = -1;
2402   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2403
       /* A composition was still in progress when the previous call
          stopped: put its saved elements back at the head of the output
          before continuing.  */
2404   if (cmp_status->state != COMPOSING_NO)
2405     {
2406       int i;
2407
2408       if (charbuf_end - charbuf < cmp_status->length)
2409         emacs_abort ();
2410       for (i = 0; i < cmp_status->length; i++)
2411         *charbuf++ = cmp_status->carryover[i];
2412       coding->annotated = 1;
2413     }
2414
2415   while (1)
2416     {
2417       int c, id IF_LINT (= 0);
2418
           /* Remember where this item starts; progress is reported, and
              invalid input is rewound, relative to these.  */
2419       src_base = src;
2420       consumed_chars_base = consumed_chars;
2421
2422       if (charbuf >= charbuf_end)
2423         {
               /* CHARBUF is full.  A byte already read ahead after a CR
                  has not been processed yet, so do not count it as
                  consumed.  */
2424           if (byte_after_cr >= 0)
2425             src_base--;
2426           break;
2427         }
2428
2429       if (byte_after_cr >= 0)
2430         c = byte_after_cr, byte_after_cr = -1;
2431       else
2432         ONE_MORE_BYTE (c);
2433
           /* 0x80 starts the decoding of a composition sequence.  A
              negative C is stored as the character -C (NOTE(review):
              presumably how ONE_MORE_BYTE reports a non-ASCII character
              found in multibyte source; confirm against the macro).
              Either one first lets a composition in progress finish.  */
2434       if (c < 0 || c == 0x80)
2435         {
2436           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2437           if (c < 0)
2438             {
2439               *charbuf++ = -c;
2440               char_offset++;
2441             }
2442           else
2443             DECODE_EMACS_MULE_COMPOSITION_START ();
2444           continue;
2445         }
2446
2447       if (c < 0x80)
2448         {
               /* Under DOS EOL conversion, read the byte after a CR right
                  away; the next iteration takes it from BYTE_AFTER_CR
                  instead of from the source.  */
2449           if (eol_dos && c == '\r')
2450             ONE_MORE_BYTE (byte_after_cr);
2451           id = charset_ascii;
2452           if (cmp_status->state != COMPOSING_NO)
2453             {
2454               if (cmp_status->old_form)
2455                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2457                 cmp_status->ncomps--;
2458             }
2459         }
2460       else
2461         {
2462           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2463           /* emacs_mule_char can load a charset map from a file, which
2464              allocates a large structure and might cause buffer text
2465              to be relocated as result.  Thus, we need to remember the
2466              original pointer to buffer text, and fix up all related
2467              pointers after the call.  */
2468           const unsigned char *orig = coding->source;
2469           ptrdiff_t offset;
2470
2471           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2472                                cmp_status);
2473           offset = coding->source - orig;
2474           if (offset)
2475             {
2476               src += offset;
2477               src_base += offset;
2478               src_end += offset;
2479             }
               /* -1 reports an undecodable sequence and -2 ends the
                  decoding of this block.  Any other negative value is
                  handled further down (see the comment after this
                  branch).  */
2480           if (c < 0)
2481             {
2482               if (c == -1)
2483                 goto invalid_code;
2484               if (c == -2)
2485                 break;
2486             }
2487           src = src_base + nbytes;
2488           consumed_chars = consumed_chars_base + nchars;
2489           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2490             cmp_status->ncomps -= nchars;
2491         }
2492
2493       /* Now if C >= 0, we found a normally encoded character, if C <
2494          0, we found an old-style composition component character or
2495          rule.  */
2496
2497       if (cmp_status->state == COMPOSING_NO)
2498         {
               /* The charset changed: annotate the run that just ended,
                  unless that run was ASCII.  */
2499           if (last_id != id)
2500             {
2501               if (last_id != charset_ascii)
2502                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2503                                   last_id);
2504               last_id = id;
2505               last_offset = char_offset;
2506             }
2507           *charbuf++ = c;
2508           char_offset++;
2509         }
2510       else if (cmp_status->state == COMPOSING_CHAR)
2511         {
2512           if (cmp_status->old_form)
2513             {
                   /* In the old form a normally encoded character ends
                      the composition; components arrive as negative
                      values.  */
2514               if (c >= 0)
2515                 {
2516                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2517                   *charbuf++ = c;
2518                   char_offset++;
2519                 }
2520               else
2521                 {
2522                   *charbuf++ = -c;
2523                   cmp_status->nchars++;
2524                   cmp_status->length++;
2525                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2526                     EMACS_MULE_COMPOSITION_END ();
2527                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2528                     cmp_status->state = COMPOSING_RULE;
2529                 }
2530             }
2531           else
2532             {
                   /* Here cmp_status->nchars counts down the characters
                      still expected; the composition ends at zero.  */
2533               *charbuf++ = c;
2534               cmp_status->length++;
2535               cmp_status->nchars--;
2536               if (cmp_status->nchars == 0)
2537                 EMACS_MULE_COMPOSITION_END ();
2538             }
2539         }
2540       else if (cmp_status->state == COMPOSING_RULE)
2541         {
2542           int rule;
2543
2544           if (c >= 0)
2545             {
2546               EMACS_MULE_COMPOSITION_END ();
2547               *charbuf++ = c;
2548               char_offset++;
2549             }
2550           else
2551             {
                   /* A rule is stored as the pair -2, RULE.  */
2552               c = -c;
2553               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2554               if (rule < 0)
2555                 goto invalid_code;
2556               *charbuf++ = -2;
2557               *charbuf++ = rule;
2558               cmp_status->length += 2;
2559               cmp_status->state = COMPOSING_CHAR;
2560             }
2561         }
2562       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2563         {
2564           *charbuf++ = c;
2565           cmp_status->length++;
2566           if (cmp_status->ncomps == 0)
2567             cmp_status->state = COMPOSING_CHAR;
2568           else if (cmp_status->ncomps > 0)
2569             {
2570               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2571                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2572             }
2573           else
2574             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2575         }
2576       else                      /* COMPOSING_COMPONENT_RULE */
2577         {
2578           int rule;
2579
2580           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2581           if (rule < 0)
2582             goto invalid_code;
2583           *charbuf++ = -2;
2584           *charbuf++ = rule;
2585           cmp_status->length += 2;
2586           cmp_status->ncomps--;
2587           if (cmp_status->ncomps > 0)
2588             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2589           else
2590             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2591         }
2592       continue;
2593
         /* Undecodable input: rewind to the start of the sequence and
            pass its first byte through on its own.  */
2594     invalid_code:
2595       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2596       src = src_base;
2597       consumed_chars = consumed_chars_base;
2598       ONE_MORE_BYTE (c);
2599       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2600       char_offset++;
2601       coding->errors++;
2602     }
2603
       /* NOTE(review): nothing in this function jumps here explicitly;
          presumably ONE_MORE_BYTE does when the source is exhausted.  */
2604  no_more_source:
2605   if (cmp_status->state != COMPOSING_NO)
2606     {
2607       if (coding->mode & CODING_MODE_LAST_BLOCK)
2608         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2609       else
2610         {
2611           int i;
2612
               /* More source will follow: take the unfinished
                  composition back out of CHARBUF and keep it for the
                  next call.  */
2613           charbuf -= cmp_status->length;
2614           for (i = 0; i < cmp_status->length; i++)
2615             cmp_status->carryover[i] = charbuf[i];
2616         }
2617     }
2618   if (last_id != charset_ascii)
2619     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Progress is reported up to SRC_BASE, the start of the item that
          was being read when decoding stopped.  */
2620   coding->consumed_char += consumed_chars_base;
2621   coding->consumed = src_base - coding->source;
2622   coding->charbuf_used = charbuf - coding->charbuf;
2623 }
2624
2625
     /* Store in CODES the emacs-mule leading code(s) for the charset
        whose emacs-mule id is ID.  An id below 0xA0 is its own single
        leading code and CODES[1] is set to 0; a larger id is stored in
        CODES[1] after a prefix byte 0x9A..0x9D chosen by its range.

        ID and CODES are evaluated more than once.  The expansion is a
        single statement without a trailing semicolon, so the caller
        writes the `;' and the macro can be used in an unbraced
        if/else.  */
2626 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2627   do {                                          \
2628     if ((id) < 0xA0)                            \
2629       (codes)[0] = (id), (codes)[1] = 0;        \
2630     else if ((id) < 0xE0)                       \
2631       (codes)[0] = 0x9A, (codes)[1] = (id);     \
2632     else if ((id) < 0xF0)                       \
2633       (codes)[0] = 0x9B, (codes)[1] = (id);     \
2634     else if ((id) < 0xF5)                       \
2635       (codes)[0] = 0x9C, (codes)[1] = (id);     \
2636     else                                        \
2637       (codes)[0] = 0x9D, (codes)[1] = (id);     \
2638   } while (0)
2639
2640
     /* Encode the characters in CODING->charbuf as emacs-mule bytes,
        written to CODING->destination from offset CODING->produced on.

        An ASCII character, and a raw byte (CHAR_BYTE8_P), is written as
        one byte.  Any other character is written as the leading code(s)
        of its charset (see EMACS_MULE_LEADING_CODES) followed by one or
        two code bytes with the high bit set, depending on the charset's
        dimension.  A character that no charset in the list can encode is
        replaced by CODING->default_char.  A charset annotation in the
        buffer selects the charset to try first for the characters after
        it; composition annotations are skipped.

        Records CODING_RESULT_SUCCESS, updates CODING->produced and
        CODING->produced_char, and returns 0.  */
2641 static bool
2642 encode_coding_emacs_mule (struct coding_system *coding)
2643 {
2644   bool multibytep = coding->dst_multibyte;
2645   int *charbuf = coding->charbuf;
2646   int *charbuf_end = charbuf + coding->charbuf_used;
2647   unsigned char *dst = coding->destination + coding->produced;
2648   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2649   int safe_room = 8;
2650   ptrdiff_t produced_chars = 0;
2651   Lisp_Object attrs, charset_list;
2652   int c;
2653   int preferred_charset_id = -1;
2654
       /* This encoder always works with Vemacs_mule_charset_list; store
          that list in ATTRS if the coding system carries another one.  */
2655   CODING_GET_INFO (coding, attrs, charset_list);
2656   if (! EQ (charset_list, Vemacs_mule_charset_list))
2657     {
2658       charset_list = Vemacs_mule_charset_list;
2659       ASET (attrs, coding_attr_charset_list, charset_list);
2660     }
2661
2662   while (charbuf < charbuf_end)
2663     {
2664       ASSURE_DESTINATION (safe_room);
2665       c = *charbuf++;
2666
2667       if (c < 0)
2668         {
2669           /* Handle an annotation.  */
2670           switch (*charbuf)
2671             {
2672             case CODING_ANNOTATE_COMPOSITION_MASK:
2673               /* Not yet implemented.  */
2674               break;
2675             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF already points at the word
                      after the length word here, so charbuf[3] is the
                      fifth word counted from the length word, while the
                      skip below treats -C as the whole length.  Check
                      this index against the layout ADD_CHARSET_DATA
                      writes.  */
2676               preferred_charset_id = charbuf[3];
                   /* Ignore a preference for a charset this coding
                      system does not list.  */
2677               if (preferred_charset_id >= 0
2678                   && NILP (Fmemq (make_number (preferred_charset_id),
2679                                   charset_list)))
2680                 preferred_charset_id = -1;
2681               break;
2682             default:
2683               emacs_abort ();
2684             }
               /* Skip the rest of the annotation: -C is its length
                  counted from the length word, which was consumed
                  above.  */
2685           charbuf += -c - 1;
2686           continue;
2687         }
2688
2689       if (ASCII_CHAR_P (c))
2690         EMIT_ONE_ASCII_BYTE (c);
2691       else if (CHAR_BYTE8_P (c))
2692         {
2693           c = CHAR_TO_BYTE8 (c);
2694           EMIT_ONE_BYTE (c);
2695         }
2696       else
2697         {
2698           struct charset *charset;
2699           unsigned code;
2700           int dimension;
2701           int emacs_mule_id;
2702           unsigned char leading_codes[2];
2703
               /* Try the charset preferred by the last annotation first,
                  and search the whole list only if it cannot encode C.  */
2704           if (preferred_charset_id >= 0)
2705             {
2706               bool result;
2707
2708               charset = CHARSET_FROM_ID (preferred_charset_id);
2709               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2710               if (result)
2711                 code = ENCODE_CHAR (charset, c);
2712               else
2713                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2714                                      &code, charset);
2715             }
2716           else
2717             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2718                                  &code, charset);
               /* No charset in the list encodes C: substitute the
                  coding system's default character.  */
2719           if (! charset)
2720             {
2721               c = coding->default_char;
2722               if (ASCII_CHAR_P (c))
2723                 {
2724                   EMIT_ONE_ASCII_BYTE (c);
2725                   continue;
2726                 }
2727               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2728                                    &code, charset);
2729             }
2730           dimension = CHARSET_DIMENSION (charset);
2731           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2732           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2733           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is only one.  */
2734           if (leading_codes[1])
2735             EMIT_ONE_BYTE (leading_codes[1]);
               /* The code bytes go out with their high bit set.  */
2736           if (dimension == 1)
2737             EMIT_ONE_BYTE (code | 0x80);
2738           else
2739             {
2740               code |= 0x8080;
2741               EMIT_ONE_BYTE (code >> 8);
2742               EMIT_ONE_BYTE (code & 0xFF);
2743             }
2744         }
2745     }
2746   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2747   coding->produced_char += produced_chars;
2748   coding->produced = dst - coding->destination;
2749   return 0;
2750 }
2751
2752 \f
2753 /*** 7. ISO2022 handlers ***/
2754
2755 /* The following note describes the coding system ISO2022 briefly.
2756    Since the intention of this note is to help understand the
2757    functions in this file, some parts are NOT ACCURATE or are OVERLY
2758    SIMPLIFIED.  For thorough understanding, please refer to the
2759    original document of ISO2022.  This is equivalent to the standard
2760    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2761
2762    ISO2022 provides many mechanisms to encode several character sets
2763    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2764    is encoded using bytes less than 128.  This may make the encoded
2765    text a little bit longer, but the text passes more easily through
2766    several types of gateway, some of which strip off the MSB (Most
2767    Significant Bit).
2768
2769    There are two kinds of character sets: control character sets and
2770    graphic character sets.  The former contain control characters such
2771    as `newline' and `escape' to provide control functions (control
2772    functions are also provided by escape sequences).  The latter
2773    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2774    two control character sets and many graphic character sets.
2775
2776    Graphic character sets are classified into one of the following
2777    four classes, according to the number of bytes (DIMENSION) and
2778    number of characters in one dimension (CHARS) of the set:
2779    - DIMENSION1_CHARS94
2780    - DIMENSION1_CHARS96
2781    - DIMENSION2_CHARS94
2782    - DIMENSION2_CHARS96
2783
2784    In addition, each character set is assigned an identification tag,
2785    unique for each set, called the "final character" (denoted as <F>
2786    hereafter).  The <F> of each character set is decided by ECMA(*)
2787    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2788    (0x30..0x3F are for private use only).
2789
2790    Note (*): ECMA = European Computer Manufacturers Association
2791
2792    Here are examples of graphic character sets [NAME(<F>)]:
2793         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2794         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2795         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2796         o DIMENSION2_CHARS96 -- none for the moment
2797
2798    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2799         C0 [0x00..0x1F] -- control character plane 0
2800         GL [0x20..0x7F] -- graphic character plane 0
2801         C1 [0x80..0x9F] -- control character plane 1
2802         GR [0xA0..0xFF] -- graphic character plane 1
2803
2804    A control character set is directly designated and invoked to C0 or
2805    C1 by an escape sequence.  The most common case is that:
2806    - ISO646's  control character set is designated/invoked to C0, and
2807    - ISO6429's control character set is designated/invoked to C1,
2808    and usually these designations/invocations are omitted in encoded
2809    text.  In a 7-bit environment, only C0 can be used, and a control
2810    character for C1 is encoded by an appropriate escape sequence to
2811    fit into the environment.  All control characters for C1 are
2812    defined to have corresponding escape sequences.
2813
2814    A graphic character set is at first designated to one of four
2815    graphic registers (G0 through G3), then these graphic registers are
2816    invoked to GL or GR.  These designations and invocations can be
2817    done independently.  The most common case is that G0 is invoked to
2818    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2819    these invocations and designations are omitted in encoded text.
2820    In a 7-bit environment, only GL can be used.
2821
2822    When a graphic character set of CHARS94 is invoked to GL, codes
2823    0x20 and 0x7F of the GL area work as control characters SPACE and
2824    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2825    be used.
2826
2827    There are two ways of invocation: locking-shift and single-shift.
2828    With locking-shift, the invocation lasts until the next different
2829    invocation, whereas with single-shift, the invocation affects the
2830    following character only and doesn't affect the locking-shift
2831    state.  Invocations are done by the following control characters or
2832    escape sequences:
2833
2834    ----------------------------------------------------------------------
2835    abbrev  function                  cntrl escape seq   description
2836    ----------------------------------------------------------------------
2837    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2838    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2839    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2840    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2841    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2842    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2843    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2844    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2845    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2846    ----------------------------------------------------------------------
2847    (*) These are not used by any known coding system.
2848
2849    Control characters for these functions are defined by macros
2850    ISO_CODE_XXX in `coding.h'.
2851
2852    Designations are done by the following escape sequences:
2853    ----------------------------------------------------------------------
2854    escape sequence      description
2855    ----------------------------------------------------------------------
2856    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2857    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2858    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2859    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2860    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2861    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2862    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2863    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2864    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2865    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2866    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2867    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2868    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2869    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2870    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2871    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2872    ----------------------------------------------------------------------
2873
2874    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2875    of dimension 1, chars 94, and final character <F>, etc...
2876
2877    Note (*): Although these designations are not allowed in ISO2022,
2878    Emacs accepts them on decoding, and produces them on encoding
2879    CHARS96 character sets in a coding system which is characterized as
2880    7-bit environment, non-locking-shift, and non-single-shift.
2881
2882    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2883    '(' must be omitted.  We refer to this as "short-form" hereafter.
2884
2885    Now you may notice that there are a lot of ways of encoding the
2886    same multilingual text in ISO2022.  Actually, there exist many
2887    coding systems such as Compound Text (used in X11's inter-client
2888    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2889    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2890    localized platforms), and all of these are variants of ISO2022.
2891
2892    In addition to the above, Emacs handles two more kinds of escape
2893    sequences: ISO6429's direction specification and Emacs' private
2894    sequence for specifying character composition.
2895
2896    ISO6429's direction specification takes the following form:
2897         o CSI ']'      -- end of the current direction
2898         o CSI '0' ']'  -- end of the current direction
2899         o CSI '1' ']'  -- start of left-to-right text
2900         o CSI '2' ']'  -- start of right-to-left text
2901    The control character CSI (0x9B: control sequence introducer) is
2902    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2903
2904    Character composition specification takes the following form:
2905         o ESC '0' -- start relative composition
2906         o ESC '1' -- end composition
2907         o ESC '2' -- start rule-base composition (*)
2908         o ESC '3' -- start relative composition with alternate chars  (**)
2909         o ESC '4' -- start rule-base composition with alternate chars  (**)
2910   Since these are not standard escape sequences of any ISO standard,
2911   the use of them with these meanings is restricted to Emacs only.
2912
2913   (*) This form is used only in Emacs 20.7 and older versions,
2914   but newer versions can safely decode it.
2915   (**) This form is used only in Emacs 21.1 and newer versions,
2916   and older versions can't decode it.
2917
2918   Here's a list of example usages of these composition escape
2919   sequences (categorized by `enum composition_method').
2920
2921   COMPOSITION_RELATIVE:
2922         ESC 0 CHAR [ CHAR ] ESC 1
2923   COMPOSITION_WITH_RULE:
2924         ESC 2 CHAR [ RULE CHAR ] ESC 1
2925   COMPOSITION_WITH_ALTCHARS:
2926         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2927   COMPOSITION_WITH_RULE_ALTCHARS:
2928         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2929
     /* Table of 256 `enum iso_code_class_type' entries, presumably one
        per byte value.  NOTE(review): nothing in this part of the file
        fills or reads it; confirm where it is initialized and how it is
        indexed.  */
2930 static enum iso_code_class_type iso_code_class[256];
2931
     /* True if charset ID has an entry in CODING's safe_charsets table,
        i.e. ID does not exceed CODING->max_charset_id, and that entry is
        not 255.  ID is evaluated twice when it is within range.  */
2932 #define SAFE_CHARSET_P(coding, id)      \
2933   ((coding)->max_charset_id >= (id)     \
2934    && 255 != (coding)->safe_charsets[id])
2935
     /* Build the safe-charsets table for the ISO-2022 coding system whose
        attribute vector is ATTRS, and store it in the
        coding_attr_safe_charsets slot of ATTRS.

        The table is a string indexed by charset id.  The entry for each
        charset in the coding system's charset list is the value requested
        for it in the coding_attr_iso_request alist or, failing that, the
        default taken from coding_attr_iso_usage for 94- or 96-character
        sets when that default is below 4.  Every other entry is 255.

        If the coding system has CODING_ISO_FLAG_FULL_SUPPORT and is not
        yet using Viso_2022_charset_list, it is switched to that list and
        the table is rebuilt.  Otherwise nothing is done when the slot
        already holds a string.  */
2936 static void
2937 setup_iso_safe_charsets (Lisp_Object attrs)
2938 {
2939   int iso_flags = XINT (AREF (attrs, coding_attr_iso_flags));
2940   Lisp_Object charsets = CODING_ATTR_CHARSET_LIST (attrs);
2941   Lisp_Object table, designations, usage, rest;
2942   EMACS_INT default94, default96;
2943   int highest_id = 0;
2944
2945   /* A full-support coding system always uses the global ISO-2022
2946      charset list; switching to it discards any table built for the
2947      previous list.  */
2948   if ((iso_flags & CODING_ISO_FLAG_FULL_SUPPORT) != 0
2949       && ! EQ (charsets, Viso_2022_charset_list))
2950     {
2951       charsets = Viso_2022_charset_list;
2952       ASET (attrs, coding_attr_charset_list, charsets);
2953       ASET (attrs, coding_attr_safe_charsets, Qnil);
2954     }
2955
2956   /* A string in this slot is a table that is already in place.  */
2957   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2958     return;
2959
2960   /* The table is indexed by charset id, so size it by the largest.  */
2961   for (rest = charsets; CONSP (rest); rest = XCDR (rest))
2962     {
2963       int id = XINT (XCAR (rest));
2964
2965       if (id > highest_id)
2966         highest_id = id;
2967     }
2968
2969   /* 255 is the value SAFE_CHARSET_P treats as "not safe".  */
2970   table = make_uninit_string (highest_id + 1);
2971   memset (SDATA (table), 255, highest_id + 1);
2972
2973   designations = AREF (attrs, coding_attr_iso_request);
2974   usage = AREF (attrs, coding_attr_iso_usage);
2975   default94 = XINT (XCAR (usage));
2976   default96 = XINT (XCDR (usage));
2977
2978   for (rest = charsets; CONSP (rest); rest = XCDR (rest))
2979     {
2980       Lisp_Object id = XCAR (rest);
2981       struct charset *cs = CHARSET_FROM_ID (XINT (id));
2982       Lisp_Object requested = Fcdr (Fassq (id, designations));
2983
2984       if (NILP (requested))
2985         {
2986           /* No explicit request: use the default for the charset's
2987              size class, but only when that default is below 4.  */
2988           EMACS_INT fallback = cs->iso_chars_96 ? default96 : default94;
2989
2990           if (fallback < 4)
2991             SSET (table, XINT (id), fallback);
2992         }
2993       else
2994         SSET (table, XINT (id), XINT (requested));
2995     }
2996   ASET (attrs, coding_attr_safe_charsets, table);
2997 }
2998
2999
3000 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3001    Return true if a text is encoded in one of ISO-2022 based coding
3002    systems.

        In terms of this code: the source of CODING is scanned starting
        after its first CODING->head_ascii bytes, collecting in `found'
        the ISO categories the bytes are evidence for and in `rejected'
        those they rule out.  Once every ISO category has been ruled out,
        all of them are marked rejected in DETECT_INFO and 0 is returned.
        Otherwise the function leaves through the `no_more_source' exit
        (presumably taken by ONE_MORE_BYTE at the end of the source --
        confirm against the macro), merges `rejected' and the categories
        found but not rejected into DETECT_INFO, and returns 1.  */
3003
3004 static bool
3005 detect_coding_iso_2022 (struct coding_system *coding,
3006                         struct coding_detection_info *detect_info)
3007 {
3008   const unsigned char *src = coding->source, *src_base = src;
3009   const unsigned char *src_end = coding->source + coding->src_bytes;
3010   bool multibytep = coding->src_multibyte;
3011   bool single_shifting = 0;
3012   int id;
3013   int c, c1;
3014   ptrdiff_t consumed_chars = 0;
3015   int i;
3016   int rejected = 0;
3017   int found = 0;
       /* Negative while no composition sequence is open; otherwise the
          number of characters counted since it started.  */
3018   int composition_count = -1;
3019
3020   detect_info->checked |= CATEGORY_MASK_ISO;
3021
       /* For each ISO category with a non-negative coding system id,
          rebuild the safe-charsets table when a full-support coding
          system is not yet on Viso_2022_charset_list, then cache the
          table's address and last valid index in the category.  */
3022   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3023     {
3024       struct coding_system *this = &(coding_categories[i]);
3025       Lisp_Object attrs, val;
3026
3027       if (this->id < 0)
3028         continue;
3029       attrs = CODING_ID_ATTRS (this->id);
3030       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3031           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3032         setup_iso_safe_charsets (attrs);
3033       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3034       this->max_charset_id = SCHARS (val) - 1;
3035       this->safe_charsets = SDATA (val);
3036     }
3037
3038   /* A coding system of this category is always ASCII compatible.  */
3039   src += coding->head_ascii;
3040
3041   while (rejected != CATEGORY_MASK_ISO)
3042     {
3043       src_base = src;
3044       ONE_MORE_BYTE (c);
3045       switch (c)
3046         {
3047         case ISO_CODE_ESC:
3048           if (inhibit_iso_escape_detection)
3049             break;
3050           single_shifting = 0;
3051           ONE_MORE_BYTE (c);
3052           if (c == 'N' || c == 'O')
3053             {
3054               /* ESC <Fe> for SS2 or SS3.  */
3055               single_shifting = 1;
3056               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3057             }
3058           else if (c == '1')
3059             {
3060               /* End of composition.  */
3061               if (composition_count < 0
3062                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3063                 /* Invalid */
3064                 break;
3065               composition_count = -1;
3066               found |= CATEGORY_MASK_ISO;
3067             }
3068           else if (c >= '0' && c <= '4')
3069             {
3070               /* ESC <Fp> for start/end composition.  */
3071               composition_count = 0;
3072             }
3073           else
3074             {
3075               if (c >= '(' && c <= '/')
3076                 {
3077                   /* Designation sequence for a charset of dimension 1.  */
3078                   ONE_MORE_BYTE (c1);
3079                   if (c1 < ' ' || c1 >= 0x80
3080                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3081                     /* Invalid designation sequence.  Just ignore.  */
3082                     break;
3083                 }
3084               else if (c == '$')
3085                 {
3086                   /* Designation sequence for a charset of dimension 2.  */
3087                   ONE_MORE_BYTE (c);
3088                   if (c >= '@' && c <= 'B')
3089                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3090                     id = iso_charset_table[1][0][c];
3091                   else if (c >= '(' && c <= '/')
3092                     {
3093                       ONE_MORE_BYTE (c1);
3094                       if (c1 < ' ' || c1 >= 0x80
3095                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3096                         /* Invalid designation sequence.  Just ignore.  */
3097                         break;
3098                     }
3099                   else
3100                     /* Invalid designation sequence.  Just ignore it.  */
3101                     break;
3102                 }
3103               else
3104                 {
3105                   /* Invalid escape sequence.  Just ignore it.  */
3106                   break;
3107                 }
3108
3109               /* We found a valid designation sequence for CHARSET.  */
                   /* Each category checked below is found or rejected
                      according to whether its safe-charsets table has
                      an entry for the designated charset.  */
3110               rejected |= CATEGORY_MASK_ISO_8BIT;
3111               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3112                                   id))
3113                 found |= CATEGORY_MASK_ISO_7;
3114               else
3115                 rejected |= CATEGORY_MASK_ISO_7;
3116               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3117                                   id))
3118                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3119               else
3120                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3121               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3122                                   id))
3123                 found |= CATEGORY_MASK_ISO_7_ELSE;
3124               else
3125                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3126               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3127                                   id))
3128                 found |= CATEGORY_MASK_ISO_8_ELSE;
3129               else
3130                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3131             }
3132           break;
3133
3134         case ISO_CODE_SO:
3135         case ISO_CODE_SI:
3136           /* Locking shift out/in.  */
3137           if (inhibit_iso_escape_detection)
3138             break;
3139           single_shifting = 0;
3140           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3141           break;
3142
3143         case ISO_CODE_CSI:
3144           /* Control sequence introducer.  */
3145           single_shifting = 0;
3146           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3147           found |= CATEGORY_MASK_ISO_8_ELSE;
3148           goto check_extra_latin;
3149
3150         case ISO_CODE_SS2:
3151         case ISO_CODE_SS3:
3152           /* Single shift.   */
3153           if (inhibit_iso_escape_detection)
3154             break;
3155           single_shifting = 0;
3156           rejected |= CATEGORY_MASK_ISO_7BIT;
3157           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3158               & CODING_ISO_FLAG_SINGLE_SHIFT)
3159             {
3160               found |= CATEGORY_MASK_ISO_8_1;
3161               single_shifting = 1;
3162             }
3163           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3164               & CODING_ISO_FLAG_SINGLE_SHIFT)
3165             {
3166               found |= CATEGORY_MASK_ISO_8_2;
3167               single_shifting = 1;
3168             }
               /* If neither 8-bit category uses single shifts, treat the
                  byte as a possible Latin extra code instead.  */
3169           if (single_shifting)
3170             break;
3171           goto check_extra_latin;
3172
3173         default:
3174           if (c < 0)
3175             continue;
3176           if (c < 0x80)
3177             {
3178               if (composition_count >= 0)
3179                 composition_count++;
3180               single_shifting = 0;
3181               break;
3182             }
3183           if (c >= 0xA0)
3184             {
3185               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3186               found |= CATEGORY_MASK_ISO_8_1;
3187               /* Check the length of succeeding codes of the range
3188                  0xA0..0xFF.  If the byte length is even, we include
3189                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3190                  only when we are not single shifting.  */
3191               if (! single_shifting
3192                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3193                 {
3194                   ptrdiff_t len = 1;
3195                   while (src < src_end)
3196                     {
3197                       src_base = src;
3198                       ONE_MORE_BYTE (c);
3199                       if (c < 0xA0)
3200                         {
3201                           src = src_base;
3202                           break;
3203                         }
3204                       len++;
3205                     }
3206
                       /* An odd-length run that stopped before the end of
                          the source cannot consist of two-byte codes.  */
3207                   if (len & 1 && src < src_end)
3208                     {
3209                       rejected |= CATEGORY_MASK_ISO_8_2;
3210                       if (composition_count >= 0)
3211                         composition_count += len;
3212                     }
3213                   else
3214                     {
3215                       found |= CATEGORY_MASK_ISO_8_2;
3216                       if (composition_count >= 0)
3217                         composition_count += len / 2;
3218                     }
3219                 }
3220               break;
3221             }
               /* C is a code in 0x80..0x9F when control falls in from
                  above; CSI, SS2 and SS3 also arrive here by goto.  The
                  byte is acceptable only if Vlatin_extra_code_table has a
                  non-nil entry for it; otherwise every ISO category is
                  rejected, which ends the loop.  */
3222         check_extra_latin:
3223           if (! VECTORP (Vlatin_extra_code_table)
3224               || NILP (AREF (Vlatin_extra_code_table, c)))
3225             {
3226               rejected = CATEGORY_MASK_ISO;
3227               break;
3228             }
3229           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3230               & CODING_ISO_FLAG_LATIN_EXTRA)
3231             found |= CATEGORY_MASK_ISO_8_1;
3232           else
3233             rejected |= CATEGORY_MASK_ISO_8_1;
3234           rejected |= CATEGORY_MASK_ISO_8_2;
3235           break;
3236         }
3237     }
       /* The loop ends only when every ISO category has been rejected.  */
3238   detect_info->rejected |= CATEGORY_MASK_ISO;
3239   return 0;
3240
       /* NOTE(review): nothing in this function jumps here explicitly;
          presumably ONE_MORE_BYTE does when the source is exhausted.  */
3241  no_more_source:
3242   detect_info->rejected |= rejected;
3243   detect_info->found |= (found & ~rejected);
3244   return 1;
3245 }
3246
3247
/* Designate to graphic register REG of CODING the charset that
   ISO_CHARSET_TABLE gives for DIM, CHARS_96 and FINAL.

   The designation is invalid when FINAL is outside '0'..127, when
   ISO_CHARSET_TABLE yields a negative id, or when SAFE_CHARSET_P
   rejects the charset for CODING; REG is then marked with -2 and
   CHARS_96 is set to -1.  Otherwise the charset id is recorded for
   REG, after replacing JISX0201-Roman by ASCII under
   CODING_ISO_FLAG_USE_ROMAN and JISX0208.1978 by JISX0208 under
   CODING_ISO_FLAG_USE_OLDJIS.

   CHARS_96 must be an lvalue.  It is set to -1 whenever the escape
   sequence should be kept: on an invalid designation, and when ASCII
   is designated to a register whose previous designation was
   invalid.  Uses the variable `coding' of the expansion site.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id;                                                             \
                                                                        \
    if (final >= '0' && final < 128                                     \
        && (id = ISO_CHARSET_TABLE (dim, chars_96, final)) >= 0         \
        && SAFE_CHARSET_P (coding, id))                                 \
      {                                                                 \
        int prev = CODING_ISO_DESIGNATION (coding, reg);                \
                                                                        \
        if (id == charset_jisx0201_roman)                               \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              id = charset_ascii;                                       \
          }                                                             \
        else if (id == charset_jisx0208_1978)                           \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              id = charset_jisx0208;                                    \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = id;                      \
        /* Keep this escape sequence if it designates ASCII to a      \
           register that held an invalid designation.  */             \
        if (prev == -2 && id == charset_ascii)                          \
          chars_96 = -1;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Invalid designation: remember it and keep the sequence.  */  \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
  } while (0)
3280
3281
3282 /* Handle these composition sequences (ALT: alternate char):
3283
3284    (1) relative composition: ESC 0 CHAR ... ESC 1
3285    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3286    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3287    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3288
3289    When the start sequence (ESC 0/2/3/4) is found, this annotation
3290    header is produced.
3291
3292         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3293
3294    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3295    produced until the end sequence (ESC 1) is found:
3296
3297    (1) CHAR ... CHAR
3298    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3299    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3300    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3301
3302    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3303    annotation header are updated as below:
3304
3305    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3306    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3307    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3308    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3309
3310    If an error is found while composing, the annotation header is
3311    changed to:
3312
3313         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3314
3315    and the sequence [ -2 DECODED-RULE ] is changed to the original
3316    byte sequence as below:
3317         o the original byte sequence is B: [ B -1 ]
3318         o the original byte sequence is B1 B2: [ B1 B2 ]
3319    and the sequence [ -1 -1 ] is changed to the original byte
3320    sequence:
3321         [ ESC '0' ]
3322 */
3323
/* Decode the composition rule that starts with the byte C1 and store
   the encoded rule in RULE, which must be an int lvalue.

   If C1 - 32 is in 0..80 it is a complete old-format rule (before
   ver.21): the global and new reference points are its quotient and
   remainder by 9, with the value 4 mapped to 10.  A larger value
   starts a new-format rule (after ver.21); one more byte is read with
   ONE_MORE_BYTE and 0x100 is added to the result to tell it from an
   old-format rule.

   Jump to invalid_code if C1 < 32 or if the new-format rule fails
   COMPOSITION_ENCODE_RULE_VALID.  Uses c1 and the label invalid_code
   of the expansion site.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int gref = rule - 81;                                           \
        int nref;                                                       \
                                                                        \
        ONE_MORE_BYTE (nref);                                           \
        nref -= 32;                                                     \
        if (! COMPOSITION_ENCODE_RULE_VALID (gref, nref))               \
          goto invalid_code;                                            \
        /* Adding 0x100 distinguishes it from the old format.  */       \
        rule = COMPOSITION_ENCODE_RULE (gref, nref) + 0x100;            \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (rule) / 9;                                          \
        int nref = (rule) % 9;                                          \
                                                                        \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
  } while (0)
3352
/* Convert the decoded composition rule RULE back to the codes of its
   original byte sequence and store them in charbuf[idx] and
   charbuf[idx + 1].

   A rule below 0x100 is in the old format: one code
   32 + GREF * 9 + NREF is stored (reference point 10 is mapped back
   to 4), followed by -1, and new_chars is incremented by 1.
   Otherwise the rule is in the new format: the two codes
   32 + 81 + GREF and 32 + NREF are stored and new_chars is
   incremented by 2.

   RULE is evaluated exactly once, before anything is stored, so it
   may be an arbitrary expression, including one that reads
   charbuf[idx + 1].  Uses the variables charbuf, idx and new_chars
   of the expansion site.  */
#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int encoded_rule = (rule);                                          \
    int gref = (encoded_rule % 0x100) / 12;                             \
    int nref = (encoded_rule % 0x100) % 12;                             \
                                                                        \
    if (encoded_rule < 0x100)   /* old format */                        \
      {                                                                 \
        if (gref == 10) gref = 4;                                       \
        if (nref == 10) nref = 4;                                       \
        charbuf[idx] = 32 + gref * 9 + nref;                            \
        charbuf[idx + 1] = -1;                                          \
        new_chars++;                                                    \
      }                                                                 \
    else                        /* new format */                        \
      {                                                                 \
        charbuf[idx] = 32 + 81 + gref;                                  \
        charbuf[idx + 1] = 32 + nref;                                   \
        new_chars += 2;                                                 \
      }                                                                 \
  } while (0)
3372
3373 /* Finish the current composition as invalid.  */
3374
3375 static int
3376 finish_composition (int *charbuf, struct composition_status *cmp_status)
3377 {
3378   int idx = - cmp_status->length;
3379   int new_chars;
3380
3381   /* Recover the original ESC sequence */
3382   charbuf[idx++] = ISO_CODE_ESC;
3383   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3384                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3385                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3386                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3387                     : '4');
3388   charbuf[idx++] = -2;
3389   charbuf[idx++] = 0;
3390   charbuf[idx++] = -1;
3391   new_chars = cmp_status->nchars;
3392   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3393     for (; idx < 0; idx++)
3394       {
3395         int elt = charbuf[idx];
3396
3397         if (elt == -2)
3398           {
3399             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3400             idx++;
3401           }
3402         else if (elt == -1)
3403           {
3404             charbuf[idx++] = ISO_CODE_ESC;
3405             charbuf[idx] = '0';
3406             new_chars += 2;
3407           }
3408       }
3409   cmp_status->state = COMPOSING_NO;
3410   return new_chars;
3411 }
3412
/* If a composition is in progress (cmp_status->state is not
   COMPOSING_NO), finish it with finish_composition and add the value
   it returns to char_offset; otherwise do nothing.  Uses the
   variables cmp_status, charbuf and char_offset of the expansion
   site.  */
#define MAYBE_FINISH_COMPOSITION()                                      \
  do {                                                                  \
    if (cmp_status->state == COMPOSING_NO)                              \
      break;                                                            \
    char_offset += finish_composition (charbuf, cmp_status);            \
  } while (0)
3419
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte following ESC; it is evaluated exactly once.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   If C1 is '0' and the alternate-char part of an ESC 3 or ESC 4
   composition is being read, this ESC 0 only ends that part: the
   separator [ -1 -1 ] is stored and the state becomes COMPOSING_CHAR.

   Otherwise any composition in progress is finished as invalid and a
   new one is started by producing this annotation sequence now:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   Uses the variables cmp_status, charbuf, char_offset and coding of
   the expansion site.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int start_code = (c1);                                                 \
                                                                           \
    if (start_code == '0'                                                  \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        /* End of the ALT part; the composed chars follow.  */             \
        *charbuf++ = -1;                                                   \
        *charbuf++ = -1;                                                   \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        cmp_status->method                                                 \
          = (start_code == '0' ? COMPOSITION_RELATIVE                      \
             : start_code == '2' ? COMPOSITION_WITH_RULE                   \
             : start_code == '3' ? COMPOSITION_WITH_ALTCHARS               \
             : COMPOSITION_WITH_RULE_ALTCHARS);                            \
        /* ESC 3 and ESC 4 begin with the alternate chars.  */             \
        cmp_status->state                                                  \
          = (start_code <= '2' ? COMPOSING_CHAR                            \
             : COMPOSING_COMPONENT_CHAR);                                  \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3460
3461
/* Handle composition end sequence ESC 1.

   The sequence is invalid if no composed char has been stored yet, or
   if it arrives in the wrong state: a COMPOSITION_WITH_RULE
   composition must not end while the state is COMPOSING_CHAR, and
   every other method must end in exactly that state.  An invalid end
   finishes the composition as invalid and jumps to invalid_code.

   Otherwise the annotation header at charbuf[-cmp_status->length] is
   completed: the stored -LENGTH is decreased by NCOMPS + 2 for
   COMPOSITION_WITH_ALTCHARS or by NCOMPS * 3 for
   COMPOSITION_WITH_RULE_ALTCHARS, NCHARS is stored in the third
   header element, char_offset is advanced by NCHARS, and the state
   becomes COMPOSING_NO.  Uses the variables cmp_status, charbuf and
   char_offset, and the label invalid_code, of the expansion site.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    bool expecting_char = cmp_status->state == COMPOSING_CHAR;          \
    bool rule_based = cmp_status->method == COMPOSITION_WITH_RULE;      \
    int *header;                                                        \
                                                                        \
    if (cmp_status->nchars == 0 || expecting_char == rule_based)        \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    header = charbuf - cmp_status->length;                              \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_ALTCHARS:                                   \
        header[0] -= cmp_status->ncomps + 2;                            \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        header[0] -= cmp_status->ncomps * 3;                            \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
    header[2] = cmp_status->nchars;                                     \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3481
/* Append the pair [ -2 RULE ] to charbuf for the decoded composition
   rule RULE, add 2 to cmp_status->length, and decrement
   cmp_status->state (presumably stepping from a *_RULE state back to
   the matching *_CHAR state; confirm against the enum order).  Uses
   the variables charbuf and cmp_status of the expansion site.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = rule;                  \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state--;                \
  } while (0)
3491
/* Append C, a composed char or a component (alternate) char, to
   charbuf and update cmp_status: length grows by one, and nchars is
   incremented when the state is COMPOSING_CHAR, ncomps otherwise.
   The state is then incremented when a rule must follow the char,
   i.e. for COMPOSITION_WITH_RULE, and for
   COMPOSITION_WITH_RULE_ALTCHARS while in the state
   COMPOSING_COMPONENT_CHAR.  Uses the variables charbuf and
   cmp_status of the expansion site.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    charbuf[0] = (c);                                                   \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_RULE:                                       \
        cmp_status->state++;                                            \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        if (cmp_status->state == COMPOSING_COMPONENT_CHAR)              \
          cmp_status->state++;                                          \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
  } while (0)
3508
3509
3510 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3511
3512 static void
3513 decode_coding_iso_2022 (struct coding_system *coding)
3514 {
3515   const unsigned char *src = coding->source + coding->consumed;
3516   const unsigned char *src_end = coding->source + coding->src_bytes;
3517   const unsigned char *src_base;
3518   int *charbuf = coding->charbuf + coding->charbuf_used;
3519   /* We may produce two annotations (charset and composition) in one
3520      loop and one more charset annotation at the end.  */
3521   int *charbuf_end
3522     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3523   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3524   bool multibytep = coding->src_multibyte;
3525   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3526   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3527   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3528   int charset_id_2, charset_id_3;
3529   struct charset *charset;
3530   int c;
3531   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3532   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3533   ptrdiff_t char_offset = coding->produced_char;
3534   ptrdiff_t last_offset = char_offset;
3535   int last_id = charset_ascii;
3536   bool eol_dos
3537     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3538   int byte_after_cr = -1;
3539   int i;
3540
3541   setup_iso_safe_charsets (attrs);
3542   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3543
3544   if (cmp_status->state != COMPOSING_NO)
3545     {
3546       if (charbuf_end - charbuf < cmp_status->length)
3547         emacs_abort ();
3548       for (i = 0; i < cmp_status->length; i++)
3549         *charbuf++ = cmp_status->carryover[i];
3550       coding->annotated = 1;
3551     }
3552
3553   while (1)
3554     {
3555       int c1, c2, c3;
3556
3557       src_base = src;
3558       consumed_chars_base = consumed_chars;
3559
3560       if (charbuf >= charbuf_end)
3561         {
3562           if (byte_after_cr >= 0)
3563             src_base--;
3564           break;
3565         }
3566
3567       if (byte_after_cr >= 0)
3568         c1 = byte_after_cr, byte_after_cr = -1;
3569       else
3570         ONE_MORE_BYTE (c1);
3571       if (c1 < 0)
3572         goto invalid_code;
3573
3574       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3575         {
3576           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3577           char_offset++;
3578           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3579           continue;
3580         }
3581
3582       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3583         {
3584           if (c1 == ISO_CODE_ESC)
3585             {
3586               if (src + 1 >= src_end)
3587                 goto no_more_source;
3588               *charbuf++ = ISO_CODE_ESC;
3589               char_offset++;
3590               if (src[0] == '%' && src[1] == '@')
3591                 {
3592                   src += 2;
3593                   consumed_chars += 2;
3594                   char_offset += 2;
3595                   /* We are sure charbuf can contain two more chars. */
3596                   *charbuf++ = '%';
3597                   *charbuf++ = '@';
3598                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3599                 }
3600             }
3601           else
3602             {
3603               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3604               char_offset++;
3605             }
3606           continue;
3607         }
3608
3609       if ((cmp_status->state == COMPOSING_RULE
3610            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3611           && c1 != ISO_CODE_ESC)
3612         {
3613           int rule;
3614
3615           DECODE_COMPOSITION_RULE (rule);
3616           STORE_COMPOSITION_RULE (rule);
3617           continue;
3618         }
3619
3620       /* We produce at most one character.  */
3621       switch (iso_code_class [c1])
3622         {
3623         case ISO_0x20_or_0x7F:
3624           if (charset_id_0 < 0
3625               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3626             /* This is SPACE or DEL.  */
3627             charset = CHARSET_FROM_ID (charset_ascii);
3628           else
3629             charset = CHARSET_FROM_ID (charset_id_0);
3630           break;
3631
3632         case ISO_graphic_plane_0:
3633           if (charset_id_0 < 0)
3634             charset = CHARSET_FROM_ID (charset_ascii);
3635           else
3636             charset = CHARSET_FROM_ID (charset_id_0);
3637           break;
3638
3639         case ISO_0xA0_or_0xFF:
3640           if (charset_id_1 < 0
3641               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3642               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3643             goto invalid_code;
3644           /* This is a graphic character, we fall down ... */
3645
3646         case ISO_graphic_plane_1:
3647           if (charset_id_1 < 0)
3648             goto invalid_code;
3649           charset = CHARSET_FROM_ID (charset_id_1);
3650           break;
3651
3652         case ISO_control_0:
3653           if (eol_dos && c1 == '\r')
3654             ONE_MORE_BYTE (byte_after_cr);
3655           MAYBE_FINISH_COMPOSITION ();
3656           charset = CHARSET_FROM_ID (charset_ascii);
3657           break;
3658
3659         case ISO_control_1:
3660           goto invalid_code;
3661
3662         case ISO_shift_out:
3663           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3664               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3665             goto invalid_code;
3666           CODING_ISO_INVOCATION (coding, 0) = 1;
3667           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3668           continue;
3669
3670         case ISO_shift_in:
3671           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3672             goto invalid_code;
3673           CODING_ISO_INVOCATION (coding, 0) = 0;
3674           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3675           continue;
3676
3677         case ISO_single_shift_2_7:
3678           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3679             goto invalid_code;
3680         case ISO_single_shift_2:
3681           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3682             goto invalid_code;
3683           /* SS2 is handled as an escape sequence of ESC 'N' */
3684           c1 = 'N';
3685           goto label_escape_sequence;
3686
3687         case ISO_single_shift_3:
3688           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3689             goto invalid_code;
3690           /* SS2 is handled as an escape sequence of ESC 'O' */
3691           c1 = 'O';
3692           goto label_escape_sequence;
3693
3694         case ISO_control_sequence_introducer:
3695           /* CSI is handled as an escape sequence of ESC '[' ...  */
3696           c1 = '[';
3697           goto label_escape_sequence;
3698
3699         case ISO_escape:
3700           ONE_MORE_BYTE (c1);
3701         label_escape_sequence:
3702           /* Escape sequences handled here are invocation,
3703              designation, direction specification, and character
3704              composition specification.  */
3705           switch (c1)
3706             {
3707             case '&':           /* revision of following character set */
3708               ONE_MORE_BYTE (c1);
3709               if (!(c1 >= '@' && c1 <= '~'))
3710                 goto invalid_code;
3711               ONE_MORE_BYTE (c1);
3712               if (c1 != ISO_CODE_ESC)
3713                 goto invalid_code;
3714               ONE_MORE_BYTE (c1);
3715               goto label_escape_sequence;
3716
3717             case '$':           /* designation of 2-byte character set */
3718               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3719                 goto invalid_code;
3720               {
3721                 int reg, chars96;
3722
3723                 ONE_MORE_BYTE (c1);
3724                 if (c1 >= '@' && c1 <= 'B')
3725                   {     /* designation of JISX0208.1978, GB2312.1980,
3726                            or JISX0208.1980 */
3727                     reg = 0, chars96 = 0;
3728                   }
3729                 else if (c1 >= 0x28 && c1 <= 0x2B)
3730                   { /* designation of DIMENSION2_CHARS94 character set */
3731                     reg = c1 - 0x28, chars96 = 0;
3732                     ONE_MORE_BYTE (c1);
3733                   }
3734                 else if (c1 >= 0x2C && c1 <= 0x2F)
3735                   { /* designation of DIMENSION2_CHARS96 character set */
3736                     reg = c1 - 0x2C, chars96 = 1;
3737                     ONE_MORE_BYTE (c1);
3738                   }
3739                 else
3740                   goto invalid_code;
3741                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3742                 /* We must update these variables now.  */
3743                 if (reg == 0)
3744                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3745                 else if (reg == 1)
3746                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3747                 if (chars96 < 0)
3748                   goto invalid_code;
3749               }
3750               continue;
3751
3752             case 'n':           /* invocation of locking-shift-2 */
3753               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3754                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3755                 goto invalid_code;
3756               CODING_ISO_INVOCATION (coding, 0) = 2;
3757               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3758               continue;
3759
3760             case 'o':           /* invocation of locking-shift-3 */
3761               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3762                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3763                 goto invalid_code;
3764               CODING_ISO_INVOCATION (coding, 0) = 3;
3765               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3766               continue;
3767
3768             case 'N':           /* invocation of single-shift-2 */
3769               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3770                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3771                 goto invalid_code;
3772               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3773               if (charset_id_2 < 0)
3774                 charset = CHARSET_FROM_ID (charset_ascii);
3775               else
3776                 charset = CHARSET_FROM_ID (charset_id_2);
3777               ONE_MORE_BYTE (c1);
3778               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3779                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3780                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3781                           ? c1 >= 0x80 : c1 < 0x80)))
3782                 goto invalid_code;
3783               break;
3784
3785             case 'O':           /* invocation of single-shift-3 */
3786               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3787                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3788                 goto invalid_code;
3789               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3790               if (charset_id_3 < 0)
3791                 charset = CHARSET_FROM_ID (charset_ascii);
3792               else
3793                 charset = CHARSET_FROM_ID (charset_id_3);
3794               ONE_MORE_BYTE (c1);
3795               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3796                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3797                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3798                           ? c1 >= 0x80 : c1 < 0x80)))
3799                 goto invalid_code;
3800               break;
3801
3802             case '0': case '2': case '3': case '4': /* start composition */
3803               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3804                 goto invalid_code;
3805               if (last_id != charset_ascii)
3806                 {
3807                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3808                   last_id = charset_ascii;
3809                   last_offset = char_offset;
3810                 }
3811               DECODE_COMPOSITION_START (c1);
3812               continue;
3813
3814             case '1':           /* end composition */
3815               if (cmp_status->state == COMPOSING_NO)
3816                 goto invalid_code;
3817               DECODE_COMPOSITION_END ();
3818               continue;
3819
3820             case '[':           /* specification of direction */
3821               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3822                 goto invalid_code;
3823               /* For the moment, nested direction is not supported.
3824                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3825                  left-to-right, and nonzero means right-to-left.  */
3826               ONE_MORE_BYTE (c1);
3827               switch (c1)
3828                 {
3829                 case ']':       /* end of the current direction */
3830                   coding->mode &= ~CODING_MODE_DIRECTION;
3831
3832                 case '0':       /* end of the current direction */
3833                 case '1':       /* start of left-to-right direction */
3834                   ONE_MORE_BYTE (c1);
3835                   if (c1 == ']')
3836                     coding->mode &= ~CODING_MODE_DIRECTION;
3837                   else
3838                     goto invalid_code;
3839                   break;
3840
3841                 case '2':       /* start of right-to-left direction */
3842                   ONE_MORE_BYTE (c1);
3843                   if (c1 == ']')
3844                     coding->mode |= CODING_MODE_DIRECTION;
3845                   else
3846                     goto invalid_code;
3847                   break;
3848
3849                 default:
3850                   goto invalid_code;
3851                 }
3852               continue;
3853
3854             case '%':
3855               ONE_MORE_BYTE (c1);
3856               if (c1 == '/')
3857                 {
3858                   /* CTEXT extended segment:
3859                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3860                      We keep these bytes as is for the moment.
3861                      They may be decoded by post-read-conversion.  */
3862                   int dim, M, L;
3863                   int size;
3864
3865                   ONE_MORE_BYTE (dim);
3866                   if (dim < '0' || dim > '4')
3867                     goto invalid_code;
3868                   ONE_MORE_BYTE (M);
3869                   if (M < 128)
3870                     goto invalid_code;
3871                   ONE_MORE_BYTE (L);
3872                   if (L < 128)
3873                     goto invalid_code;
3874                   size = ((M - 128) * 128) + (L - 128);
3875                   if (charbuf + 6 > charbuf_end)
3876                     goto break_loop;
3877                   *charbuf++ = ISO_CODE_ESC;
3878                   *charbuf++ = '%';
3879                   *charbuf++ = '/';
3880                   *charbuf++ = dim;
3881                   *charbuf++ = BYTE8_TO_CHAR (M);
3882                   *charbuf++ = BYTE8_TO_CHAR (L);
3883                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3884                 }
3885               else if (c1 == 'G')
3886                 {
3887                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3888                      ESC % G --UTF-8-BYTES-- ESC % @
3889                      We keep these bytes as is for the moment.
3890                      They may be decoded by post-read-conversion.  */
3891                   if (charbuf + 3 > charbuf_end)
3892                     goto break_loop;
3893                   *charbuf++ = ISO_CODE_ESC;
3894                   *charbuf++ = '%';
3895                   *charbuf++ = 'G';
3896                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3897                 }
3898               else
3899                 goto invalid_code;
3900               continue;
3901               break;
3902
3903             default:
3904               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3905                 goto invalid_code;
3906               {
3907                 int reg, chars96;
3908
3909                 if (c1 >= 0x28 && c1 <= 0x2B)
3910                   { /* designation of DIMENSION1_CHARS94 character set */
3911                     reg = c1 - 0x28, chars96 = 0;
3912                     ONE_MORE_BYTE (c1);
3913                   }
3914                 else if (c1 >= 0x2C && c1 <= 0x2F)
3915                   { /* designation of DIMENSION1_CHARS96 character set */
3916                     reg = c1 - 0x2C, chars96 = 1;
3917                     ONE_MORE_BYTE (c1);
3918                   }
3919                 else
3920                   goto invalid_code;
3921                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3922                 /* We must update these variables now.  */
3923                 if (reg == 0)
3924                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3925                 else if (reg == 1)
3926                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3927                 if (chars96 < 0)
3928                   goto invalid_code;
3929               }
3930               continue;
3931             }
3932           break;
3933
3934         default:
3935           emacs_abort ();
3936         }
3937
3938       if (cmp_status->state == COMPOSING_NO
3939           && charset->id != charset_ascii
3940           && last_id != charset->id)
3941         {
3942           if (last_id != charset_ascii)
3943             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3944           last_id = charset->id;
3945           last_offset = char_offset;
3946         }
3947
3948       /* Now we know CHARSET and 1st position code C1 of a character.
3949          Produce a decoded character while getting 2nd and 3rd
3950          position codes C2, C3 if necessary.  */
3951       if (CHARSET_DIMENSION (charset) > 1)
3952         {
3953           ONE_MORE_BYTE (c2);
3954           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3955               || ((c1 & 0x80) != (c2 & 0x80)))
3956             /* C2 is not in a valid range.  */
3957             goto invalid_code;
3958           if (CHARSET_DIMENSION (charset) == 2)
3959             c1 = (c1 << 8) | c2;
3960           else
3961             {
3962               ONE_MORE_BYTE (c3);
3963               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3964                   || ((c1 & 0x80) != (c3 & 0x80)))
3965                 /* C3 is not in a valid range.  */
3966                 goto invalid_code;
3967               c1 = (c1 << 16) | (c2 << 8) | c2;
3968             }
3969         }
3970       c1 &= 0x7F7F7F;
3971       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3972       if (c < 0)
3973         {
3974           MAYBE_FINISH_COMPOSITION ();
3975           for (; src_base < src; src_base++, char_offset++)
3976             {
3977               if (ASCII_CHAR_P (*src_base))
3978                 *charbuf++ = *src_base;
3979               else
3980                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3981             }
3982         }
3983       else if (cmp_status->state == COMPOSING_NO)
3984         {
3985           *charbuf++ = c;
3986           char_offset++;
3987         }
3988       else if ((cmp_status->state == COMPOSING_CHAR
3989                 ? cmp_status->nchars
3990                 : cmp_status->ncomps)
3991                >= MAX_COMPOSITION_COMPONENTS)
3992         {
3993           /* Too long composition.  */
3994           MAYBE_FINISH_COMPOSITION ();
3995           *charbuf++ = c;
3996           char_offset++;
3997         }
3998       else
3999         STORE_COMPOSITION_CHAR (c);
4000       continue;
4001
4002     invalid_code:
4003       MAYBE_FINISH_COMPOSITION ();
4004       src = src_base;
4005       consumed_chars = consumed_chars_base;
4006       ONE_MORE_BYTE (c);
4007       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
4008       char_offset++;
4009       coding->errors++;
4010       /* Reset the invocation and designation status to the safest
4011          one; i.e. designate ASCII to the graphic register 0, and
4012          invoke that register to the graphic plane 0.  This typically
4013          helps the case that an designation sequence for ASCII "ESC (
4014          B" is somehow broken (e.g. broken by a newline).  */
4015       CODING_ISO_INVOCATION (coding, 0) = 0;
4016       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4017       charset_id_0 = charset_ascii;
4018       continue;
4019
4020     break_loop:
4021       break;
4022     }
4023
4024  no_more_source:
4025   if (cmp_status->state != COMPOSING_NO)
4026     {
4027       if (coding->mode & CODING_MODE_LAST_BLOCK)
4028         MAYBE_FINISH_COMPOSITION ();
4029       else
4030         {
4031           charbuf -= cmp_status->length;
4032           for (i = 0; i < cmp_status->length; i++)
4033             cmp_status->carryover[i] = charbuf[i];
4034         }
4035     }
4036   else if (last_id != charset_ascii)
4037     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4038   coding->consumed_char += consumed_chars_base;
4039   coding->consumed = src_base - coding->source;
4040   coding->charbuf_used = charbuf - coding->charbuf;
4041 }
4042
4043
4044 /* ISO2022 encoding stuff.  */
4045
4046 /*
   Just saying "ISO2022" is not enough for encoding; we have to
   specify more details.  In Emacs, each coding system of the ISO2022
   variant has the following specifications:
4050         1. Initial designation to G0 thru G3.
4051         2. Allows short-form designation?
4052         3. ASCII should be designated to G0 before control characters?
4053         4. ASCII should be designated to G0 at end of line?
4054         5. 7-bit environment or 8-bit environment?
4055         6. Use locking-shift?
4056         7. Use Single-shift?
4057    And the following two are only for Japanese:
4058         8. Use ASCII in place of JIS0201-1976-Roman?
4059         9. Use JISX0208-1983 in place of JISX0208-1978?
4060    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4061    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4062    details.
4063 */
4064
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  When CODING has CODING_ISO_FLAG_REVISION set and CHARSET
   has a non-negative revision number, the sequence is preceded by
   "ESC & <'@' + revision>".  For a multi-dimension 94-charset
   designated to register 0 whose <final-char> is '@', 'A', or 'B',
   the intermediate character is omitted (short form) unless
   CODING_ISO_FLAG_LONG_FORM is set.

   The bytes are produced through the EMIT_* macros, which presumably
   rely on variables of the expansion site (such as DST).  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *set94_intermediates = "()*+";                           \
    const char *set96_intermediates = ",-./";                           \
    const char *intermediates                                           \
      = (CHARSET_ISO_CHARS_96 (charset)                                 \
         ? set96_intermediates : set94_intermediates);                  \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int rev = -1;                                                       \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      rev = CHARSET_ISO_REVISION (charset);                             \
    if (rev >= 0)                                                       \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + rev);                                      \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
    /* Only a multi-dimension 94-charset may drop the intermediate      \
       character; every other case emits it.  */                        \
    if (CHARSET_DIMENSION (charset) == 1                                \
        || CHARSET_ISO_CHARS_96 (charset)                               \
        || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
        || reg != 0                                                     \
        || final_byte < '@' || final_byte > 'B')                        \
      EMIT_ONE_ASCII_BYTE (intermediates[reg]);                         \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4112
4113
4114 /* The following two macros produce codes (control character or escape
4115    sequence) for ISO2022 single-shift functions (single-shift-2 and
4116    single-shift-3).  */
4117
/* Single-shift-2: emit the single byte ISO_CODE_SS2, or "ESC N" when
   CODING is limited to seven bits, then mark CODING as being in the
   middle of a single shift.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4126
4127
/* Single-shift-3: emit the single byte ISO_CODE_SS3, or "ESC O" when
   CODING is limited to seven bits, then mark CODING as being in the
   middle of a single shift.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4136
4137
4138 /* The following four macros produce codes (control character or
4139    escape sequence) for ISO2022 locking-shift functions (shift-in,
4140    shift-out, locking-shift-2, and locking-shift-3).  */
4141
/* Shift-in: record that graphic register 0 is invoked to graphic
   plane 0 and emit ISO_CODE_SI.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4147
4148
/* Shift-out: record that graphic register 1 is invoked to graphic
   plane 0 and emit ISO_CODE_SO.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4154
4155
/* Locking-shift-2: record that graphic register 2 is invoked to
   graphic plane 0 and emit "ESC n".  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4161
4162
/* Locking-shift-3 (LS3): emit "ESC o" and record that graphic
   register 3 is invoked to graphic plane 0.  The final byte must be
   'o': "ESC n" is locking-shift-2 (see ENCODE_LOCKING_SHIFT_2), so
   emitting it here would make a reader invoke G2 while CODING
   records G3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4168
4169
/* Emit the position-code C1 of a DIMENSION1 character of CHARSET.
   If CODING_ISO_FLAG_USE_ROMAN is set and CHARSET is ASCII, CHARSET
   (which must be an lvalue) is first replaced by JISX0201-Roman.

   The body loops until the byte can be emitted: right away if a
   single shift is pending or CHARSET is invoked to graphic plane 0
   or 1; otherwise encode_invocation_designation is called to produce
   the necessary designation/invocation and the checks are repeated.
   Uses CODING, DST and PRODUCED_CHARS of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_ascii                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))     \
      {                                                                 \
        cs_id = charset_jisx0201_roman;                                 \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* The single shift covers exactly this one character.  */      \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
        else                                                            \
          EMIT_ONE_BYTE (c1 | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_ONE_BYTE (c1 | 0x80);                                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet: invoke it       \
       (designating it to a graphic register first if needed), then     \
       go around again to emit the character itself.  */                \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4212
4213
/* Emit the position-codes C1 and C2 of a DIMENSION2 character of
   CHARSET.  If CODING_ISO_FLAG_USE_OLDJIS is set and CHARSET is
   JISX0208, CHARSET (which must be an lvalue) is first replaced by
   JISX0208-1978.

   The body loops until the bytes can be emitted: right away if a
   single shift is pending or CHARSET is invoked to graphic plane 0
   or 1; otherwise encode_invocation_designation is called to produce
   the necessary designation/invocation and the checks are repeated.
   Uses CODING, DST and PRODUCED_CHARS of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* The single shift covers exactly this one character.  */      \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet: invoke it       \
       (designating it to a graphic register first if needed), then     \
       go around again to emit the character itself.  */                \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4256
4257
/* Encode character C of CHARSET: obtain its code with
   CODING_ENCODE_CHAR, then emit it as a one-byte code if CHARSET is
   of dimension 1, or otherwise as the two bytes made of the code's
   bits above the low eight and of its low eight bits.  Uses CODING,
   DST and DST_END of the expansion site.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned position_code;                                                \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c),              \
                        position_code);                                    \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), position_code >> 8,      \
                                       position_code & 0xFF);              \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);          \
  } while (0)
4268
4269
4270 /* Produce designation and invocation codes at a place pointed by DST
4271    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4272    Return new DST.  */
4273
4274 static unsigned char *
4275 encode_invocation_designation (struct charset *charset,
4276                                struct coding_system *coding,
4277                                unsigned char *dst, ptrdiff_t *p_nchars)
4278 {
4279   bool multibytep = coding->dst_multibyte;
4280   ptrdiff_t produced_chars = *p_nchars;
4281   int reg;                      /* graphic register number */
4282   int id = CHARSET_ID (charset);
4283
4284   /* At first, check designations.  */
4285   for (reg = 0; reg < 4; reg++)
4286     if (id == CODING_ISO_DESIGNATION (coding, reg))
4287       break;
4288
4289   if (reg >= 4)
4290     {
4291       /* CHARSET is not yet designated to any graphic registers.  */
4292       /* At first check the requested designation.  */
4293       reg = CODING_ISO_REQUEST (coding, id);
4294       if (reg < 0)
4295         /* Since CHARSET requests no special designation, designate it
4296            to graphic register 0.  */
4297         reg = 0;
4298
4299       ENCODE_DESIGNATION (charset, reg, coding);
4300     }
4301
4302   if (CODING_ISO_INVOCATION (coding, 0) != reg
4303       && CODING_ISO_INVOCATION (coding, 1) != reg)
4304     {
4305       /* Since the graphic register REG is not invoked to any graphic
4306          planes, invoke it to graphic plane 0.  */
4307       switch (reg)
4308         {
4309         case 0:                 /* graphic register 0 */
4310           ENCODE_SHIFT_IN;
4311           break;
4312
4313         case 1:                 /* graphic register 1 */
4314           ENCODE_SHIFT_OUT;
4315           break;
4316
4317         case 2:                 /* graphic register 2 */
4318           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4319             ENCODE_SINGLE_SHIFT_2;
4320           else
4321             ENCODE_LOCKING_SHIFT_2;
4322           break;
4323
4324         case 3:                 /* graphic register 3 */
4325           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4326             ENCODE_SINGLE_SHIFT_3;
4327           else
4328             ENCODE_LOCKING_SHIFT_3;
4329           break;
4330         }
4331     }
4332
4333   *p_nchars = produced_chars;
4334   return dst;
4335 }
4336
4337
/* Bring the graphic planes and registers back to their initial
   state: emit a shift-in if graphic plane 0 does not currently
   invoke register 0, then re-designate every register that has an
   initial charset (a non-negative initial id) but currently holds a
   different one.  Uses CODING of the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int regno = 0;                                                      \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (regno < 4)                                                   \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, regno);            \
                                                                        \
        if (initial_id >= 0                                             \
            && initial_id != CODING_ISO_DESIGNATION (coding, regno))    \
          {                                                             \
            struct charset *initial_cs = CHARSET_FROM_ID (initial_id);  \
                                                                        \
            ENCODE_DESIGNATION (initial_cs, regno, coding);             \
          }                                                             \
        regno++;                                                        \
      }                                                                 \
  } while (0)
4356
4357
4358 /* Produce designation sequences of charsets in the line started from
4359    CHARBUF to a place pointed by DST, and return the number of
4360    produced bytes.  DST should not directly point a buffer text area
4361    which may be relocated by char_charset call.
4362
4363    If the current block ends before any end-of-line, we may fail to
4364    find all the necessary designations.  */
4365
4366 static ptrdiff_t
4367 encode_designation_at_bol (struct coding_system *coding,
4368                            int *charbuf, int *charbuf_end,
4369                            unsigned char *dst)
4370 {
4371   unsigned char *orig = dst;
4372   struct charset *charset;
4373   /* Table of charsets to be designated to each graphic register.  */
4374   int r[4];
4375   int c, found = 0, reg;
4376   ptrdiff_t produced_chars = 0;
4377   bool multibytep = coding->dst_multibyte;
4378   Lisp_Object attrs;
4379   Lisp_Object charset_list;
4380
4381   attrs = CODING_ID_ATTRS (coding->id);
4382   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4383   if (EQ (charset_list, Qiso_2022))
4384     charset_list = Viso_2022_charset_list;
4385
4386   for (reg = 0; reg < 4; reg++)
4387     r[reg] = -1;
4388
4389   while (charbuf < charbuf_end && found < 4)
4390     {
4391       int id;
4392
4393       c = *charbuf++;
4394       if (c == '\n')
4395         break;
4396       charset = char_charset (c, charset_list, NULL);
4397       id = CHARSET_ID (charset);
4398       reg = CODING_ISO_REQUEST (coding, id);
4399       if (reg >= 0 && r[reg] < 0)
4400         {
4401           found++;
4402           r[reg] = id;
4403         }
4404     }
4405
4406   if (found)
4407     {
4408       for (reg = 0; reg < 4; reg++)
4409         if (r[reg] >= 0
4410             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4411           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4412     }
4413
4414   return dst - orig;
4415 }
4416
4417 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4418
4419 static bool
4420 encode_coding_iso_2022 (struct coding_system *coding)
4421 {
4422   bool multibytep = coding->dst_multibyte;
4423   int *charbuf = coding->charbuf;
4424   int *charbuf_end = charbuf + coding->charbuf_used;
4425   unsigned char *dst = coding->destination + coding->produced;
4426   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4427   int safe_room = 16;
4428   bool bol_designation
4429     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4430        && CODING_ISO_BOL (coding));
4431   ptrdiff_t produced_chars = 0;
4432   Lisp_Object attrs, eol_type, charset_list;
4433   bool ascii_compatible;
4434   int c;
4435   int preferred_charset_id = -1;
4436
4437   CODING_GET_INFO (coding, attrs, charset_list);
4438   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4439   if (VECTORP (eol_type))
4440     eol_type = Qunix;
4441
4442   setup_iso_safe_charsets (attrs);
4443   /* Charset list may have been changed.  */
4444   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4445   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4446
4447   ascii_compatible
4448     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4449        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4450                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4451
4452   while (charbuf < charbuf_end)
4453     {
4454       ASSURE_DESTINATION (safe_room);
4455
4456       if (bol_designation)
4457         {
4458           /* We have to produce designation sequences if any now.  */
4459           unsigned char desig_buf[16];
4460           ptrdiff_t nbytes;
4461           ptrdiff_t offset;
4462
4463           charset_map_loaded = 0;
4464           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4465                                               desig_buf);
4466           if (charset_map_loaded
4467               && (offset = coding_change_destination (coding)))
4468             {
4469               dst += offset;
4470               dst_end += offset;
4471             }
4472           memcpy (dst, desig_buf, nbytes);
4473           dst += nbytes;
4474           /* We are sure that designation sequences are all ASCII bytes.  */
4475           produced_chars += nbytes;
4476           bol_designation = 0;
4477           ASSURE_DESTINATION (safe_room);
4478         }
4479
4480       c = *charbuf++;
4481
4482       if (c < 0)
4483         {
4484           /* Handle an annotation.  */
4485           switch (*charbuf)
4486             {
4487             case CODING_ANNOTATE_COMPOSITION_MASK:
4488               /* Not yet implemented.  */
4489               break;
4490             case CODING_ANNOTATE_CHARSET_MASK:
4491               preferred_charset_id = charbuf[2];
4492               if (preferred_charset_id >= 0
4493                   && NILP (Fmemq (make_number (preferred_charset_id),
4494                                   charset_list)))
4495                 preferred_charset_id = -1;
4496               break;
4497             default:
4498               emacs_abort ();
4499             }
4500           charbuf += -c - 1;
4501           continue;
4502         }
4503
4504       /* Now encode the character C.  */
4505       if (c < 0x20 || c == 0x7F)
4506         {
4507           if (c == '\n'
4508               || (c == '\r' && EQ (eol_type, Qmac)))
4509             {
4510               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4511                 ENCODE_RESET_PLANE_AND_REGISTER ();
4512               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4513                 {
4514                   int i;
4515
4516                   for (i = 0; i < 4; i++)
4517                     CODING_ISO_DESIGNATION (coding, i)
4518                       = CODING_ISO_INITIAL (coding, i);
4519                 }
4520               bol_designation = ((CODING_ISO_FLAGS (coding)
4521                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4522                                  != 0);
4523             }
4524           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4525             ENCODE_RESET_PLANE_AND_REGISTER ();
4526           EMIT_ONE_ASCII_BYTE (c);
4527         }
4528       else if (ASCII_CHAR_P (c))
4529         {
4530           if (ascii_compatible)
4531             EMIT_ONE_ASCII_BYTE (c);
4532           else
4533             {
4534               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4535               ENCODE_ISO_CHARACTER (charset, c);
4536             }
4537         }
4538       else if (CHAR_BYTE8_P (c))
4539         {
4540           c = CHAR_TO_BYTE8 (c);
4541           EMIT_ONE_BYTE (c);
4542         }
4543       else
4544         {
4545           struct charset *charset;
4546
4547           if (preferred_charset_id >= 0)
4548             {
4549               bool result;
4550
4551               charset = CHARSET_FROM_ID (preferred_charset_id);
4552               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4553               if (! result)
4554                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4555                                      NULL, charset);
4556             }
4557           else
4558             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4559                                  NULL, charset);
4560           if (!charset)
4561             {
4562               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4563                 {
4564                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4565                   charset = CHARSET_FROM_ID (charset_ascii);
4566                 }
4567               else
4568                 {
4569                   c = coding->default_char;
4570                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4571                                        charset_list, NULL, charset);
4572                 }
4573             }
4574           ENCODE_ISO_CHARACTER (charset, c);
4575         }
4576     }
4577
4578   if (coding->mode & CODING_MODE_LAST_BLOCK
4579       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4580     {
4581       ASSURE_DESTINATION (safe_room);
4582       ENCODE_RESET_PLANE_AND_REGISTER ();
4583     }
4584   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4585   CODING_ISO_BOL (coding) = bol_designation;
4586   coding->produced_char += produced_chars;
4587   coding->produced = dst - coding->destination;
4588   return 0;
4589 }
4590
4591 \f
4592 /*** 8,9. SJIS and BIG5 handlers ***/
4593
4594 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4595    quite widely.  So, for the moment, Emacs supports them in the bare
4596    C code.  But, in the future, they may be supported only by CCL.  */
4597
4598 /* SJIS is a coding system encoding three character sets: ASCII, the
4599    right half of JISX0201-Kana, and JISX0208.  An ASCII character is
4600    encoded as is.  A character of charset katakana-jisx0201 is encoded
4601    as "position-code + 0x80".  A character of charset japanese-jisx0208
4602    is encoded in two bytes, but its two position-codes are divided and
4603    shifted so that they fit in the ranges below.
4604
4605    --- CODE RANGE of SJIS ---
4606    (character set)      (range)
4607    ASCII                0x00 .. 0x7F
4608    KATAKANA-JISX0201    0xA1 .. 0xDF
4609    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4610             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4611    -------------------------------
4612
4613 */
4614
4615 /* BIG5 is a coding system encoding two character sets: ASCII and
4616    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4617    character set; each of its characters is encoded in two bytes.
4618
4619    --- CODE RANGE of BIG5 ---
4620    (character set)      (range)
4621    ASCII                0x00 .. 0x7F
4622    Big5 (1st byte)      0xA1 .. 0xFE
4623         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4624    --------------------------
4625
4626   */
4627
4628 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4629    Return true if a text is encoded in SJIS.

        CATEGORY_MASK_SJIS is always added to DETECT_INFO->checked.  The
        scan starts after the first CODING->head_ascii bytes.  If an
        invalid byte or byte pair is seen, CATEGORY_MASK_SJIS is added to
        DETECT_INFO->rejected and 0 is returned.  If the source runs out
        first, 1 is returned and DETECT_INFO->found is updated with
        CATEGORY_MASK_SJIS when at least one non-ASCII SJIS sequence was
        seen -- unless the source ended in the middle of a sequence in the
        last block, which is treated as a rejection.  */
4630
4631 static bool
4632 detect_coding_sjis (struct coding_system *coding,
4633                     struct coding_detection_info *detect_info)
4634 {
       /* SRC_END, MULTIBYTEP and CONSUMED_CHARS are not referenced
          directly below; presumably ONE_MORE_BYTE uses them -- confirm
          against the macro definition.  */
4635   const unsigned char *src = coding->source, *src_base;
4636   const unsigned char *src_end = coding->source + coding->src_bytes;
4637   bool multibytep = coding->src_multibyte;
4638   ptrdiff_t consumed_chars = 0;
4639   int found = 0;
4640   int c;
4641   Lisp_Object attrs, charset_list;
4642   int max_first_byte_of_2_byte_code;
4643
4644   CODING_GET_INFO (coding, attrs, charset_list);
       /* When the charset list has more than three elements, lead bytes
          up to 0xFC are accepted; otherwise the limit is 0xEF.  */
4645   max_first_byte_of_2_byte_code
4646     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4647
4648   detect_info->checked |= CATEGORY_MASK_SJIS;
4649   /* A coding system of this category is always ASCII compatible.  */
4650   src += coding->head_ascii;
4651
4652   while (1)
4653     {
           /* Remember where this sequence starts so that the code at
              no_more_source can tell whether it was cut short.  */
4654       src_base = src;
           /* No explicit goto targets no_more_source, so running out of
              input is presumably handled inside ONE_MORE_BYTE.  */
4655       ONE_MORE_BYTE (c);
4656       if (c < 0x80)
4657         continue;
4658       if ((c >= 0x81 && c <= 0x9F)
4659           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4660         {
               /* Lead byte of a two-byte code; validate the trail byte.  */
4661           ONE_MORE_BYTE (c);
4662           if (c < 0x40 || c == 0x7F || c > 0xFC)
4663             break;
4664           found = CATEGORY_MASK_SJIS;
4665         }
4666       else if (c >= 0xA0 && c < 0xE0)
             /* Single-byte code in the kana range.  */
4667         found = CATEGORY_MASK_SJIS;
4668       else
4669         break;
4670     }
       /* Reached only via `break': an invalid byte was found.  */
4671   detect_info->rejected |= CATEGORY_MASK_SJIS;
4672   return 0;
4673
4674  no_more_source:
       /* SRC_BASE < SRC means part of a sequence had been read when the
          input ended; in the last block that is an incomplete code.  */
4675   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4676     {
4677       detect_info->rejected |= CATEGORY_MASK_SJIS;
4678       return 0;
4679     }
4680   detect_info->found |= found;
4681   return 1;
4682 }
4683
4684 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4685    Return true if a text is encoded in BIG5.

        CATEGORY_MASK_BIG5 is always added to DETECT_INFO->checked.  The
        scan starts after the first CODING->head_ascii bytes.  If an
        invalid lead or trail byte is seen, CATEGORY_MASK_BIG5 is added to
        DETECT_INFO->rejected and 0 is returned.  If the source runs out
        first, 1 is returned and DETECT_INFO->found is updated with
        CATEGORY_MASK_BIG5 when at least one two-byte sequence was seen --
        unless the source ended in the middle of a sequence in the last
        block, which is treated as a rejection.  */
4686
4687 static bool
4688 detect_coding_big5 (struct coding_system *coding,
4689                     struct coding_detection_info *detect_info)
4690 {
       /* SRC_END, MULTIBYTEP and CONSUMED_CHARS are not referenced
          directly below; presumably ONE_MORE_BYTE uses them -- confirm
          against the macro definition.  */
4691   const unsigned char *src = coding->source, *src_base;
4692   const unsigned char *src_end = coding->source + coding->src_bytes;
4693   bool multibytep = coding->src_multibyte;
4694   ptrdiff_t consumed_chars = 0;
4695   int found = 0;
4696   int c;
4697
4698   detect_info->checked |= CATEGORY_MASK_BIG5;
4699   /* A coding system of this category is always ASCII compatible.  */
4700   src += coding->head_ascii;
4701
4702   while (1)
4703     {
           /* Remember where this sequence starts so that the code at
              no_more_source can tell whether it was cut short.  */
4704       src_base = src;
4705       ONE_MORE_BYTE (c);
4706       if (c < 0x80)
4707         continue;
4708       if (c >= 0xA1)
4709         {
               /* Lead byte of a two-byte code; validate the trail byte.
                  An invalid trail byte must mark the category as
                  rejected, exactly like an invalid lead byte does, so
                  leave the loop instead of returning directly.
                  NOTE(review): unlike decode_coding_big5, no upper bound
                  of 0xFE is checked for either byte here.  */
4710           ONE_MORE_BYTE (c);
4711           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4712             break;
4713           found = CATEGORY_MASK_BIG5;
4714         }
4715       else
4716         break;
4717     }
       /* Reached only via `break': an invalid byte was found.  */
4718   detect_info->rejected |= CATEGORY_MASK_BIG5;
4719   return 0;
4720
4721  no_more_source:
       /* SRC_BASE < SRC means part of a sequence had been read when the
          input ended; in the last block that is an incomplete code.  */
4722   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4723     {
4724       detect_info->rejected |= CATEGORY_MASK_BIG5;
4725       return 0;
4726     }
4727   detect_info->found |= found;
4728   return 1;
4729 }
4730
4731 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode SJIS bytes, starting at CODING->source + CODING->consumed,
        into character codes appended to CODING->charbuf.  The charsets
        are taken from the coding system's charset list in this order:
        roman, kana, kanji and, optionally, a second kanji charset.  A
        byte that does not start a valid sequence is stored via
        BYTE8_TO_CHAR and counted in CODING->errors.  On exit
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        are updated.  */
4732
4733 static void
4734 decode_coding_sjis (struct coding_system *coding)
4735 {
4736   const unsigned char *src = coding->source + coding->consumed;
4737   const unsigned char *src_end = coding->source + coding->src_bytes;
4738   const unsigned char *src_base;
4739   int *charbuf = coding->charbuf + coding->charbuf_used;
4740   /* We may produce one charset annotation in one loop and one more at
4741      the end.  */
4742   int *charbuf_end
4743     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4744   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4745   bool multibytep = coding->src_multibyte;
4746   struct charset *charset_roman, *charset_kanji, *charset_kana;
4747   struct charset *charset_kanji2;
4748   Lisp_Object attrs, charset_list, val;
4749   ptrdiff_t char_offset = coding->produced_char;
       /* LAST_OFFSET and LAST_ID describe the current run of characters
          from one non-ASCII charset: where it started and which charset
          it belongs to.  */
4750   ptrdiff_t last_offset = char_offset;
4751   int last_id = charset_ascii;
4752   bool eol_dos
4753     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte already read after a CR when EOL_DOS is set, or -1 if no
          such byte is pending.  */
4754   int byte_after_cr = -1;
4755
4756   CODING_GET_INFO (coding, attrs, charset_list);
4757
       /* The fourth charset is optional; CHARSET_KANJI2 is NULL when the
          list has only three elements.  */
4758   val = charset_list;
4759   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4760   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4761   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4762   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4763
4764   while (1)
4765     {
4766       int c, c1;
4767       struct charset *charset;
4768
           /* Checkpoint: everything before SRC_BASE has been fully
              decoded.  */
4769       src_base = src;
4770       consumed_chars_base = consumed_chars;
4771
4772       if (charbuf >= charbuf_end)
4773         {
               /* The pending byte was already read from the source; step
                  the checkpoint back so it is not reported as consumed.  */
4774           if (byte_after_cr >= 0)
4775             src_base--;
4776           break;
4777         }
4778
4779       if (byte_after_cr >= 0)
4780         c = byte_after_cr, byte_after_cr = -1;
4781       else
4782         ONE_MORE_BYTE (c);
4783       if (c < 0)
4784         goto invalid_code;
4785       if (c < 0x80)
4786         {
               /* With DOS end-of-line handling, read the byte that
                  follows a CR right away and keep it pending.  */
4787           if (eol_dos && c == '\r')
4788             ONE_MORE_BYTE (byte_after_cr);
4789           charset = charset_roman;
4790         }
4791       else if (c == 0x80 || c == 0xA0)
4792         goto invalid_code;
4793       else if (c >= 0xA1 && c <= 0xDF)
4794         {
4795           /* SJIS -> JISX0201-Kana */
4796           c &= 0x7F;
4797           charset = charset_kana;
4798         }
4799       else if (c <= 0xEF)
4800         {
4801           /* SJIS -> JISX0208 */
4802           ONE_MORE_BYTE (c1);
4803           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4804             goto invalid_code;
4805           c = (c << 8) | c1;
4806           SJIS_TO_JIS (c);
4807           charset = charset_kanji;
4808         }
4809       else if (c <= 0xFC && charset_kanji2)
4810         {
4811           /* SJIS -> JISX0213-2 */
4812           ONE_MORE_BYTE (c1);
4813           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4814             goto invalid_code;
4815           c = (c << 8) | c1;
4816           SJIS_TO_JIS2 (c);
4817           charset = charset_kanji2;
4818         }
4819       else
4820         goto invalid_code;
           /* On a switch to a different non-ASCII charset, close the
              previous run (if it was non-ASCII) and start a new one.
              ADD_CHARSET_DATA presumably writes a charset annotation into
              CHARBUF -- confirm against the macro definition.  */
4821       if (charset->id != charset_ascii
4822           && last_id != charset->id)
4823         {
4824           if (last_id != charset_ascii)
4825             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4826           last_id = charset->id;
4827           last_offset = char_offset;
4828         }
4829       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4830       *charbuf++ = c;
4831       char_offset++;
4832       continue;
4833
4834     invalid_code:
           /* Rewind to the start of the offending sequence and emit just
              its first byte; a negative value is stored negated.  */
4835       src = src_base;
4836       consumed_chars = consumed_chars_base;
4837       ONE_MORE_BYTE (c);
4838       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4839       char_offset++;
4840       coding->errors++;
4841     }
4842
       /* Reached both by `break' above and, presumably, from inside
          ONE_MORE_BYTE when the source is exhausted.  Only input up to
          the last checkpoint is reported as consumed.  */
4843  no_more_source:
4844   if (last_id != charset_ascii)
4845     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4846   coding->consumed_char += consumed_chars_base;
4847   coding->consumed = src_base - coding->source;
4848   coding->charbuf_used = charbuf - coding->charbuf;
4849 }
4850
     /* Decode BIG5 bytes, starting at CODING->source + CODING->consumed,
        into character codes appended to CODING->charbuf.  The first two
        elements of the coding system's charset list are used as the
        roman and the Big5 charset, in that order.  A byte that does not
        start a valid sequence is stored via BYTE8_TO_CHAR and counted in
        CODING->errors.  On exit CODING->consumed, CODING->consumed_char
        and CODING->charbuf_used are updated.  */
4851 static void
4852 decode_coding_big5 (struct coding_system *coding)
4853 {
4854   const unsigned char *src = coding->source + coding->consumed;
4855   const unsigned char *src_end = coding->source + coding->src_bytes;
4856   const unsigned char *src_base;
4857   int *charbuf = coding->charbuf + coding->charbuf_used;
4858   /* We may produce one charset annotation in one loop and one more at
4859      the end.  */
4860   int *charbuf_end
4861     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4862   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4863   bool multibytep = coding->src_multibyte;
4864   struct charset *charset_roman, *charset_big5;
4865   Lisp_Object attrs, charset_list, val;
4866   ptrdiff_t char_offset = coding->produced_char;
       /* LAST_OFFSET and LAST_ID describe the current run of characters
          from one non-ASCII charset: where it started and which charset
          it belongs to.  */
4867   ptrdiff_t last_offset = char_offset;
4868   int last_id = charset_ascii;
4869   bool eol_dos
4870     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte already read after a CR when EOL_DOS is set, or -1 if no
          such byte is pending.  */
4871   int byte_after_cr = -1;
4872
4873   CODING_GET_INFO (coding, attrs, charset_list);
4874   val = charset_list;
4875   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4876   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4877
4878   while (1)
4879     {
4880       int c, c1;
4881       struct charset *charset;
4882
           /* Checkpoint: everything before SRC_BASE has been fully
              decoded.  */
4883       src_base = src;
4884       consumed_chars_base = consumed_chars;
4885
4886       if (charbuf >= charbuf_end)
4887         {
               /* The pending byte was already read from the source; step
                  the checkpoint back so it is not reported as consumed.  */
4888           if (byte_after_cr >= 0)
4889             src_base--;
4890           break;
4891         }
4892
4893       if (byte_after_cr >= 0)
4894         c = byte_after_cr, byte_after_cr = -1;
4895       else
4896         ONE_MORE_BYTE (c);
4897
4898       if (c < 0)
4899         goto invalid_code;
4900       if (c < 0x80)
4901         {
               /* With DOS end-of-line handling, read the byte that
                  follows a CR right away and keep it pending.  */
4902           if (eol_dos && c == '\r')
4903             ONE_MORE_BYTE (byte_after_cr);
4904           charset = charset_roman;
4905         }
4906       else
4907         {
4908           /* BIG5 -> Big5 */
               /* Lead byte must be 0xA1..0xFE; trail byte must be
                  0x40..0x7E or 0xA1..0xFE.  */
4909           if (c < 0xA1 || c > 0xFE)
4910             goto invalid_code;
4911           ONE_MORE_BYTE (c1);
4912           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4913             goto invalid_code;
4914           c = c << 8 | c1;
4915           charset = charset_big5;
4916         }
           /* On a switch to a different non-ASCII charset, close the
              previous run (if it was non-ASCII) and start a new one.
              ADD_CHARSET_DATA presumably writes a charset annotation into
              CHARBUF -- confirm against the macro definition.  */
4917       if (charset->id != charset_ascii
4918           && last_id != charset->id)
4919         {
4920           if (last_id != charset_ascii)
4921             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4922           last_id = charset->id;
4923           last_offset = char_offset;
4924         }
4925       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4926       *charbuf++ = c;
4927       char_offset++;
4928       continue;
4929
4930     invalid_code:
           /* Rewind to the start of the offending sequence and emit just
              its first byte; a negative value is stored negated.  */
4931       src = src_base;
4932       consumed_chars = consumed_chars_base;
4933       ONE_MORE_BYTE (c);
4934       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4935       char_offset++;
4936       coding->errors++;
4937     }
4938
       /* Reached both by `break' above and, presumably, from inside
          ONE_MORE_BYTE when the source is exhausted.  Only input up to
          the last checkpoint is reported as consumed.  */
4939  no_more_source:
4940   if (last_id != charset_ascii)
4941     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4942   coding->consumed_char += consumed_chars_base;
4943   coding->consumed = src_base - coding->source;
4944   coding->charbuf_used = charbuf - coding->charbuf;
4945 }
4946
4947 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4948    This function can encode charsets `ascii', `katakana-jisx0201',
4949    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4950    are sure that all these charsets are registered as official charset
4951    (i.e. do not have extended leading-codes).  Characters of other
4952    charsets are produced without any encoding.

        NOTE(review): the charset names above look older than the code
        below, which takes the kana, kanji and optional second kanji
        charsets from the second and following elements of the coding
        system's charset list.

        Encode the characters in CODING->charbuf as SJIS bytes appended
        at CODING->destination + CODING->produced, then update
        CODING->produced and CODING->produced_char.  The explicit return
        at the end of the function yields 0.  */
4953
4954 static bool
4955 encode_coding_sjis (struct coding_system *coding)
4956 {
4957   bool multibytep = coding->dst_multibyte;
4958   int *charbuf = coding->charbuf;
4959   int *charbuf_end = charbuf + coding->charbuf_used;
4960   unsigned char *dst = coding->destination + coding->produced;
4961   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4962   int safe_room = 4;
4963   ptrdiff_t produced_chars = 0;
4964   Lisp_Object attrs, charset_list, val;
4965   bool ascii_compatible;
4966   struct charset *charset_kanji, *charset_kana;
4967   struct charset *charset_kanji2;
4968   int c;
4969
4970   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first element of the list (decode_coding_sjis uses it
          as the roman charset).  */
4971   val = XCDR (charset_list);
4972   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4973   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4974   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4975
4976   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4977
4978   while (charbuf < charbuf_end)
4979     {
           /* Presumably leaves the function early when fewer than
              SAFE_ROOM bytes can be made available -- confirm against
              the macro definition.  */
4980       ASSURE_DESTINATION (safe_room);
4981       c = *charbuf++;
4982       /* Now encode the character C.  */
4983       if (ASCII_CHAR_P (c) && ascii_compatible)
4984         EMIT_ONE_ASCII_BYTE (c);
4985       else if (CHAR_BYTE8_P (c))
4986         {
4987           c = CHAR_TO_BYTE8 (c);
4988           EMIT_ONE_BYTE (c);
4989         }
4990       else
4991         {
4992           unsigned code;
4993           struct charset *charset;
               /* Look up a charset from CHARSET_LIST for C; CHARSET and
                  CODE are outputs of the macro.  */
4994           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4995                                &code, charset);
4996
4997           if (!charset)
4998             {
                   /* C has no charset in the list: in safe-encoding mode
                      emit the substitution code as ASCII, otherwise
                      encode the coding system's default character.  */
4999               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5000                 {
5001                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5002                   charset = CHARSET_FROM_ID (charset_ascii);
5003                 }
5004               else
5005                 {
5006                   c = coding->default_char;
5007                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5008                                        charset_list, &code, charset);
5009                 }
5010             }
5011           if (code == CHARSET_INVALID_CODE (charset))
5012             emacs_abort ();
5013           if (charset == charset_kanji)
5014             {
5015               int c1, c2;
5016               JIS_TO_SJIS (code);
5017               c1 = code >> 8, c2 = code & 0xFF;
5018               EMIT_TWO_BYTES (c1, c2);
5019             }
5020           else if (charset == charset_kana)
5021             EMIT_ONE_BYTE (code | 0x80);
5022           else if (charset_kanji2 && charset == charset_kanji2)
5023             {
5024               int c1, c2;
5025
                   /* Only these values of the code's high byte are
                      converted with JIS_TO_SJIS2; any other value falls
                      back to a single 7-bit byte.  */
5026               c1 = code >> 8;
5027               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5028                   || c1 == 0x28
5029                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5030                 {
5031                   JIS_TO_SJIS2 (code);
5032                   c1 = code >> 8, c2 = code & 0xFF;
5033                   EMIT_TWO_BYTES (c1, c2);
5034                 }
5035               else
5036                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5037             }
5038           else
                 /* Any other charset: emit the low seven bits of CODE.  */
5039             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5040         }
5041     }
5042   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5043   coding->produced_char += produced_chars;
5044   coding->produced = dst - coding->destination;
5045   return 0;
5046 }
5047
     /* Encode the characters in CODING->charbuf as BIG5 bytes appended at
        CODING->destination + CODING->produced, then update
        CODING->produced and CODING->produced_char.  The Big5 charset is
        the second element of the coding system's charset list.  The
        explicit return at the end of the function yields 0.  */
5048 static bool
5049 encode_coding_big5 (struct coding_system *coding)
5050 {
5051   bool multibytep = coding->dst_multibyte;
5052   int *charbuf = coding->charbuf;
5053   int *charbuf_end = charbuf + coding->charbuf_used;
5054   unsigned char *dst = coding->destination + coding->produced;
5055   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
5056   int safe_room = 4;
5057   ptrdiff_t produced_chars = 0;
5058   Lisp_Object attrs, charset_list, val;
5059   bool ascii_compatible;
5060   struct charset *charset_big5;
5061   int c;
5062
5063   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first element of the list (decode_coding_big5 uses it
          as the roman charset).  */
5064   val = XCDR (charset_list);
5065   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5066   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5067
5068   while (charbuf < charbuf_end)
5069     {
           /* Presumably leaves the function early when fewer than
              SAFE_ROOM bytes can be made available -- confirm against
              the macro definition.  */
5070       ASSURE_DESTINATION (safe_room);
5071       c = *charbuf++;
5072       /* Now encode the character C.  */
5073       if (ASCII_CHAR_P (c) && ascii_compatible)
5074         EMIT_ONE_ASCII_BYTE (c);
5075       else if (CHAR_BYTE8_P (c))
5076         {
5077           c = CHAR_TO_BYTE8 (c);
5078           EMIT_ONE_BYTE (c);
5079         }
5080       else
5081         {
5082           unsigned code;
5083           struct charset *charset;
               /* Look up a charset from CHARSET_LIST for C; CHARSET and
                  CODE are outputs of the macro.  */
5084           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5085                                &code, charset);
5086
5087           if (! charset)
5088             {
                   /* C has no charset in the list: in safe-encoding mode
                      emit the substitution code as ASCII, otherwise
                      encode the coding system's default character.  */
5089               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5090                 {
5091                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5092                   charset = CHARSET_FROM_ID (charset_ascii);
5093                 }
5094               else
5095                 {
5096                   c = coding->default_char;
5097                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5098                                        charset_list, &code, charset);
5099                 }
5100             }
5101           if (code == CHARSET_INVALID_CODE (charset))
5102             emacs_abort ();
5103           if (charset == charset_big5)
5104             {
5105               int c1, c2;
5106
                   /* The high and low bytes of CODE are emitted as is.  */
5107               c1 = code >> 8, c2 = code & 0xFF;
5108               EMIT_TWO_BYTES (c1, c2);
5109             }
5110           else
                 /* Any other charset: emit the low seven bits of CODE.  */
5111             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5112         }
5113     }
5114   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5115   coding->produced_char += produced_chars;
5116   coding->produced = dst - coding->destination;
5117   return 0;
5118 }
5119
5120 \f
5121 /*** 10. CCL handlers ***/
5122
5123 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5124    Return true if a text is encoded in a coding system of which
5125    encoder/decoder are written in CCL program.

        CATEGORY_MASK_CCL is always added to DETECT_INFO->checked.  Each
        source byte is looked up in the "valids" table of the coding
        system registered for the CCL category.  A byte whose entry is 0
        (or a negative value from ONE_MORE_BYTE) rejects the category and
        0 is returned.  If the source runs out first, 1 is returned and
        DETECT_INFO->found is updated with CATEGORY_MASK_CCL when some
        byte had an entry greater than 1.  */
5126
5127 static bool
5128 detect_coding_ccl (struct coding_system *coding,
5129                    struct coding_detection_info *detect_info)
5130 {
5131   const unsigned char *src = coding->source, *src_base;
5132   const unsigned char *src_end = coding->source + coding->src_bytes;
5133   bool multibytep = coding->src_multibyte;
5134   ptrdiff_t consumed_chars = 0;
5135   int found = 0;
5136   unsigned char *valids;
       /* Saved now because CODING is re-pointed below.  */
5137   ptrdiff_t head_ascii = coding->head_ascii;
5138   Lisp_Object attrs;
5139
5140   detect_info->checked |= CATEGORY_MASK_CCL;
5141
       /* From here on CODING is the CCL category's coding system, not
          the one passed by the caller.  */
5142   coding = &coding_categories[coding_category_ccl];
5143   valids = CODING_CCL_VALIDS (coding);
5144   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading ASCII bytes are skipped only for an
          ASCII-compatible coding system.  */
5145   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5146     src += head_ascii;
5147
5148   while (1)
5149     {
5150       int c;
5151
5152       src_base = src;
           /* No explicit goto targets no_more_source, so running out of
              input is presumably handled inside ONE_MORE_BYTE.  */
5153       ONE_MORE_BYTE (c);
5154       if (c < 0 || ! valids[c])
5155         break;
5156       if ((valids[c] > 1))
5157         found = CATEGORY_MASK_CCL;
5158     }
       /* Reached only via `break': an invalid byte was found.  */
5159   detect_info->rejected |= CATEGORY_MASK_CCL;
5160   return 0;
5161
5162  no_more_source:
5163   detect_info->found |= found;
5164   return 1;
5165 }
5166
     /* Decode the source of CODING by running its CCL program.  The
        source is fed to ccl_driver in chunks of at most 1024 elements
        and the output is appended to CODING->charbuf.  The final CCL
        status is translated into a conversion result, and
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        are updated.  */
5167 static void
5168 decode_coding_ccl (struct coding_system *coding)
5169 {
5170   const unsigned char *src = coding->source + coding->consumed;
5171   const unsigned char *src_end = coding->source + coding->src_bytes;
5172   int *charbuf = coding->charbuf + coding->charbuf_used;
5173   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5174   ptrdiff_t consumed_chars = 0;
5175   bool multibytep = coding->src_multibyte;
5176   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* One chunk of input for ccl_driver, plus (for multibyte source)
          the byte offset of each element relative to SRC.  The offset
          table has one extra slot for the offset just past the chunk.  */
5177   int source_charbuf[1024];
5178   int source_byteidx[1025];
5179   Lisp_Object attrs, charset_list;
5180
5181   CODING_GET_INFO (coding, attrs, charset_list);
5182
5183   while (1)
5184     {
5185       const unsigned char *p = src;
5186       ptrdiff_t offset;
5187       int i = 0;
5188
           /* Fill the chunk: one character per element for multibyte
              source, one byte per element otherwise.  */
5189       if (multibytep)
5190         {
5191           while (i < 1024 && p < src_end)
5192             {
5193               source_byteidx[i] = p - src;
5194               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5195             }
5196           source_byteidx[i] = p - src;
5197         }
5198       else
5199         while (i < 1024 && p < src_end)
5200           source_charbuf[i++] = *p++;
5201
5202       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5203         ccl->last_block = true;
5204       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5205       charset_map_loaded = 0;
5206       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5207                   charset_list);
           /* If a charset map was loaded and the source moved, shift the
              source pointers by the reported offset.  */
5208       if (charset_map_loaded
5209           && (offset = coding_change_source (coding)))
5210         {
5211           p += offset;
5212           src += offset;
5213           src_end += offset;
5214         }
5215       charbuf += ccl->produced;
           /* ccl->consumed counts chunk elements; for multibyte source
              it is mapped back to a byte count.  */
5216       if (multibytep)
5217         src += source_byteidx[ccl->consumed];
5218       else
5219         src += ccl->consumed;
5220       consumed_chars += ccl->consumed;
           /* Continue only if the whole source has not been read yet and
              the CCL program stopped for lack of input.  */
5221       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5222         break;
5223     }
5224
5225   switch (ccl->status)
5226     {
5227     case CCL_STAT_SUSPEND_BY_SRC:
5228       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5229       break;
5230     case CCL_STAT_SUSPEND_BY_DST:
5231       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5232       break;
5233     case CCL_STAT_QUIT:
5234     case CCL_STAT_INVALID_CMD:
5235       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5236       break;
5237     default:
5238       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5239       break;
5240     }
5241   coding->consumed_char += consumed_chars;
5242   coding->consumed = src - coding->source;
5243   coding->charbuf_used = charbuf - coding->charbuf;
5244 }
5245
     /* Encode the characters in CODING->charbuf by running the CCL
        program of CODING.  ccl_driver writes at most 1024 values per call
        into a local buffer; the low eight bits of each value are then
        appended at CODING->destination + CODING->produced.  The final
        CCL status is translated into a conversion result, and
        CODING->produced and CODING->produced_char are updated.  The
        explicit return at the end of the function yields 0.  */
5246 static bool
5247 encode_coding_ccl (struct coding_system *coding)
5248 {
5249   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5250   bool multibytep = coding->dst_multibyte;
5251   int *charbuf = coding->charbuf;
5252   int *charbuf_end = charbuf + coding->charbuf_used;
5253   unsigned char *dst = coding->destination + coding->produced;
5254   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5255   int destination_charbuf[1024];
5256   ptrdiff_t produced_chars = 0;
5257   int i;
5258   Lisp_Object attrs, charset_list;
5259
5260   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program that this is the final block only when all
          source characters have been consumed.  */
5261   if (coding->consumed_char == coding->src_chars
5262       && coding->mode & CODING_MODE_LAST_BLOCK)
5263     ccl->last_block = true;
5264
5265   do
5266     {
5267       ptrdiff_t offset;
5268
5269       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5270       charset_map_loaded = 0;
5271       ccl_driver (ccl, charbuf, destination_charbuf,
5272                   charbuf_end - charbuf, 1024, charset_list);
5273       if (charset_map_loaded
5274           && (offset = coding_change_destination (coding)))
5275         dst += offset;
5276       if (multibytep)
5277         {
               /* Two bytes of room are requested per produced value;
                  presumably EMIT_ONE_BYTE may write a two-byte sequence
                  and counts the produced characters itself -- confirm
                  against the macro definition.  */
5278           ASSURE_DESTINATION (ccl->produced * 2);
5279           for (i = 0; i < ccl->produced; i++)
5280             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5281         }
5282       else
5283         {
5284           ASSURE_DESTINATION (ccl->produced);
5285           for (i = 0; i < ccl->produced; i++)
5286             *dst++ = destination_charbuf[i] & 0xFF;
5287           produced_chars += ccl->produced;
5288         }
5289       charbuf += ccl->consumed;
5290       if (ccl->status == CCL_STAT_QUIT
5291           || ccl->status == CCL_STAT_INVALID_CMD)
5292         break;
5293     }
5294   while (charbuf < charbuf_end);
5295
5296   switch (ccl->status)
5297     {
5298     case CCL_STAT_SUSPEND_BY_SRC:
5299       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5300       break;
5301     case CCL_STAT_SUSPEND_BY_DST:
5302       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5303       break;
5304     case CCL_STAT_QUIT:
5305     case CCL_STAT_INVALID_CMD:
5306       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5307       break;
5308     default:
5309       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5310       break;
5311     }
5312
5313   coding->produced_char += produced_chars;
5314   coding->produced = dst - coding->destination;
5315   return 0;
5316 }
5317
5318 \f
5319 /*** 10, 11. no-conversion handlers ***/
5320
5321 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Raw text needs no byte-level conversion: set
        CODING->chars_at_source and mark the whole source as consumed.
        The one exception is a DOS end-of-line coding system whose source
        ends in a CR: that CR is left unconsumed and
        CODING_RESULT_INSUFFICIENT_SRC is recorded, presumably so that a
        CR LF pair split across two blocks can be handled together.  An
        empty source is reported as a plain success.  */
5322
5323 static void
5324 decode_coding_raw_text (struct coding_system *coding)
5325 {
5326   bool eol_dos
5327     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5328
5329   coding->chars_at_source = 1;
5330   coding->consumed_char = coding->src_chars;
5331   coding->consumed = coding->src_bytes;
       /* Check for an empty source first: with SRC_BYTES == 0 there is
          no last byte, and indexing with -1 would read before the start
          of the source.  */
5332   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5333     {
5334       coding->consumed_char--;
5335       coding->consumed--;
5336       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5337     }
5338   else
5339     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5340 }
5341
     /* Copy the contents of CODING->charbuf to the destination without
        charset conversion.  Four cases are handled, depending on whether
        the destination and the source are multibyte.  CODING->produced
        and CODING->produced_char are updated, and the explicit return at
        the end of the function yields 0.  */
5342 static bool
5343 encode_coding_raw_text (struct coding_system *coding)
5344 {
5345   bool multibytep = coding->dst_multibyte;
5346   int *charbuf = coding->charbuf;
5347   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5348   unsigned char *dst = coding->destination + coding->produced;
5349   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5350   ptrdiff_t produced_chars = 0;
5351   int c;
5352
5353   if (multibytep)
5354     {
5355       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5356
5357       if (coding->src_multibyte)
             /* Multibyte source to multibyte destination: ASCII and
                eight-bit characters are emitted as one byte; any other
                character is expanded with CHAR_STRING_ADVANCE and each
                byte of the result is emitted separately.  */
5358         while (charbuf < charbuf_end)
5359           {
5360             ASSURE_DESTINATION (safe_room);
5361             c = *charbuf++;
5362             if (ASCII_CHAR_P (c))
5363               EMIT_ONE_ASCII_BYTE (c);
5364             else if (CHAR_BYTE8_P (c))
5365               {
5366                 c = CHAR_TO_BYTE8 (c);
5367                 EMIT_ONE_BYTE (c);
5368               }
5369             else
5370               {
5371                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5372
5373                 CHAR_STRING_ADVANCE (c, p1);
5374                 do
5375                   {
5376                     EMIT_ONE_BYTE (*p0);
5377                     p0++;
5378                   }
5379                 while (p0 < p1);
5380               }
5381           }
5382       else
             /* Unibyte source to multibyte destination: every element is
                emitted as one byte.  */
5383         while (charbuf < charbuf_end)
5384           {
5385             ASSURE_DESTINATION (safe_room);
5386             c = *charbuf++;
5387             EMIT_ONE_BYTE (c);
5388           }
5389     }
5390   else
5391     {
5392       if (coding->src_multibyte)
5393         {
5394           int safe_room = MAX_MULTIBYTE_LENGTH;
5395
               /* Multibyte source to unibyte destination: bytes are
                  stored directly, without the EMIT_* macros.  */
5396           while (charbuf < charbuf_end)
5397             {
5398               ASSURE_DESTINATION (safe_room);
5399               c = *charbuf++;
5400               if (ASCII_CHAR_P (c))
5401                 *dst++ = c;
5402               else if (CHAR_BYTE8_P (c))
5403                 *dst++ = CHAR_TO_BYTE8 (c);
5404               else
5405                 CHAR_STRING_ADVANCE (c, dst);
5406             }
5407         }
5408       else
5409         {
               /* Unibyte to unibyte: request room for the whole buffer
                  once, then copy element by element.  */
5410           ASSURE_DESTINATION (charbuf_end - charbuf);
5411           while (charbuf < charbuf_end && dst < dst_end)
5412             *dst++ = *charbuf++;
5413         }
           /* For a unibyte destination the number of produced characters
              equals the number of bytes written by this call.  */
5414       produced_chars = dst - (coding->destination + coding->produced);
5415     }
5416   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5417   coding->produced_char += produced_chars;
5418   coding->produced = dst - coding->destination;
5419   return 0;
5420 }
5421
5422 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5423    Return true if a text is encoded in a charset-based coding system.  */
5424
5425 static bool
5426 detect_coding_charset (struct coding_system *coding,
5427                        struct coding_detection_info *detect_info)
5428 {
5429   const unsigned char *src = coding->source, *src_base;
5430   const unsigned char *src_end = coding->source + coding->src_bytes;
5431   bool multibytep = coding->src_multibyte;
5432   ptrdiff_t consumed_chars = 0;
5433   Lisp_Object attrs, valids, name;
5434   int found = 0;
5435   ptrdiff_t head_ascii = coding->head_ascii;
5436   bool check_latin_extra = 0;
5437
5438   detect_info->checked |= CATEGORY_MASK_CHARSET;
5439
5440   coding = &coding_categories[coding_category_charset];
5441   attrs = CODING_ID_ATTRS (coding->id);
5442   valids = AREF (attrs, coding_attr_charset_valids);
5443   name = CODING_ID_NAME (coding->id);
5444   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5445                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5446       || strncmp (SSDATA (SYMBOL_NAME (name)),
5447                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5448     check_latin_extra = 1;
5449
5450   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5451     src += head_ascii;
5452
5453   while (1)
5454     {
5455       int c;
5456       Lisp_Object val;
5457       struct charset *charset;
5458       int dim, idx;
5459
5460       src_base = src;
5461       ONE_MORE_BYTE (c);
5462       if (c < 0)
5463         continue;
5464       val = AREF (valids, c);
5465       if (NILP (val))
5466         break;
5467       if (c >= 0x80)
5468         {
5469           if (c < 0xA0
5470               && check_latin_extra
5471               && (!VECTORP (Vlatin_extra_code_table)
5472                   || NILP (AREF (Vlatin_extra_code_table, c))))
5473             break;
5474           found = CATEGORY_MASK_CHARSET;
5475         }
5476       if (INTEGERP (val))
5477         {
5478           charset = CHARSET_FROM_ID (XFASTINT (val));
5479           dim = CHARSET_DIMENSION (charset);
5480           for (idx = 1; idx < dim; idx++)
5481             {
5482               if (src == src_end)
5483                 goto too_short;
5484               ONE_MORE_BYTE (c);
5485               if (c < charset->code_space[(dim - 1 - idx) * 4]
5486                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5487                 break;
5488             }
5489           if (idx < dim)
5490             break;
5491         }
5492       else
5493         {
5494           idx = 1;
5495           for (; CONSP (val); val = XCDR (val))
5496             {
5497               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5498               dim = CHARSET_DIMENSION (charset);
5499               while (idx < dim)
5500                 {
5501                   if (src == src_end)
5502                     goto too_short;
5503                   ONE_MORE_BYTE (c);
5504                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5505                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5506                     break;
5507                   idx++;
5508                 }
5509               if (idx == dim)
5510                 {
5511                   val = Qnil;
5512                   break;
5513                 }
5514             }
5515           if (CONSP (val))
5516             break;
5517         }
5518     }
5519  too_short:
5520   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5521   return 0;
5522
5523  no_more_source:
5524   detect_info->found |= found;
5525   return 1;
5526 }
5527
/* Decode the not yet consumed source bytes of CODING as text of a
   charset-based coding system, storing the resulting characters (and
   charset annotations) into CODING->charbuf.  On return
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  Bytes that can't be decoded are stored one by one as
   raw bytes and counted in CODING->errors.  */
static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the character being decoded; what precedes it is what
     gets reported as consumed.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  Lisp_Object valids;
  ptrdiff_t char_offset = coding->produced_char;
  /* Character offset at which the current run of LAST_ID began.  */
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when EOL_DOS is true, or -1 if there
     is none pending.  */
  int byte_after_cr = -1;

  /* Table indexed by a lead byte; each element is a charset ID, a
     list of charset IDs, or something else for an invalid byte.  */
  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left in charbuf.  A pending read-ahead byte has
             not been decoded yet, so don't report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          /* Use the byte that was read ahead in the previous
             iteration.  */
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          /* NOTE(review): ONE_MORE_BYTE is defined elsewhere in this
             file; it presumably leaves the loop through the
             `no_more_source' label when the source is exhausted.  */
          ONE_MORE_BYTE (c);
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          /* A single charset uses C as its lead byte: collect the
             remaining bytes of the code point and decode it.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* Only the bytes not yet read for a smaller charset
                 are fetched here.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* The first charset that yields a character wins.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          /* The charset changed: close the run of the previous
             non-ASCII charset and start a new one.  */
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the offending sequence and pass just
         its first byte through: a negative value is negated, an ASCII
         byte is kept, and any other byte is converted with
         BYTE8_TO_CHAR.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Close the run that is still open, then report how far we got.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5655
5656 static bool
5657 encode_coding_charset (struct coding_system *coding)
5658 {
5659   bool multibytep = coding->dst_multibyte;
5660   int *charbuf = coding->charbuf;
5661   int *charbuf_end = charbuf + coding->charbuf_used;
5662   unsigned char *dst = coding->destination + coding->produced;
5663   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5664   int safe_room = MAX_MULTIBYTE_LENGTH;
5665   ptrdiff_t produced_chars = 0;
5666   Lisp_Object attrs, charset_list;
5667   bool ascii_compatible;
5668   int c;
5669
5670   CODING_GET_INFO (coding, attrs, charset_list);
5671   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5672
5673   while (charbuf < charbuf_end)
5674     {
5675       struct charset *charset;
5676       unsigned code;
5677
5678       ASSURE_DESTINATION (safe_room);
5679       c = *charbuf++;
5680       if (ascii_compatible && ASCII_CHAR_P (c))
5681         EMIT_ONE_ASCII_BYTE (c);
5682       else if (CHAR_BYTE8_P (c))
5683         {
5684           c = CHAR_TO_BYTE8 (c);
5685           EMIT_ONE_BYTE (c);
5686         }
5687       else
5688         {
5689           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5690                                &code, charset);
5691
5692           if (charset)
5693             {
5694               if (CHARSET_DIMENSION (charset) == 1)
5695                 EMIT_ONE_BYTE (code);
5696               else if (CHARSET_DIMENSION (charset) == 2)
5697                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5698               else if (CHARSET_DIMENSION (charset) == 3)
5699                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5700               else
5701                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5702                                  (code >> 8) & 0xFF, code & 0xFF);
5703             }
5704           else
5705             {
5706               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5707                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5708               else
5709                 c = coding->default_char;
5710               EMIT_ONE_BYTE (c);
5711             }
5712         }
5713     }
5714
5715   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5716   coding->produced_char += produced_chars;
5717   coding->produced = dst - coding->destination;
5718   return 0;
5719 }
5720
5721 \f
5722 /*** 10. C library functions ***/
5723
5724 /* Setup coding context CODING from information about CODING_SYSTEM.
5725    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5726    CODING_SYSTEM is invalid, signal an error.  */
5727
5728 void
5729 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5730 {
5731   Lisp_Object attrs;
5732   Lisp_Object eol_type;
5733   Lisp_Object coding_type;
5734   Lisp_Object val;
5735
5736   if (NILP (coding_system))
5737     coding_system = Qundecided;
5738
5739   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5740
5741   attrs = CODING_ID_ATTRS (coding->id);
5742   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5743
5744   coding->mode = 0;
5745   if (VECTORP (eol_type))
5746     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5747                             | CODING_REQUIRE_DETECTION_MASK);
5748   else if (! EQ (eol_type, Qunix))
5749     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5750                             | CODING_REQUIRE_ENCODING_MASK);
5751   else
5752     coding->common_flags = 0;
5753   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5754     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5755   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5756     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5757   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5758     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5759
5760   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5761   coding->max_charset_id = SCHARS (val) - 1;
5762   coding->safe_charsets = SDATA (val);
5763   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5764   coding->carryover_bytes = 0;
5765   coding->raw_destination = 0;
5766
5767   coding_type = CODING_ATTR_TYPE (attrs);
5768   if (EQ (coding_type, Qundecided))
5769     {
5770       coding->detector = NULL;
5771       coding->decoder = decode_coding_raw_text;
5772       coding->encoder = encode_coding_raw_text;
5773       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5774       coding->spec.undecided.inhibit_nbd
5775         = (encode_inhibit_flag
5776            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5777       coding->spec.undecided.inhibit_ied
5778         = (encode_inhibit_flag
5779            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5780       coding->spec.undecided.prefer_utf_8
5781         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5782     }
5783   else if (EQ (coding_type, Qiso_2022))
5784     {
5785       int i;
5786       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5787
5788       /* Invoke graphic register 0 to plane 0.  */
5789       CODING_ISO_INVOCATION (coding, 0) = 0;
5790       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5791       CODING_ISO_INVOCATION (coding, 1)
5792         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5793       /* Setup the initial status of designation.  */
5794       for (i = 0; i < 4; i++)
5795         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5796       /* Not single shifting initially.  */
5797       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5798       /* Beginning of buffer should also be regarded as bol. */
5799       CODING_ISO_BOL (coding) = 1;
5800       coding->detector = detect_coding_iso_2022;
5801       coding->decoder = decode_coding_iso_2022;
5802       coding->encoder = encode_coding_iso_2022;
5803       if (flags & CODING_ISO_FLAG_SAFE)
5804         coding->mode |= CODING_MODE_SAFE_ENCODING;
5805       coding->common_flags
5806         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5807             | CODING_REQUIRE_FLUSHING_MASK);
5808       if (flags & CODING_ISO_FLAG_COMPOSITION)
5809         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5810       if (flags & CODING_ISO_FLAG_DESIGNATION)
5811         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5812       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5813         {
5814           setup_iso_safe_charsets (attrs);
5815           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5816           coding->max_charset_id = SCHARS (val) - 1;
5817           coding->safe_charsets = SDATA (val);
5818         }
5819       CODING_ISO_FLAGS (coding) = flags;
5820       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5821       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5822       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5823       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5824     }
5825   else if (EQ (coding_type, Qcharset))
5826     {
5827       coding->detector = detect_coding_charset;
5828       coding->decoder = decode_coding_charset;
5829       coding->encoder = encode_coding_charset;
5830       coding->common_flags
5831         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5832     }
5833   else if (EQ (coding_type, Qutf_8))
5834     {
5835       val = AREF (attrs, coding_attr_utf_bom);
5836       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5837                                    : EQ (val, Qt) ? utf_with_bom
5838                                    : utf_without_bom);
5839       coding->detector = detect_coding_utf_8;
5840       coding->decoder = decode_coding_utf_8;
5841       coding->encoder = encode_coding_utf_8;
5842       coding->common_flags
5843         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5844       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5845         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5846     }
5847   else if (EQ (coding_type, Qutf_16))
5848     {
5849       val = AREF (attrs, coding_attr_utf_bom);
5850       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5851                                     : EQ (val, Qt) ? utf_with_bom
5852                                     : utf_without_bom);
5853       val = AREF (attrs, coding_attr_utf_16_endian);
5854       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5855                                        : utf_16_little_endian);
5856       CODING_UTF_16_SURROGATE (coding) = 0;
5857       coding->detector = detect_coding_utf_16;
5858       coding->decoder = decode_coding_utf_16;
5859       coding->encoder = encode_coding_utf_16;
5860       coding->common_flags
5861         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5862       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5863         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5864     }
5865   else if (EQ (coding_type, Qccl))
5866     {
5867       coding->detector = detect_coding_ccl;
5868       coding->decoder = decode_coding_ccl;
5869       coding->encoder = encode_coding_ccl;
5870       coding->common_flags
5871         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5872             | CODING_REQUIRE_FLUSHING_MASK);
5873     }
5874   else if (EQ (coding_type, Qemacs_mule))
5875     {
5876       coding->detector = detect_coding_emacs_mule;
5877       coding->decoder = decode_coding_emacs_mule;
5878       coding->encoder = encode_coding_emacs_mule;
5879       coding->common_flags
5880         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5881       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5882           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5883         {
5884           Lisp_Object tail, safe_charsets;
5885           int max_charset_id = 0;
5886
5887           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5888                tail = XCDR (tail))
5889             if (max_charset_id < XFASTINT (XCAR (tail)))
5890               max_charset_id = XFASTINT (XCAR (tail));
5891           safe_charsets = make_uninit_string (max_charset_id + 1);
5892           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5893           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5894                tail = XCDR (tail))
5895             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5896           coding->max_charset_id = max_charset_id;
5897           coding->safe_charsets = SDATA (safe_charsets);
5898         }
5899       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5900       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5901     }
5902   else if (EQ (coding_type, Qshift_jis))
5903     {
5904       coding->detector = detect_coding_sjis;
5905       coding->decoder = decode_coding_sjis;
5906       coding->encoder = encode_coding_sjis;
5907       coding->common_flags
5908         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5909     }
5910   else if (EQ (coding_type, Qbig5))
5911     {
5912       coding->detector = detect_coding_big5;
5913       coding->decoder = decode_coding_big5;
5914       coding->encoder = encode_coding_big5;
5915       coding->common_flags
5916         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5917     }
5918   else                          /* EQ (coding_type, Qraw_text) */
5919     {
5920       coding->detector = NULL;
5921       coding->decoder = decode_coding_raw_text;
5922       coding->encoder = encode_coding_raw_text;
5923       if (! EQ (eol_type, Qunix))
5924         {
5925           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5926           if (! VECTORP (eol_type))
5927             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5928         }
5929
5930     }
5931
5932   return;
5933 }
5934
5935 /* Return a list of charsets supported by CODING.  */
5936
5937 Lisp_Object
5938 coding_charset_list (struct coding_system *coding)
5939 {
5940   Lisp_Object attrs, charset_list;
5941
5942   CODING_GET_INFO (coding, attrs, charset_list);
5943   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5944     {
5945       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5946
5947       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5948         charset_list = Viso_2022_charset_list;
5949     }
5950   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5951     {
5952       charset_list = Vemacs_mule_charset_list;
5953     }
5954   return charset_list;
5955 }
5956
5957
5958 /* Return a list of charsets supported by CODING-SYSTEM.  */
5959
5960 Lisp_Object
5961 coding_system_charset_list (Lisp_Object coding_system)
5962 {
5963   ptrdiff_t id;
5964   Lisp_Object attrs, charset_list;
5965
5966   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5967   attrs = CODING_ID_ATTRS (id);
5968
5969   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5970     {
5971       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5972
5973       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5974         charset_list = Viso_2022_charset_list;
5975       else
5976         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5977     }
5978   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5979     {
5980       charset_list = Vemacs_mule_charset_list;
5981     }
5982   else
5983     {
5984       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5985     }
5986   return charset_list;
5987 }
5988
5989
5990 /* Return raw-text or one of its subsidiaries that has the same
5991    eol_type as CODING-SYSTEM.  */
5992
5993 Lisp_Object
5994 raw_text_coding_system (Lisp_Object coding_system)
5995 {
5996   Lisp_Object spec, attrs;
5997   Lisp_Object eol_type, raw_text_eol_type;
5998
5999   if (NILP (coding_system))
6000     return Qraw_text;
6001   spec = CODING_SYSTEM_SPEC (coding_system);
6002   attrs = AREF (spec, 0);
6003
6004   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6005     return coding_system;
6006
6007   eol_type = AREF (spec, 2);
6008   if (VECTORP (eol_type))
6009     return Qraw_text;
6010   spec = CODING_SYSTEM_SPEC (Qraw_text);
6011   raw_text_eol_type = AREF (spec, 2);
6012   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6013           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6014           : AREF (raw_text_eol_type, 2));
6015 }
6016
6017
6018 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6019    the subsidiary that has the same eol-spec as PARENT (if it is not
6020    nil and specifies end-of-line format) or the system's setting
6021    (system_eol_type).  */
6022
6023 Lisp_Object
6024 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6025 {
6026   Lisp_Object spec, eol_type;
6027
6028   if (NILP (coding_system))
6029     coding_system = Qraw_text;
6030   spec = CODING_SYSTEM_SPEC (coding_system);
6031   eol_type = AREF (spec, 2);
6032   if (VECTORP (eol_type))
6033     {
6034       Lisp_Object parent_eol_type;
6035
6036       if (! NILP (parent))
6037         {
6038           Lisp_Object parent_spec;
6039
6040           parent_spec = CODING_SYSTEM_SPEC (parent);
6041           parent_eol_type = AREF (parent_spec, 2);
6042           if (VECTORP (parent_eol_type))
6043             parent_eol_type = system_eol_type;
6044         }
6045       else
6046         parent_eol_type = system_eol_type;
6047       if (EQ (parent_eol_type, Qunix))
6048         coding_system = AREF (eol_type, 0);
6049       else if (EQ (parent_eol_type, Qdos))
6050         coding_system = AREF (eol_type, 1);
6051       else if (EQ (parent_eol_type, Qmac))
6052         coding_system = AREF (eol_type, 2);
6053     }
6054   return coding_system;
6055 }
6056
6057
6058 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6059    decided for writing to a process.  If not, complement them, and
6060    return a new coding system.  */
6061
6062 Lisp_Object
6063 complement_process_encoding_system (Lisp_Object coding_system)
6064 {
6065   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6066   Lisp_Object spec, attrs;
6067   int i;
6068
6069   for (i = 0; i < 3; i++)
6070     {
6071       if (i == 1)
6072         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6073       else if (i == 2)
6074         coding_system = preferred_coding_system ();
6075       spec = CODING_SYSTEM_SPEC (coding_system);
6076       if (NILP (spec))
6077         continue;
6078       attrs = AREF (spec, 0);
6079       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6080         coding_base = CODING_ATTR_BASE_NAME (attrs);
6081       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6082         eol_base = coding_system;
6083       if (! NILP (coding_base) && ! NILP (eol_base))
6084         break;
6085     }
6086
6087   if (i > 0)
6088     /* The original CODING_SYSTEM didn't specify text-conversion or
6089        eol-conversion.  Be sure that we return a fully complemented
6090        coding system.  */
6091     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6092   return coding_system;
6093 }
6094
6095
6096 /* Emacs has a mechanism to automatically detect a coding system if it
6097    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6098    it's impossible to distinguish some coding systems accurately
6099    because they use the same range of codes.  So, at first, coding
6100    systems are grouped into the following categories:
6101
6102    o coding-category-emacs-mule
6103
6104         The category for a coding system which has the same code range
6105         as Emacs' internal format.  Assigned the coding-system (Lisp
6106         symbol) `emacs-mule' by default.
6107
6108    o coding-category-sjis
6109
6110         The category for a coding system which has the same code range
6111         as SJIS.  Assigned the coding-system (Lisp
6112         symbol) `japanese-shift-jis' by default.
6113
6114    o coding-category-iso-7
6115
6116         The category for a coding system which has the same code range
6117         as ISO2022 of 7-bit environment.  This doesn't use any locking
6118         shift and single shift functions.  This can encode/decode all
6119         charsets.  Assigned the coding-system (Lisp symbol)
6120         `iso-2022-7bit' by default.
6121
6122    o coding-category-iso-7-tight
6123
6124         Same as coding-category-iso-7 except that this can
6125         encode/decode only the specified charsets.
6126
6127    o coding-category-iso-8-1
6128
6129         The category for a coding system which has the same code range
6130         as ISO2022 of 8-bit environment and graphic plane 1 used only
6131         for DIMENSION1 charset.  This doesn't use any locking shift
6132         and single shift functions.  Assigned the coding-system (Lisp
6133         symbol) `iso-latin-1' by default.
6134
6135    o coding-category-iso-8-2
6136
6137         The category for a coding system which has the same code range
6138         as ISO2022 of 8-bit environment and graphic plane 1 used only
6139         for DIMENSION2 charset.  This doesn't use any locking shift
6140         and single shift functions.  Assigned the coding-system (Lisp
6141         symbol) `japanese-iso-8bit' by default.
6142
6143    o coding-category-iso-7-else
6144
6145         The category for a coding system which has the same code range
6146         as ISO2022 of 7-bit environment but uses locking shift or
6147         single shift functions.  Assigned the coding-system (Lisp
6148         symbol) `iso-2022-7bit-lock' by default.
6149
6150    o coding-category-iso-8-else
6151
6152         The category for a coding system which has the same code range
6153         as ISO2022 of 8-bit environment but uses locking shift or
6154         single shift functions.  Assigned the coding-system (Lisp
6155         symbol) `iso-2022-8bit-ss2' by default.
6156
6157    o coding-category-big5
6158
6159         The category for a coding system which has the same code range
6160         as BIG5.  Assigned the coding-system (Lisp symbol)
6161         `cn-big5' by default.
6162
6163    o coding-category-utf-8
6164
6165         The category for a coding system which has the same code range
6166         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6167         symbol) `utf-8' by default.
6168
6169    o coding-category-utf-16-be
6170
6171         The category for a coding system in which a text has an
6172         Unicode signature (cf. Unicode Standard) in the order of BIG
6173         endian at the head.  Assigned the coding-system (Lisp symbol)
6174         `utf-16-be' by default.
6175
6176    o coding-category-utf-16-le
6177
6178         The category for a coding system in which a text has an
6179         Unicode signature (cf. Unicode Standard) in the order of
6180         LITTLE endian at the head.  Assigned the coding-system (Lisp
6181         symbol) `utf-16-le' by default.
6182
6183    o coding-category-ccl
6184
6185         The category for a coding system of which encoder/decoder is
6186         written in CCL programs.  The default value is nil, i.e., no
6187         coding system is assigned.
6188
6189    o coding-category-binary
6190
6191         The category for a coding system not categorized in any of the
6192         above.  Assigned the coding-system (Lisp symbol)
6193         `no-conversion' by default.
6194
6195    Each of them is a Lisp symbol whose value is an actual
6196    `coding-system' (also a Lisp symbol) assigned by a user.
6197    What Emacs actually does is detect the category of a coding system;
6198    it then uses the `coding-system' assigned to that category.  If
6199    Emacs can't narrow it down to a single category, it selects the
6200    category of the highest priority.  Priorities of categories are also
6201    specified by a user in the Lisp variable `coding-category-list'.
6202
6203 */
6204
/* Forward declaration of a file-local function; only its signature
   is visible at this point.  */
static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
                                           int eol_seen);
6207
6208
6209 /* Return the number of ASCII characters at the head of the source.
6210    By side effects, set coding->head_ascii and update
6211    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6212    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6213    reliable only when all the source bytes are ASCII.  */
6214
6215 static ptrdiff_t
6216 check_ascii (struct coding_system *coding)
6217 {
6218   const unsigned char *src, *end;
6219   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6220   int eol_seen = coding->eol_seen;
6221
6222   coding_set_source (coding);
6223   src = coding->source;
6224   end = src + coding->src_bytes;
6225
6226   if (inhibit_eol_conversion
6227       || SYMBOLP (eol_type))
6228     {
6229       /* We don't have to check EOL format.  */
6230       while (src < end && !( *src & 0x80))
6231         {
6232           if (*src++ == '\n')
6233             eol_seen |= EOL_SEEN_LF;
6234         }
6235     }
6236   else
6237     {
6238       end--;                /* We look ahead one byte for "CR LF".  */
6239       while (src < end)
6240         {
6241           int c = *src;
6242
6243           if (c & 0x80)
6244             break;
6245           src++;
6246           if (c == '\r')
6247             {
6248               if (*src == '\n')
6249                 {
6250                   eol_seen |= EOL_SEEN_CRLF;
6251                   src++;
6252                 }
6253               else
6254                 eol_seen |= EOL_SEEN_CR;
6255             }
6256           else if (c == '\n')
6257             eol_seen |= EOL_SEEN_LF;
6258         }
6259       if (src == end)
6260         {
6261           int c = *src;
6262
6263           /* All bytes but the last one C are ASCII.  */
6264           if (! (c & 0x80))
6265             {
6266               if (c == '\r')
6267                 eol_seen |= EOL_SEEN_CR;
6268               else if (c  == '\n')
6269                 eol_seen |= EOL_SEEN_LF;
6270               src++;
6271             }
6272         }
6273     }
6274   coding->head_ascii = src - coding->source;
6275   coding->eol_seen = eol_seen;
6276   return (coding->head_ascii);
6277 }
6278
6279
6280 /* Return the number of characters at the source if all the bytes are
6281    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6282    effects, update coding->eol_seen.  The value of coding->eol_seen is
6283    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6284    the value is reliable only when all the source bytes are valid
6285    UTF-8.  */
6286
6287 static ptrdiff_t
6288 check_utf_8 (struct coding_system *coding)
6289 {
6290   const unsigned char *src, *end;
6291   int eol_seen;
6292   ptrdiff_t nchars = coding->head_ascii;
6293
6294   if (coding->head_ascii < 0)
6295     check_ascii (coding);
6296   else
6297     coding_set_source (coding);
6298   src = coding->source + coding->head_ascii;
6299   /* We look ahead one byte for CR LF.  */
6300   end = coding->source + coding->src_bytes - 1;
6301   eol_seen = coding->eol_seen;
6302   while (src < end)
6303     {
6304       int c = *src;
6305
6306       if (UTF_8_1_OCTET_P (*src))
6307         {
6308           src++;
6309           if (c < 0x20)
6310             {
6311               if (c == '\r')
6312                 {
6313                   if (*src == '\n')
6314                     {
6315                       eol_seen |= EOL_SEEN_CRLF;
6316                       src++;
6317                       nchars++;
6318                     }
6319                   else
6320                     eol_seen |= EOL_SEEN_CR;
6321                 }
6322               else if (c == '\n')
6323                 eol_seen |= EOL_SEEN_LF;
6324             }
6325         }
6326       else if (UTF_8_2_OCTET_LEADING_P (c))
6327         {
6328           if (c < 0xC2          /* overlong sequence */
6329               || src + 1 >= end
6330               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6331             return -1;
6332           src += 2;
6333         }
6334       else if (UTF_8_3_OCTET_LEADING_P (c))
6335         {
6336           if (src + 2 >= end
6337               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6338                     && UTF_8_EXTRA_OCTET_P (src[2])))
6339             return -1;
6340           c = (((c & 0xF) << 12)
6341                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6342           if (c < 0x800                       /* overlong sequence */
6343               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6344             return -1;
6345           src += 3;
6346         }
6347       else if (UTF_8_4_OCTET_LEADING_P (c))
6348         {
6349           if (src + 3 >= end
6350               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6351                     && UTF_8_EXTRA_OCTET_P (src[2])
6352                     && UTF_8_EXTRA_OCTET_P (src[3])))
6353             return -1;
6354           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6355                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6356           if (c < 0x10000       /* overlong sequence */
6357               || c >= 0x110000) /* non-Unicode character  */
6358             return -1;
6359           src += 4;
6360         }
6361       else
6362         return -1;
6363       nchars++;
6364     }
6365
6366   if (src == end)
6367     {
6368       if (! UTF_8_1_OCTET_P (*src))
6369         return -1;
6370       nchars++;
6371       if (*src == '\r')
6372         eol_seen |= EOL_SEEN_CR;
6373       else if (*src  == '\n')
6374         eol_seen |= EOL_SEEN_LF;
6375     }
6376   coding->eol_seen = eol_seen;
6377   return nchars;
6378 }
6379
6380
6381 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6382    SOURCE is encoded.  If CATEGORY is one of
6383    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6384    two-byte, else they are encoded by one-byte.
6385
6386    Return one of EOL_SEEN_XXX.  */
6387
6388 #define MAX_EOL_CHECK_COUNT 3
6389
6390 static int
6391 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6392             enum coding_category category)
6393 {
6394   const unsigned char *src = source, *src_end = src + src_bytes;
6395   unsigned char c;
6396   int total  = 0;
6397   int eol_seen = EOL_SEEN_NONE;
6398
6399   if ((1 << category) & CATEGORY_MASK_UTF_16)
6400     {
6401       bool msb = category == (coding_category_utf_16_le
6402                               | coding_category_utf_16_le_nosig);
6403       bool lsb = !msb;
6404
6405       while (src + 1 < src_end)
6406         {
6407           c = src[lsb];
6408           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6409             {
6410               int this_eol;
6411
6412               if (c == '\n')
6413                 this_eol = EOL_SEEN_LF;
6414               else if (src + 3 >= src_end
6415                        || src[msb + 2] != 0
6416                        || src[lsb + 2] != '\n')
6417                 this_eol = EOL_SEEN_CR;
6418               else
6419                 {
6420                   this_eol = EOL_SEEN_CRLF;
6421                   src += 2;
6422                 }
6423
6424               if (eol_seen == EOL_SEEN_NONE)
6425                 /* This is the first end-of-line.  */
6426                 eol_seen = this_eol;
6427               else if (eol_seen != this_eol)
6428                 {
6429                   /* The found type is different from what found before.
6430                      Allow for stray ^M characters in DOS EOL files.  */
6431                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6432                       || (eol_seen == EOL_SEEN_CRLF
6433                           && this_eol == EOL_SEEN_CR))
6434                     eol_seen = EOL_SEEN_CRLF;
6435                   else
6436                     {
6437                       eol_seen = EOL_SEEN_LF;
6438                       break;
6439                     }
6440                 }
6441               if (++total == MAX_EOL_CHECK_COUNT)
6442                 break;
6443             }
6444           src += 2;
6445         }
6446     }
6447   else
6448     while (src < src_end)
6449       {
6450         c = *src++;
6451         if (c == '\n' || c == '\r')
6452           {
6453             int this_eol;
6454
6455             if (c == '\n')
6456               this_eol = EOL_SEEN_LF;
6457             else if (src >= src_end || *src != '\n')
6458               this_eol = EOL_SEEN_CR;
6459             else
6460               this_eol = EOL_SEEN_CRLF, src++;
6461
6462             if (eol_seen == EOL_SEEN_NONE)
6463               /* This is the first end-of-line.  */
6464               eol_seen = this_eol;
6465             else if (eol_seen != this_eol)
6466               {
6467                 /* The found type is different from what found before.
6468                    Allow for stray ^M characters in DOS EOL files.  */
6469                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6470                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6471                   eol_seen = EOL_SEEN_CRLF;
6472                 else
6473                   {
6474                     eol_seen = EOL_SEEN_LF;
6475                     break;
6476                   }
6477               }
6478             if (++total == MAX_EOL_CHECK_COUNT)
6479               break;
6480           }
6481       }
6482   return eol_seen;
6483 }
6484
6485
6486 static Lisp_Object
6487 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6488 {
6489   Lisp_Object eol_type;
6490
6491   eol_type = CODING_ID_EOL_TYPE (coding->id);
6492   if (! VECTORP (eol_type))
6493     /* Already adjusted.  */
6494     return eol_type;
6495   if (eol_seen & EOL_SEEN_LF)
6496     {
6497       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6498       eol_type = Qunix;
6499     }
6500   else if (eol_seen & EOL_SEEN_CRLF)
6501     {
6502       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6503       eol_type = Qdos;
6504     }
6505   else if (eol_seen & EOL_SEEN_CR)
6506     {
6507       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6508       eol_type = Qmac;
6509     }
6510   return eol_type;
6511 }
6512
6513 /* Detect how a text specified in CODING is encoded.  If a coding
6514    system is detected, update fields of CODING by the detected coding
6515    system.  */
6516
6517 static void
6518 detect_coding (struct coding_system *coding)
6519 {
6520   const unsigned char *src, *src_end;
6521   unsigned int saved_mode = coding->mode;
6522   Lisp_Object found = Qnil;
6523   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6524
6525   coding->consumed = coding->consumed_char = 0;
6526   coding->produced = coding->produced_char = 0;
6527   coding_set_source (coding);
6528
6529   src_end = coding->source + coding->src_bytes;
6530
6531   coding->eol_seen = EOL_SEEN_NONE;
6532   /* If we have not yet decided the text encoding type, detect it
6533      now.  */
6534   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6535     {
6536       int c, i;
6537       struct coding_detection_info detect_info;
6538       bool null_byte_found = 0, eight_bit_found = 0;
6539       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6540                                        inhibit_null_byte_detection);
6541       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6542                                        inhibit_iso_escape_detection);
6543       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6544
6545       coding->head_ascii = 0;
6546       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6547       for (src = coding->source; src < src_end; src++)
6548         {
6549           c = *src;
6550           if (c & 0x80)
6551             {
6552               eight_bit_found = 1;
6553               if (null_byte_found)
6554                 break;
6555             }
6556           else if (c < 0x20)
6557             {
6558               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6559                   && ! inhibit_ied
6560                   && ! detect_info.checked)
6561                 {
6562                   if (detect_coding_iso_2022 (coding, &detect_info))
6563                     {
6564                       /* We have scanned the whole data.  */
6565                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6566                         {
6567                           /* We didn't find an 8-bit code.  We may
6568                              have found a null-byte, but it's very
6569                              rare that a binary file conforms to
6570                              ISO-2022.  */
6571                           src = src_end;
6572                           coding->head_ascii = src - coding->source;
6573                         }
6574                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6575                       break;
6576                     }
6577                 }
6578               else if (! c && !inhibit_nbd)
6579                 {
6580                   null_byte_found = 1;
6581                   if (eight_bit_found)
6582                     break;
6583                 }
6584               else if (! disable_ascii_optimization
6585                        && ! inhibit_eol_conversion)
6586                 {
6587                   if (c == '\r')
6588                     {
6589                       if (src < src_end && src[1] == '\n')
6590                         {
6591                           coding->eol_seen |= EOL_SEEN_CRLF;
6592                           src++;
6593                           if (! eight_bit_found)
6594                             coding->head_ascii++;
6595                         }
6596                       else
6597                         coding->eol_seen |= EOL_SEEN_CR;
6598                     }
6599                   else if (c == '\n')
6600                     {
6601                       coding->eol_seen |= EOL_SEEN_LF;
6602                     }
6603                 }
6604
6605               if (! eight_bit_found)
6606                 coding->head_ascii++;
6607             }
6608           else if (! eight_bit_found)
6609             coding->head_ascii++;
6610         }
6611
6612       if (null_byte_found || eight_bit_found
6613           || coding->head_ascii < coding->src_bytes
6614           || detect_info.found)
6615         {
6616           enum coding_category category;
6617           struct coding_system *this;
6618
6619           if (coding->head_ascii == coding->src_bytes)
6620             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6621             for (i = 0; i < coding_category_raw_text; i++)
6622               {
6623                 category = coding_priorities[i];
6624                 this = coding_categories + category;
6625                 if (detect_info.found & (1 << category))
6626                   break;
6627               }
6628           else
6629             {
6630               if (null_byte_found)
6631                 {
6632                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6633                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6634                 }
6635               else if (prefer_utf_8
6636                        && detect_coding_utf_8 (coding, &detect_info))
6637                 {
6638                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6639                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6640                 }
6641               for (i = 0; i < coding_category_raw_text; i++)
6642                 {
6643                   category = coding_priorities[i];
6644                   this = coding_categories + category;
6645                   /* Some of this->detector (e.g. detect_coding_sjis)
6646                      require this information.  */
6647                   coding->id = this->id;
6648                   if (this->id < 0)
6649                     {
6650                       /* No coding system of this category is defined.  */
6651                       detect_info.rejected |= (1 << category);
6652                     }
6653                   else if (category >= coding_category_raw_text)
6654                     continue;
6655                   else if (detect_info.checked & (1 << category))
6656                     {
6657                       if (detect_info.found & (1 << category))
6658                         break;
6659                     }
6660                   else if ((*(this->detector)) (coding, &detect_info)
6661                            && detect_info.found & (1 << category))
6662                     break;
6663                 }
6664             }
6665
6666           if (i < coding_category_raw_text)
6667             {
6668               if (category == coding_category_utf_8_auto)
6669                 {
6670                   Lisp_Object coding_systems;
6671
6672                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6673                                          coding_attr_utf_bom);
6674                   if (CONSP (coding_systems))
6675                     {
6676                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6677                         found = XCAR (coding_systems);
6678                       else
6679                         found = XCDR (coding_systems);
6680                     }
6681                   else
6682                     found = CODING_ID_NAME (this->id);
6683                 }
6684               else if (category == coding_category_utf_16_auto)
6685                 {
6686                   Lisp_Object coding_systems;
6687
6688                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6689                                          coding_attr_utf_bom);
6690                   if (CONSP (coding_systems))
6691                     {
6692                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6693                         found = XCAR (coding_systems);
6694                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6695                         found = XCDR (coding_systems);
6696                     }
6697                   else
6698                     found = CODING_ID_NAME (this->id);
6699                 }
6700               else
6701                 found = CODING_ID_NAME (this->id);
6702             }
6703           else if (null_byte_found)
6704             found = Qno_conversion;
6705           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6706                    == CATEGORY_MASK_ANY)
6707             found = Qraw_text;
6708           else if (detect_info.rejected)
6709             for (i = 0; i < coding_category_raw_text; i++)
6710               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6711                 {
6712                   this = coding_categories + coding_priorities[i];
6713                   found = CODING_ID_NAME (this->id);
6714                   break;
6715                 }
6716         }
6717     }
6718   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6719            == coding_category_utf_8_auto)
6720     {
6721       Lisp_Object coding_systems;
6722       struct coding_detection_info detect_info;
6723
6724       coding_systems
6725         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6726       detect_info.found = detect_info.rejected = 0;
6727       if (check_ascii (coding) == coding->src_bytes)
6728         {
6729           if (CONSP (coding_systems))
6730             found = XCDR (coding_systems);
6731         }
6732       else
6733         {
6734           if (CONSP (coding_systems)
6735               && detect_coding_utf_8 (coding, &detect_info))
6736             {
6737               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6738                 found = XCAR (coding_systems);
6739               else
6740                 found = XCDR (coding_systems);
6741             }
6742         }
6743     }
6744   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6745            == coding_category_utf_16_auto)
6746     {
6747       Lisp_Object coding_systems;
6748       struct coding_detection_info detect_info;
6749
6750       coding_systems
6751         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6752       detect_info.found = detect_info.rejected = 0;
6753       coding->head_ascii = 0;
6754       if (CONSP (coding_systems)
6755           && detect_coding_utf_16 (coding, &detect_info))
6756         {
6757           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6758             found = XCAR (coding_systems);
6759           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6760             found = XCDR (coding_systems);
6761         }
6762     }
6763
6764   if (! NILP (found))
6765     {
6766       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6767                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6768                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6769                            : EOL_SEEN_LF);
6770
6771       setup_coding_system (found, coding);
6772       if (specified_eol != EOL_SEEN_NONE)
6773         adjust_coding_eol_type (coding, specified_eol);
6774     }
6775
6776   coding->mode = saved_mode;
6777 }
6778
6779
6780 static void
6781 decode_eol (struct coding_system *coding)
6782 {
6783   Lisp_Object eol_type;
6784   unsigned char *p, *pbeg, *pend;
6785
6786   eol_type = CODING_ID_EOL_TYPE (coding->id);
6787   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6788     return;
6789
6790   if (NILP (coding->dst_object))
6791     pbeg = coding->destination;
6792   else
6793     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6794   pend = pbeg + coding->produced;
6795
6796   if (VECTORP (eol_type))
6797     {
6798       int eol_seen = EOL_SEEN_NONE;
6799
6800       for (p = pbeg; p < pend; p++)
6801         {
6802           if (*p == '\n')
6803             eol_seen |= EOL_SEEN_LF;
6804           else if (*p == '\r')
6805             {
6806               if (p + 1 < pend && *(p + 1) == '\n')
6807                 {
6808                   eol_seen |= EOL_SEEN_CRLF;
6809                   p++;
6810                 }
6811               else
6812                 eol_seen |= EOL_SEEN_CR;
6813             }
6814         }
6815       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6816       if ((eol_seen & EOL_SEEN_CRLF) != 0
6817           && (eol_seen & EOL_SEEN_CR) != 0
6818           && (eol_seen & EOL_SEEN_LF) == 0)
6819         eol_seen = EOL_SEEN_CRLF;
6820       else if (eol_seen != EOL_SEEN_NONE
6821           && eol_seen != EOL_SEEN_LF
6822           && eol_seen != EOL_SEEN_CRLF
6823           && eol_seen != EOL_SEEN_CR)
6824         eol_seen = EOL_SEEN_LF;
6825       if (eol_seen != EOL_SEEN_NONE)
6826         eol_type = adjust_coding_eol_type (coding, eol_seen);
6827     }
6828
6829   if (EQ (eol_type, Qmac))
6830     {
6831       for (p = pbeg; p < pend; p++)
6832         if (*p == '\r')
6833           *p = '\n';
6834     }
6835   else if (EQ (eol_type, Qdos))
6836     {
6837       ptrdiff_t n = 0;
6838
6839       if (NILP (coding->dst_object))
6840         {
6841           /* Start deleting '\r' from the tail to minimize the memory
6842              movement.  */
6843           for (p = pend - 2; p >= pbeg; p--)
6844             if (*p == '\r')
6845               {
6846                 memmove (p, p + 1, pend-- - p - 1);
6847                 n++;
6848               }
6849         }
6850       else
6851         {
6852           ptrdiff_t pos_byte = coding->dst_pos_byte;
6853           ptrdiff_t pos = coding->dst_pos;
6854           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6855
6856           while (pos < pos_end)
6857             {
6858               p = BYTE_POS_ADDR (pos_byte);
6859               if (*p == '\r' && p[1] == '\n')
6860                 {
6861                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6862                   n++;
6863                   pos_end--;
6864                 }
6865               pos++;
6866               if (coding->dst_multibyte)
6867                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6868               else
6869                 pos_byte++;
6870             }
6871         }
6872       coding->produced -= n;
6873       coding->produced_char -= n;
6874     }
6875 }
6876
6877
6878 /* Return a translation table (or list of them) from coding system
6879    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6880    not ENCODEP). */
6881
6882 static Lisp_Object
6883 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6884 {
6885   Lisp_Object standard, translation_table;
6886   Lisp_Object val;
6887
6888   if (NILP (Venable_character_translation))
6889     {
6890       if (max_lookup)
6891         *max_lookup = 0;
6892       return Qnil;
6893     }
6894   if (encodep)
6895     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6896       standard = Vstandard_translation_table_for_encode;
6897   else
6898     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6899       standard = Vstandard_translation_table_for_decode;
6900   if (NILP (translation_table))
6901     translation_table = standard;
6902   else
6903     {
6904       if (SYMBOLP (translation_table))
6905         translation_table = Fget (translation_table, Qtranslation_table);
6906       else if (CONSP (translation_table))
6907         {
6908           translation_table = Fcopy_sequence (translation_table);
6909           for (val = translation_table; CONSP (val); val = XCDR (val))
6910             if (SYMBOLP (XCAR (val)))
6911               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6912         }
6913       if (CHAR_TABLE_P (standard))
6914         {
6915           if (CONSP (translation_table))
6916             translation_table = nconc2 (translation_table, list1 (standard));
6917           else
6918             translation_table = list2 (translation_table, standard);
6919         }
6920     }
6921
6922   if (max_lookup)
6923     {
6924       *max_lookup = 1;
6925       if (CHAR_TABLE_P (translation_table)
6926           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6927         {
6928           val = XCHAR_TABLE (translation_table)->extras[1];
6929           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6930             *max_lookup = XFASTINT (val);
6931         }
6932       else if (CONSP (translation_table))
6933         {
6934           Lisp_Object tail;
6935
6936           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6937             if (CHAR_TABLE_P (XCAR (tail))
6938                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6939               {
6940                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6941                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6942                   *max_lookup = XFASTINT (tailval);
6943               }
6944         }
6945     }
6946   return translation_table;
6947 }
6948
/* Look up character C in TABLE and leave the outcome in C and TRANS.

   TABLE is a char-table or a list; list elements that are not
   char-tables are skipped.  TRANS is first set to Qnil.  When the
   entry found for C is a character, C is replaced by that character
   and TRANS is reset to Qnil; in a list, the scan then goes on with
   the following tables using the new C.  When the entry is any other
   non-nil value, it is left in TRANS, and in a list the scan stops
   there.  If TABLE is neither a char-table nor a cons, C is unchanged
   and TRANS is Qnil.

   C and TRANS are assigned to, so they must be lvalues.  TABLE and C
   are evaluated more than once.  The list case declares a local
   variable named `tail', so TABLE must not be an expression that
   refers to a variable of that name.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6973
6974
6975 /* Return a translation of character(s) at BUF according to TRANS.
6976    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6977    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6978    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6979    translation is found, and Qnil if not found..
6980    If BUF is too short to lookup characters in FROM, return Qt.  */
6981
6982 static Lisp_Object
6983 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6984 {
6985
6986   if (INTEGERP (trans))
6987     return trans;
6988   for (; CONSP (trans); trans = XCDR (trans))
6989     {
6990       Lisp_Object val = XCAR (trans);
6991       Lisp_Object from = XCAR (val);
6992       ptrdiff_t len = ASIZE (from);
6993       ptrdiff_t i;
6994
6995       for (i = 0; i < len; i++)
6996         {
6997           if (buf + i == buf_end)
6998             return Qt;
6999           if (XINT (AREF (from, i)) != buf[i])
7000             break;
7001         }
7002       if (i == len)
7003         return val;
7004     }
7005   return Qnil;
7006 }
7007
7008
7009 static int
7010 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7011                bool last_block)
7012 {
7013   unsigned char *dst = coding->destination + coding->produced;
7014   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7015   ptrdiff_t produced;
7016   ptrdiff_t produced_chars = 0;
7017   int carryover = 0;
7018
7019   if (! coding->chars_at_source)
7020     {
7021       /* Source characters are in coding->charbuf.  */
7022       int *buf = coding->charbuf;
7023       int *buf_end = buf + coding->charbuf_used;
7024
7025       if (EQ (coding->src_object, coding->dst_object))
7026         {
7027           coding_set_source (coding);
7028           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7029         }
7030
7031       while (buf < buf_end)
7032         {
7033           int c = *buf;
7034           ptrdiff_t i;
7035
7036           if (c >= 0)
7037             {
7038               ptrdiff_t from_nchars = 1, to_nchars = 1;
7039               Lisp_Object trans = Qnil;
7040
7041               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7042               if (! NILP (trans))
7043                 {
7044                   trans = get_translation (trans, buf, buf_end);
7045                   if (INTEGERP (trans))
7046                     c = XINT (trans);
7047                   else if (CONSP (trans))
7048                     {
7049                       from_nchars = ASIZE (XCAR (trans));
7050                       trans = XCDR (trans);
7051                       if (INTEGERP (trans))
7052                         c = XINT (trans);
7053                       else
7054                         {
7055                           to_nchars = ASIZE (trans);
7056                           c = XINT (AREF (trans, 0));
7057                         }
7058                     }
7059                   else if (EQ (trans, Qt) && ! last_block)
7060                     break;
7061                 }
7062
7063               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7064                 {
7065                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7066                        / MAX_MULTIBYTE_LENGTH)
7067                       < to_nchars)
7068                     memory_full (SIZE_MAX);
7069                   dst = alloc_destination (coding,
7070                                            buf_end - buf
7071                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7072                                            dst);
7073                   if (EQ (coding->src_object, coding->dst_object))
7074                     {
7075                       coding_set_source (coding);
7076                       dst_end = (((unsigned char *) coding->source)
7077                                  + coding->consumed);
7078                     }
7079                   else
7080                     dst_end = coding->destination + coding->dst_bytes;
7081                 }
7082
7083               for (i = 0; i < to_nchars; i++)
7084                 {
7085                   if (i > 0)
7086                     c = XINT (AREF (trans, i));
7087                   if (coding->dst_multibyte
7088                       || ! CHAR_BYTE8_P (c))
7089                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7090                   else
7091                     *dst++ = CHAR_TO_BYTE8 (c);
7092                 }
7093               produced_chars += to_nchars;
7094               buf += from_nchars;
7095             }
7096           else
7097             /* This is an annotation datum.  (-C) is the length.  */
7098             buf += -c;
7099         }
7100       carryover = buf_end - buf;
7101     }
7102   else
7103     {
7104       /* Source characters are at coding->source.  */
7105       const unsigned char *src = coding->source;
7106       const unsigned char *src_end = src + coding->consumed;
7107
7108       if (EQ (coding->dst_object, coding->src_object))
7109         dst_end = (unsigned char *) src;
7110       if (coding->src_multibyte != coding->dst_multibyte)
7111         {
7112           if (coding->src_multibyte)
7113             {
7114               bool multibytep = 1;
7115               ptrdiff_t consumed_chars = 0;
7116
7117               while (1)
7118                 {
7119                   const unsigned char *src_base = src;
7120                   int c;
7121
7122                   ONE_MORE_BYTE (c);
7123                   if (dst == dst_end)
7124                     {
7125                       if (EQ (coding->src_object, coding->dst_object))
7126                         dst_end = (unsigned char *) src;
7127                       if (dst == dst_end)
7128                         {
7129                           ptrdiff_t offset = src - coding->source;
7130
7131                           dst = alloc_destination (coding, src_end - src + 1,
7132                                                    dst);
7133                           dst_end = coding->destination + coding->dst_bytes;
7134                           coding_set_source (coding);
7135                           src = coding->source + offset;
7136                           src_end = coding->source + coding->consumed;
7137                           if (EQ (coding->src_object, coding->dst_object))
7138                             dst_end = (unsigned char *) src;
7139                         }
7140                     }
7141                   *dst++ = c;
7142                   produced_chars++;
7143                 }
7144             no_more_source:
7145               ;
7146             }
7147           else
7148             while (src < src_end)
7149               {
7150                 bool multibytep = 1;
7151                 int c = *src++;
7152
7153                 if (dst >= dst_end - 1)
7154                   {
7155                     if (EQ (coding->src_object, coding->dst_object))
7156                       dst_end = (unsigned char *) src;
7157                     if (dst >= dst_end - 1)
7158                       {
7159                         ptrdiff_t offset = src - coding->source;
7160                         ptrdiff_t more_bytes;
7161
7162                         if (EQ (coding->src_object, coding->dst_object))
7163                           more_bytes = ((src_end - src) / 2) + 2;
7164                         else
7165                           more_bytes = src_end - src + 2;
7166                         dst = alloc_destination (coding, more_bytes, dst);
7167                         dst_end = coding->destination + coding->dst_bytes;
7168                         coding_set_source (coding);
7169                         src = coding->source + offset;
7170                         src_end = coding->source + coding->consumed;
7171                         if (EQ (coding->src_object, coding->dst_object))
7172                           dst_end = (unsigned char *) src;
7173                       }
7174                   }
7175                 EMIT_ONE_BYTE (c);
7176               }
7177         }
7178       else
7179         {
7180           if (!EQ (coding->src_object, coding->dst_object))
7181             {
7182               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7183
7184               if (require > 0)
7185                 {
7186                   ptrdiff_t offset = src - coding->source;
7187
7188                   dst = alloc_destination (coding, require, dst);
7189                   coding_set_source (coding);
7190                   src = coding->source + offset;
7191                   src_end = coding->source + coding->consumed;
7192                 }
7193             }
7194           produced_chars = coding->consumed_char;
7195           while (src < src_end)
7196             *dst++ = *src++;
7197         }
7198     }
7199
7200   produced = dst - (coding->destination + coding->produced);
7201   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7202     insert_from_gap (produced_chars, produced, 0);
7203   coding->produced += produced;
7204   coding->produced_char += produced_chars;
7205   return carryover;
7206 }
7207
7208 /* Compose text in CODING->object according to the annotation data at
7209    CHARBUF.  CHARBUF is an array:
7210      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7211  */
7212
7213 static void
7214 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7215 {
7216   int len;
7217   ptrdiff_t to;
7218   enum composition_method method;
7219   Lisp_Object components;
7220
7221   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7222   to = pos + charbuf[2];
7223   method = (enum composition_method) (charbuf[4]);
7224
7225   if (method == COMPOSITION_RELATIVE)
7226     components = Qnil;
7227   else
7228     {
7229       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7230       int i, j;
7231
7232       if (method == COMPOSITION_WITH_RULE)
7233         len = charbuf[2] * 3 - 2;
7234       charbuf += MAX_ANNOTATION_LENGTH;
7235       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7236       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7237         {
7238           if (charbuf[i] >= 0)
7239             args[j] = make_number (charbuf[i]);
7240           else
7241             {
7242               i++;
7243               args[j] = make_number (charbuf[i] % 0x100);
7244             }
7245         }
7246       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7247     }
7248   compose_text (pos, to, components, Qnil, coding->dst_object);
7249 }
7250
7251
7252 /* Put `charset' property on text in CODING->object according to
7253    the annotation data at CHARBUF.  CHARBUF is an array:
7254      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7255  */
7256
7257 static void
7258 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7259 {
7260   ptrdiff_t from = pos - charbuf[2];
7261   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7262
7263   Fput_text_property (make_number (from), make_number (pos),
7264                       Qcharset, CHARSET_NAME (charset),
7265                       coding->dst_object);
7266 }
7267
/* Upper and lower bounds, in elements, of the work area allocated by
   ALLOC_CONVERSION_WORK_AREA.  */
#define MAX_CHARBUF_SIZE 0x4000
#define MIN_CHARBUF_SIZE 0x10

/* Allocate the work area CODING->charbuf with SAFE_ALLOCA and record
   its element count in CODING->charbuf_size.  The count is SIZE
   clamped to the range MIN_CHARBUF_SIZE .. MAX_CHARBUF_SIZE.

   SIZE is evaluated exactly once, so an argument with side effects
   is safe; CODING is evaluated twice.  Because SAFE_ALLOCA is used,
   this presumably must be invoked where USE_SAFE_ALLOCA is in
   effect, as the callers in this file do.  */

#define ALLOC_CONVERSION_WORK_AREA(coding, size)                        \
  do {                                                                  \
    ptrdiff_t work_area_size_ = (size);                                 \
    int work_area_units_                                                \
      = (int) (work_area_size_ > MAX_CHARBUF_SIZE ? MAX_CHARBUF_SIZE    \
               : work_area_size_ < MIN_CHARBUF_SIZE ? MIN_CHARBUF_SIZE  \
               : work_area_size_);                                      \
    (coding)->charbuf = SAFE_ALLOCA (work_area_units_ * sizeof (int));  \
    (coding)->charbuf_size = work_area_units_;                          \
  } while (0)
7279
7280
7281 static void
7282 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7283 {
7284   int *charbuf = coding->charbuf;
7285   int *charbuf_end = charbuf + coding->charbuf_used;
7286
7287   if (NILP (coding->dst_object))
7288     return;
7289
7290   while (charbuf < charbuf_end)
7291     {
7292       if (*charbuf >= 0)
7293         pos++, charbuf++;
7294       else
7295         {
7296           int len = -*charbuf;
7297
7298           if (len > 2)
7299             switch (charbuf[1])
7300               {
7301               case CODING_ANNOTATE_COMPOSITION_MASK:
7302                 produce_composition (coding, charbuf, pos);
7303                 break;
7304               case CODING_ANNOTATE_CHARSET_MASK:
7305                 produce_charset (coding, charbuf, pos);
7306                 break;
7307               }
7308           charbuf += len;
7309         }
7310     }
7311 }
7312
7313 /* Decode the data at CODING->src_object into CODING->dst_object.
7314    CODING->src_object is a buffer, a string, or nil.
7315    CODING->dst_object is a buffer.
7316
7317    If CODING->src_object is a buffer, it must be the current buffer.
7318    In this case, if CODING->src_pos is positive, it is a position of
7319    the source text in the buffer, otherwise, the source text is in the
7320    gap area of the buffer, and CODING->src_pos specifies the offset of
7321    the text from GPT (which must be the same as PT).  If this is the
7322    same buffer as CODING->dst_object, CODING->src_pos must be
7323    negative.
7324
7325    If CODING->src_object is a string, CODING->src_pos is an index to
7326    that string.
7327
7328    If CODING->src_object is nil, CODING->source must already point to
7329    the non-relocatable memory area.  In this case, CODING->src_pos is
7330    an offset from CODING->source.
7331
7332    The decoded data is inserted at the current point of the buffer
7333    CODING->dst_object.
7334 */
7335
7336 static void
7337 decode_coding (struct coding_system *coding)
7338 {
7339   Lisp_Object attrs;
7340   Lisp_Object undo_list;
7341   Lisp_Object translation_table;
7342   struct ccl_spec cclspec;
7343   int carryover;
7344   int i;
7345
7346   USE_SAFE_ALLOCA;
7347
7348   if (BUFFERP (coding->src_object)
7349       && coding->src_pos > 0
7350       && coding->src_pos < GPT
7351       && coding->src_pos + coding->src_chars > GPT)
7352     move_gap_both (coding->src_pos, coding->src_pos_byte);
7353
7354   undo_list = Qt;
7355   if (BUFFERP (coding->dst_object))
7356     {
7357       set_buffer_internal (XBUFFER (coding->dst_object));
7358       if (GPT != PT)
7359         move_gap_both (PT, PT_BYTE);
7360
7361       /* We must disable undo_list in order to record the whole insert
7362          transaction via record_insert at the end.  But doing so also
7363          disables the recording of the first change to the undo_list.
7364          Therefore we check for first change here and record it via
7365          record_first_change if needed.  */
7366       if (MODIFF <= SAVE_MODIFF)
7367         record_first_change ();
7368
7369       undo_list = BVAR (current_buffer, undo_list);
7370       bset_undo_list (current_buffer, Qt);
7371     }
7372
7373   coding->consumed = coding->consumed_char = 0;
7374   coding->produced = coding->produced_char = 0;
7375   coding->chars_at_source = 0;
7376   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7377   coding->errors = 0;
7378
7379   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7380
7381   attrs = CODING_ID_ATTRS (coding->id);
7382   translation_table = get_translation_table (attrs, 0, NULL);
7383
7384   carryover = 0;
7385   if (coding->decoder == decode_coding_ccl)
7386     {
7387       coding->spec.ccl = &cclspec;
7388       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7389     }
7390   do
7391     {
7392       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7393
7394       coding_set_source (coding);
7395       coding->annotated = 0;
7396       coding->charbuf_used = carryover;
7397       (*(coding->decoder)) (coding);
7398       coding_set_destination (coding);
7399       carryover = produce_chars (coding, translation_table, 0);
7400       if (coding->annotated)
7401         produce_annotation (coding, pos);
7402       for (i = 0; i < carryover; i++)
7403         coding->charbuf[i]
7404           = coding->charbuf[coding->charbuf_used - carryover + i];
7405     }
7406   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7407          || (coding->consumed < coding->src_bytes
7408              && (coding->result == CODING_RESULT_SUCCESS
7409                  || coding->result == CODING_RESULT_INVALID_SRC)));
7410
7411   if (carryover > 0)
7412     {
7413       coding_set_destination (coding);
7414       coding->charbuf_used = carryover;
7415       produce_chars (coding, translation_table, 1);
7416     }
7417
7418   coding->carryover_bytes = 0;
7419   if (coding->consumed < coding->src_bytes)
7420     {
7421       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7422       const unsigned char *src;
7423
7424       coding_set_source (coding);
7425       coding_set_destination (coding);
7426       src = coding->source + coding->consumed;
7427
7428       if (coding->mode & CODING_MODE_LAST_BLOCK)
7429         {
7430           /* Flush out unprocessed data as binary chars.  We are sure
7431              that the number of data is less than the size of
7432              coding->charbuf.  */
7433           coding->charbuf_used = 0;
7434           coding->chars_at_source = 0;
7435
7436           while (nbytes-- > 0)
7437             {
7438               int c = *src++;
7439
7440               if (c & 0x80)
7441                 c = BYTE8_TO_CHAR (c);
7442               coding->charbuf[coding->charbuf_used++] = c;
7443             }
7444           produce_chars (coding, Qnil, 1);
7445         }
7446       else
7447         {
7448           /* Record unprocessed bytes in coding->carryover.  We are
7449              sure that the number of data is less than the size of
7450              coding->carryover.  */
7451           unsigned char *p = coding->carryover;
7452
7453           if (nbytes > sizeof coding->carryover)
7454             nbytes = sizeof coding->carryover;
7455           coding->carryover_bytes = nbytes;
7456           while (nbytes-- > 0)
7457             *p++ = *src++;
7458         }
7459       coding->consumed = coding->src_bytes;
7460     }
7461
7462   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7463       && !inhibit_eol_conversion)
7464     decode_eol (coding);
7465   if (BUFFERP (coding->dst_object))
7466     {
7467       bset_undo_list (current_buffer, undo_list);
7468       record_insert (coding->dst_pos, coding->produced_char);
7469     }
7470
7471   SAFE_FREE ();
7472 }
7473
7474
7475 /* Extract an annotation datum from a composition starting at POS and
7476    ending before LIMIT of CODING->src_object (buffer or string), store
7477    the data in BUF, set *STOP to a starting position of the next
7478    composition (if any) or to LIMIT, and return the address of the
7479    next element of BUF.
7480
7481    If such an annotation is not found, set *STOP to a starting
7482    position of a composition after POS (if any) or to LIMIT, and
7483    return BUF.  */
7484
7485 static int *
7486 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7487                                struct coding_system *coding, int *buf,
7488                                ptrdiff_t *stop)
7489 {
7490   ptrdiff_t start, end;
7491   Lisp_Object prop;
7492
7493   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7494       || end > limit)
7495     *stop = limit;
7496   else if (start > pos)
7497     *stop = start;
7498   else
7499     {
7500       if (start == pos)
7501         {
7502           /* We found a composition.  Store the corresponding
7503              annotation data in BUF.  */
7504           int *head = buf;
7505           enum composition_method method = composition_method (prop);
7506           int nchars = COMPOSITION_LENGTH (prop);
7507
7508           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7509           if (method != COMPOSITION_RELATIVE)
7510             {
7511               Lisp_Object components;
7512               ptrdiff_t i, len, i_byte;
7513
7514               components = COMPOSITION_COMPONENTS (prop);
7515               if (VECTORP (components))
7516                 {
7517                   len = ASIZE (components);
7518                   for (i = 0; i < len; i++)
7519                     *buf++ = XINT (AREF (components, i));
7520                 }
7521               else if (STRINGP (components))
7522                 {
7523                   len = SCHARS (components);
7524                   i = i_byte = 0;
7525                   while (i < len)
7526                     {
7527                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7528                       buf++;
7529                     }
7530                 }
7531               else if (INTEGERP (components))
7532                 {
7533                   len = 1;
7534                   *buf++ = XINT (components);
7535                 }
7536               else if (CONSP (components))
7537                 {
7538                   for (len = 0; CONSP (components);
7539                        len++, components = XCDR (components))
7540                     *buf++ = XINT (XCAR (components));
7541                 }
7542               else
7543                 emacs_abort ();
7544               *head -= len;
7545             }
7546         }
7547
7548       if (find_composition (end, limit, &start, &end, &prop,
7549                             coding->src_object)
7550           && end <= limit)
7551         *stop = start;
7552       else
7553         *stop = limit;
7554     }
7555   return buf;
7556 }
7557
7558
7559 /* Extract an annotation datum from a text property `charset' at POS of
7560    CODING->src_object (buffer of string), store the data in BUF, set
7561    *STOP to the position where the value of `charset' property changes
7562    (limiting by LIMIT), and return the address of the next element of
7563    BUF.
7564
7565    If the property value is nil, set *STOP to the position where the
7566    property value is non-nil (limiting by LIMIT), and return BUF.  */
7567
7568 static int *
7569 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7570                            struct coding_system *coding, int *buf,
7571                            ptrdiff_t *stop)
7572 {
7573   Lisp_Object val, next;
7574   int id;
7575
7576   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7577   if (! NILP (val) && CHARSETP (val))
7578     id = XINT (CHARSET_SYMBOL_ID (val));
7579   else
7580     id = -1;
7581   ADD_CHARSET_DATA (buf, 0, id);
7582   next = Fnext_single_property_change (make_number (pos), Qcharset,
7583                                        coding->src_object,
7584                                        make_number (limit));
7585   *stop = XINT (next);
7586   return buf;
7587 }
7588
7589
7590 static void
7591 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7592                int max_lookup)
7593 {
7594   int *buf = coding->charbuf;
7595   int *buf_end = coding->charbuf + coding->charbuf_size;
7596   const unsigned char *src = coding->source + coding->consumed;
7597   const unsigned char *src_end = coding->source + coding->src_bytes;
7598   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7599   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7600   bool multibytep = coding->src_multibyte;
7601   Lisp_Object eol_type;
7602   int c;
7603   ptrdiff_t stop, stop_composition, stop_charset;
7604   int *lookup_buf = NULL;
7605
7606   if (! NILP (translation_table))
7607     lookup_buf = alloca (sizeof (int) * max_lookup);
7608
7609   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7610   if (VECTORP (eol_type))
7611     eol_type = Qunix;
7612
7613   /* Note: composition handling is not yet implemented.  */
7614   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7615
7616   if (NILP (coding->src_object))
7617     stop = stop_composition = stop_charset = end_pos;
7618   else
7619     {
7620       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7621         stop = stop_composition = pos;
7622       else
7623         stop = stop_composition = end_pos;
7624       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7625         stop = stop_charset = pos;
7626       else
7627         stop_charset = end_pos;
7628     }
7629
7630   /* Compensate for CRLF and conversion.  */
7631   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7632   while (buf < buf_end)
7633     {
7634       Lisp_Object trans;
7635
7636       if (pos == stop)
7637         {
7638           if (pos == end_pos)
7639             break;
7640           if (pos == stop_composition)
7641             buf = handle_composition_annotation (pos, end_pos, coding,
7642                                                  buf, &stop_composition);
7643           if (pos == stop_charset)
7644             buf = handle_charset_annotation (pos, end_pos, coding,
7645                                              buf, &stop_charset);
7646           stop = (stop_composition < stop_charset
7647                   ? stop_composition : stop_charset);
7648         }
7649
7650       if (! multibytep)
7651         {
7652           int bytes;
7653
7654           if (coding->encoder == encode_coding_raw_text
7655               || coding->encoder == encode_coding_ccl)
7656             c = *src++, pos++;
7657           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7658             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7659           else
7660             c = BYTE8_TO_CHAR (*src), src++, pos++;
7661         }
7662       else
7663         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7664       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7665         c = '\n';
7666       if (! EQ (eol_type, Qunix))
7667         {
7668           if (c == '\n')
7669             {
7670               if (EQ (eol_type, Qdos))
7671                 *buf++ = '\r';
7672               else
7673                 c = '\r';
7674             }
7675         }
7676
7677       trans = Qnil;
7678       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7679       if (NILP (trans))
7680         *buf++ = c;
7681       else
7682         {
7683           ptrdiff_t from_nchars = 1, to_nchars = 1;
7684           int *lookup_buf_end;
7685           const unsigned char *p = src;
7686           int i;
7687
7688           lookup_buf[0] = c;
7689           for (i = 1; i < max_lookup && p < src_end; i++)
7690             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7691           lookup_buf_end = lookup_buf + i;
7692           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7693           if (INTEGERP (trans))
7694             c = XINT (trans);
7695           else if (CONSP (trans))
7696             {
7697               from_nchars = ASIZE (XCAR (trans));
7698               trans = XCDR (trans);
7699               if (INTEGERP (trans))
7700                 c = XINT (trans);
7701               else
7702                 {
7703                   to_nchars = ASIZE (trans);
7704                   if (buf_end - buf < to_nchars)
7705                     break;
7706                   c = XINT (AREF (trans, 0));
7707                 }
7708             }
7709           else
7710             break;
7711           *buf++ = c;
7712           for (i = 1; i < to_nchars; i++)
7713             *buf++ = XINT (AREF (trans, i));
7714           for (i = 1; i < from_nchars; i++, pos++)
7715             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7716         }
7717     }
7718
7719   coding->consumed = src - coding->source;
7720   coding->consumed_char = pos - coding->src_pos;
7721   coding->charbuf_used = buf - coding->charbuf;
7722   coding->chars_at_source = 0;
7723 }
7724
7725
7726 /* Encode the text at CODING->src_object into CODING->dst_object.
7727    CODING->src_object is a buffer or a string.
7728    CODING->dst_object is a buffer or nil.
7729
7730    If CODING->src_object is a buffer, it must be the current buffer.
7731    In this case, if CODING->src_pos is positive, it is a position of
7732    the source text in the buffer, otherwise. the source text is in the
7733    gap area of the buffer, and coding->src_pos specifies the offset of
7734    the text from GPT (which must be the same as PT).  If this is the
7735    same buffer as CODING->dst_object, CODING->src_pos must be
7736    negative and CODING should not have `pre-write-conversion'.
7737
7738    If CODING->src_object is a string, CODING should not have
7739    `pre-write-conversion'.
7740
7741    If CODING->dst_object is a buffer, the encoded data is inserted at
7742    the current point of that buffer.
7743
7744    If CODING->dst_object is nil, the encoded data is placed at the
7745    memory area specified by CODING->destination.  */
7746
7747 static void
7748 encode_coding (struct coding_system *coding)
7749 {
7750   Lisp_Object attrs;
7751   Lisp_Object translation_table;
7752   int max_lookup;
7753   struct ccl_spec cclspec;
7754
7755   USE_SAFE_ALLOCA;
7756
7757   attrs = CODING_ID_ATTRS (coding->id);
7758   if (coding->encoder == encode_coding_raw_text)
7759     translation_table = Qnil, max_lookup = 0;
7760   else
7761     translation_table = get_translation_table (attrs, 1, &max_lookup);
7762
7763   if (BUFFERP (coding->dst_object))
7764     {
7765       set_buffer_internal (XBUFFER (coding->dst_object));
7766       coding->dst_multibyte
7767         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7768     }
7769
7770   coding->consumed = coding->consumed_char = 0;
7771   coding->produced = coding->produced_char = 0;
7772   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7773   coding->errors = 0;
7774
7775   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7776
7777   if (coding->encoder == encode_coding_ccl)
7778     {
7779       coding->spec.ccl = &cclspec;
7780       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7781     }
7782   do {
7783     coding_set_source (coding);
7784     consume_chars (coding, translation_table, max_lookup);
7785     coding_set_destination (coding);
7786     (*(coding->encoder)) (coding);
7787   } while (coding->consumed_char < coding->src_chars);
7788
7789   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7790     insert_from_gap (coding->produced_char, coding->produced, 0);
7791
7792   SAFE_FREE ();
7793 }
7794
7795
/* Name (or base name) of work buffer for code conversion.
   make_conversion_work_buffer uses it directly for the reused work
   buffer and as the base for generating the names of the others.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  It has the
   name Vcode_conversion_workbuf_name.  code_conversion_restore does
   not kill it when the conversion is over, and
   make_conversion_work_buffer creates it again if it is found not to
   be live.  The other working buffers are killed after the use is
   finished, and their names are modified versions of
   Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  Set
   by make_conversion_work_buffer when it hands that buffer out, and
   cleared by code_conversion_restore.  */
static bool reused_workbuf_in_use;
7808
7809
7810 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7811    multibyteness of returning buffer.  */
7812
7813 static Lisp_Object
7814 make_conversion_work_buffer (bool multibyte)
7815 {
7816   Lisp_Object name, workbuf;
7817   struct buffer *current;
7818
7819   if (reused_workbuf_in_use)
7820     {
7821       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7822       workbuf = Fget_buffer_create (name);
7823     }
7824   else
7825     {
7826       reused_workbuf_in_use = 1;
7827       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7828         Vcode_conversion_reused_workbuf
7829           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7830       workbuf = Vcode_conversion_reused_workbuf;
7831     }
7832   current = current_buffer;
7833   set_buffer_internal (XBUFFER (workbuf));
7834   /* We can't allow modification hooks to run in the work buffer.  For
7835      instance, directory_files_internal assumes that file decoding
7836      doesn't compile new regexps.  */
7837   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7838   Ferase_buffer ();
7839   bset_undo_list (current_buffer, Qt);
7840   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7841   set_buffer_internal (current);
7842   return workbuf;
7843 }
7844
7845
7846 static void
7847 code_conversion_restore (Lisp_Object arg)
7848 {
7849   Lisp_Object current, workbuf;
7850   struct gcpro gcpro1;
7851
7852   GCPRO1 (arg);
7853   current = XCAR (arg);
7854   workbuf = XCDR (arg);
7855   if (! NILP (workbuf))
7856     {
7857       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7858         reused_workbuf_in_use = 0;
7859       else
7860         Fkill_buffer (workbuf);
7861     }
7862   set_buffer_internal (XBUFFER (current));
7863   UNGCPRO;
7864 }
7865
7866 Lisp_Object
7867 code_conversion_save (bool with_work_buf, bool multibyte)
7868 {
7869   Lisp_Object workbuf = Qnil;
7870
7871   if (with_work_buf)
7872     workbuf = make_conversion_work_buffer (multibyte);
7873   record_unwind_protect (code_conversion_restore,
7874                          Fcons (Fcurrent_buffer (), workbuf));
7875   return workbuf;
7876 }
7877
7878 void
7879 decode_coding_gap (struct coding_system *coding,
7880                    ptrdiff_t chars, ptrdiff_t bytes)
7881 {
7882   dynwind_begin ();
7883   Lisp_Object attrs;
7884
7885   coding->src_object = Fcurrent_buffer ();
7886   coding->src_chars = chars;
7887   coding->src_bytes = bytes;
7888   coding->src_pos = -chars;
7889   coding->src_pos_byte = -bytes;
7890   coding->src_multibyte = chars < bytes;
7891   coding->dst_object = coding->src_object;
7892   coding->dst_pos = PT;
7893   coding->dst_pos_byte = PT_BYTE;
7894   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7895
7896   coding->head_ascii = -1;
7897   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7898   coding->eol_seen = EOL_SEEN_NONE;
7899   if (CODING_REQUIRE_DETECTION (coding))
7900     detect_coding (coding);
7901   attrs = CODING_ID_ATTRS (coding->id);
7902   if (! disable_ascii_optimization
7903       && ! coding->src_multibyte
7904       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7905       && NILP (CODING_ATTR_POST_READ (attrs))
7906       && NILP (get_translation_table (attrs, 0, NULL)))
7907     {
7908       chars = coding->head_ascii;
7909       if (chars < 0)
7910         chars = check_ascii (coding);
7911       if (chars != bytes)
7912         {
7913           /* There exists a non-ASCII byte.  */
7914           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7915               && coding->detected_utf8_bytes == coding->src_bytes)
7916             {
7917               if (coding->detected_utf8_chars >= 0)
7918                 chars = coding->detected_utf8_chars;
7919               else
7920                 chars = check_utf_8 (coding);
7921               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7922                   && coding->head_ascii == 0
7923                   && coding->source[0] == UTF_8_BOM_1
7924                   && coding->source[1] == UTF_8_BOM_2
7925                   && coding->source[2] == UTF_8_BOM_3)
7926                 {
7927                   chars--;
7928                   bytes -= 3;
7929                   coding->src_bytes -= 3;
7930                 }
7931             }
7932           else
7933             chars = -1;
7934         }
7935       if (chars >= 0)
7936         {
7937           Lisp_Object eol_type;
7938
7939           eol_type = CODING_ID_EOL_TYPE (coding->id);
7940           if (VECTORP (eol_type))
7941             {
7942               if (coding->eol_seen != EOL_SEEN_NONE)
7943                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7944             }
7945           if (EQ (eol_type, Qmac))
7946             {
7947               unsigned char *src_end = GAP_END_ADDR;
7948               unsigned char *src = src_end - coding->src_bytes;
7949
7950               while (src < src_end)
7951                 {
7952                   if (*src++ == '\r')
7953                     src[-1] = '\n';
7954                 }
7955             }
7956           else if (EQ (eol_type, Qdos))
7957             {
7958               unsigned char *src = GAP_END_ADDR;
7959               unsigned char *src_beg = src - coding->src_bytes;
7960               unsigned char *dst = src;
7961               ptrdiff_t diff;
7962
7963               while (src_beg < src)
7964                 {
7965                   *--dst = *--src;
7966                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7967                     src--;
7968                 }
7969               diff = dst - src;
7970               bytes -= diff;
7971               chars -= diff;
7972             }
7973           coding->produced = bytes;
7974           coding->produced_char = chars;
7975           insert_from_gap (chars, bytes, 1);
7976           dynwind_end ();
7977           return;
7978         }
7979     }
7980   code_conversion_save (0, 0);
7981
7982   coding->mode |= CODING_MODE_LAST_BLOCK;
7983   current_buffer->text->inhibit_shrinking = 1;
7984   decode_coding (coding);
7985   current_buffer->text->inhibit_shrinking = 0;
7986
7987   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7988     {
7989       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7990       Lisp_Object val;
7991
7992       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7993       val = call1 (CODING_ATTR_POST_READ (attrs),
7994                    make_number (coding->produced_char));
7995       CHECK_NATNUM (val);
7996       coding->produced_char += Z - prev_Z;
7997       coding->produced += Z_BYTE - prev_Z_BYTE;
7998     }
7999
8000   dynwind_end ();
8001 }
8002
8003
8004 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8005    SRC_OBJECT into DST_OBJECT by coding context CODING.
8006
8007    SRC_OBJECT is a buffer, a string, or Qnil.
8008
8009    If it is a buffer, the text is at point of the buffer.  FROM and TO
8010    are positions in the buffer.
8011
8012    If it is a string, the text is at the beginning of the string.
8013    FROM and TO are indices to the string.
8014
8015    If it is nil, the text is at coding->source.  FROM and TO are
8016    indices to coding->source.
8017
8018    DST_OBJECT is a buffer, Qt, or Qnil.
8019
8020    If it is a buffer, the decoded text is inserted at point of the
8021    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8022    is deleted.
8023
8024    If it is Qt, a string is made from the decoded text, and
8025    set in CODING->dst_object.
8026
8027    If it is Qnil, the decoded text is stored at CODING->destination.
8028    The caller must allocate CODING->dst_bytes bytes at
8029    CODING->destination by xmalloc.  If the decoded text is longer than
8030    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8031  */
8032
8033 void
8034 decode_coding_object (struct coding_system *coding,
8035                       Lisp_Object src_object,
8036                       ptrdiff_t from, ptrdiff_t from_byte,
8037                       ptrdiff_t to, ptrdiff_t to_byte,
8038                       Lisp_Object dst_object)
8039 {
8040   dynwind_begin ();
8041   unsigned char *destination IF_LINT (= NULL);
8042   ptrdiff_t dst_bytes IF_LINT (= 0);
8043   ptrdiff_t chars = to - from;
8044   ptrdiff_t bytes = to_byte - from_byte;
8045   Lisp_Object attrs;
8046   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8047   bool need_marker_adjustment = 0;
8048   Lisp_Object old_deactivate_mark;
8049
8050   old_deactivate_mark = Vdeactivate_mark;
8051
8052   if (NILP (dst_object))
8053     {
8054       destination = coding->destination;
8055       dst_bytes = coding->dst_bytes;
8056     }
8057
8058   coding->src_object = src_object;
8059   coding->src_chars = chars;
8060   coding->src_bytes = bytes;
8061   coding->src_multibyte = chars < bytes;
8062
8063   if (STRINGP (src_object))
8064     {
8065       coding->src_pos = from;
8066       coding->src_pos_byte = from_byte;
8067     }
8068   else if (BUFFERP (src_object))
8069     {
8070       set_buffer_internal (XBUFFER (src_object));
8071       if (from != GPT)
8072         move_gap_both (from, from_byte);
8073       if (EQ (src_object, dst_object))
8074         {
8075           struct Lisp_Marker *tail;
8076
8077           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8078             {
8079               tail->need_adjustment
8080                 = tail->charpos == (tail->insertion_type ? from : to);
8081               need_marker_adjustment |= tail->need_adjustment;
8082             }
8083           saved_pt = PT, saved_pt_byte = PT_BYTE;
8084           TEMP_SET_PT_BOTH (from, from_byte);
8085           current_buffer->text->inhibit_shrinking = 1;
8086           del_range_both (from, from_byte, to, to_byte, 1);
8087           coding->src_pos = -chars;
8088           coding->src_pos_byte = -bytes;
8089         }
8090       else
8091         {
8092           coding->src_pos = from;
8093           coding->src_pos_byte = from_byte;
8094         }
8095     }
8096
8097   if (CODING_REQUIRE_DETECTION (coding))
8098     detect_coding (coding);
8099   attrs = CODING_ID_ATTRS (coding->id);
8100
8101   if (EQ (dst_object, Qt)
8102       || (! NILP (CODING_ATTR_POST_READ (attrs))
8103           && NILP (dst_object)))
8104     {
8105       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8106       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8107       coding->dst_pos = BEG;
8108       coding->dst_pos_byte = BEG_BYTE;
8109     }
8110   else if (BUFFERP (dst_object))
8111     {
8112       code_conversion_save (0, 0);
8113       coding->dst_object = dst_object;
8114       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8115       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8116       coding->dst_multibyte
8117         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8118     }
8119   else
8120     {
8121       code_conversion_save (0, 0);
8122       coding->dst_object = Qnil;
8123       /* Most callers presume this will return a multibyte result, and they
8124          won't use `binary' or `raw-text' anyway, so let's not worry about
8125          CODING_FOR_UNIBYTE.  */
8126       coding->dst_multibyte = 1;
8127     }
8128
8129   decode_coding (coding);
8130
8131   if (BUFFERP (coding->dst_object))
8132     set_buffer_internal (XBUFFER (coding->dst_object));
8133
8134   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8135     {
8136       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8137       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8138       Lisp_Object val;
8139
8140       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8141       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8142               old_deactivate_mark);
8143       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8144                         make_number (coding->produced_char));
8145       UNGCPRO;
8146       CHECK_NATNUM (val);
8147       coding->produced_char += Z - prev_Z;
8148       coding->produced += Z_BYTE - prev_Z_BYTE;
8149     }
8150
8151   if (EQ (dst_object, Qt))
8152     {
8153       coding->dst_object = Fbuffer_string ();
8154     }
8155   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8156     {
8157       set_buffer_internal (XBUFFER (coding->dst_object));
8158       if (dst_bytes < coding->produced)
8159         {
8160           eassert (coding->produced > 0);
8161           destination = xrealloc (destination, coding->produced);
8162           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8163             move_gap_both (BEGV, BEGV_BYTE);
8164           memcpy (destination, BEGV_ADDR, coding->produced);
8165           coding->destination = destination;
8166         }
8167     }
8168
8169   if (saved_pt >= 0)
8170     {
8171       /* This is the case of:
8172          (BUFFERP (src_object) && EQ (src_object, dst_object))
8173          As we have moved PT while replacing the original buffer
8174          contents, we must recover it now.  */
8175       set_buffer_internal (XBUFFER (src_object));
8176       current_buffer->text->inhibit_shrinking = 0;
8177       if (saved_pt < from)
8178         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8179       else if (saved_pt < from + chars)
8180         TEMP_SET_PT_BOTH (from, from_byte);
8181       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8182         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8183                           saved_pt_byte + (coding->produced - bytes));
8184       else
8185         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8186                           saved_pt_byte + (coding->produced - bytes));
8187
8188       if (need_marker_adjustment)
8189         {
8190           struct Lisp_Marker *tail;
8191
8192           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8193             if (tail->need_adjustment)
8194               {
8195                 tail->need_adjustment = 0;
8196                 if (tail->insertion_type)
8197                   {
8198                     tail->bytepos = from_byte;
8199                     tail->charpos = from;
8200                   }
8201                 else
8202                   {
8203                     tail->bytepos = from_byte + coding->produced;
8204                     tail->charpos
8205                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8206                          ? tail->bytepos : from + coding->produced_char);
8207                   }
8208               }
8209         }
8210     }
8211
8212   Vdeactivate_mark = old_deactivate_mark;
8213   dynwind_end ();
8214 }
8215
8216
8217 void
8218 encode_coding_object (struct coding_system *coding,
8219                       Lisp_Object src_object,
8220                       ptrdiff_t from, ptrdiff_t from_byte,
8221                       ptrdiff_t to, ptrdiff_t to_byte,
8222                       Lisp_Object dst_object)
8223 {
8224   dynwind_begin ();
8225   ptrdiff_t chars = to - from;
8226   ptrdiff_t bytes = to_byte - from_byte;
8227   Lisp_Object attrs;
8228   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8229   bool need_marker_adjustment = 0;
8230   bool kill_src_buffer = 0;
8231   Lisp_Object old_deactivate_mark;
8232
8233   old_deactivate_mark = Vdeactivate_mark;
8234
8235   coding->src_object = src_object;
8236   coding->src_chars = chars;
8237   coding->src_bytes = bytes;
8238   coding->src_multibyte = chars < bytes;
8239
8240   attrs = CODING_ID_ATTRS (coding->id);
8241
8242   if (EQ (src_object, dst_object))
8243     {
8244       struct Lisp_Marker *tail;
8245
8246       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8247         {
8248           tail->need_adjustment
8249             = tail->charpos == (tail->insertion_type ? from : to);
8250           need_marker_adjustment |= tail->need_adjustment;
8251         }
8252     }
8253
8254   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8255     {
8256       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8257       set_buffer_internal (XBUFFER (coding->src_object));
8258       if (STRINGP (src_object))
8259         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8260       else if (BUFFERP (src_object))
8261         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8262       else
8263         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8264
8265       if (EQ (src_object, dst_object))
8266         {
8267           set_buffer_internal (XBUFFER (src_object));
8268           saved_pt = PT, saved_pt_byte = PT_BYTE;
8269           del_range_both (from, from_byte, to, to_byte, 1);
8270           set_buffer_internal (XBUFFER (coding->src_object));
8271         }
8272
8273       {
8274         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8275
8276         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8277                 old_deactivate_mark);
8278         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8279                     make_number (BEG), make_number (Z));
8280         UNGCPRO;
8281       }
8282       if (XBUFFER (coding->src_object) != current_buffer)
8283         kill_src_buffer = 1;
8284       coding->src_object = Fcurrent_buffer ();
8285       if (BEG != GPT)
8286         move_gap_both (BEG, BEG_BYTE);
8287       coding->src_chars = Z - BEG;
8288       coding->src_bytes = Z_BYTE - BEG_BYTE;
8289       coding->src_pos = BEG;
8290       coding->src_pos_byte = BEG_BYTE;
8291       coding->src_multibyte = Z < Z_BYTE;
8292     }
8293   else if (STRINGP (src_object))
8294     {
8295       code_conversion_save (0, 0);
8296       coding->src_pos = from;
8297       coding->src_pos_byte = from_byte;
8298     }
8299   else if (BUFFERP (src_object))
8300     {
8301       code_conversion_save (0, 0);
8302       set_buffer_internal (XBUFFER (src_object));
8303       if (EQ (src_object, dst_object))
8304         {
8305           saved_pt = PT, saved_pt_byte = PT_BYTE;
8306           coding->src_object = del_range_1 (from, to, 1, 1);
8307           coding->src_pos = 0;
8308           coding->src_pos_byte = 0;
8309         }
8310       else
8311         {
8312           if (from < GPT && to >= GPT)
8313             move_gap_both (from, from_byte);
8314           coding->src_pos = from;
8315           coding->src_pos_byte = from_byte;
8316         }
8317     }
8318   else
8319     code_conversion_save (0, 0);
8320
8321   if (BUFFERP (dst_object))
8322     {
8323       coding->dst_object = dst_object;
8324       if (EQ (src_object, dst_object))
8325         {
8326           coding->dst_pos = from;
8327           coding->dst_pos_byte = from_byte;
8328         }
8329       else
8330         {
8331           struct buffer *current = current_buffer;
8332
8333           set_buffer_temp (XBUFFER (dst_object));
8334           coding->dst_pos = PT;
8335           coding->dst_pos_byte = PT_BYTE;
8336           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8337           set_buffer_temp (current);
8338         }
8339       coding->dst_multibyte
8340         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8341     }
8342   else if (EQ (dst_object, Qt))
8343     {
8344       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8345       coding->dst_object = Qnil;
8346       coding->destination = xmalloc_atomic (dst_bytes);
8347       coding->dst_bytes = dst_bytes;
8348       coding->dst_multibyte = 0;
8349     }
8350   else
8351     {
8352       coding->dst_object = Qnil;
8353       coding->dst_multibyte = 0;
8354     }
8355
8356   encode_coding (coding);
8357
8358   if (EQ (dst_object, Qt))
8359     {
8360       if (BUFFERP (coding->dst_object))
8361         coding->dst_object = Fbuffer_string ();
8362       else if (coding->raw_destination)
8363         /* This is used to avoid creating huge Lisp string.
8364            NOTE: caller who sets `raw_destination' is also
8365            responsible for freeing `destination' buffer.  */
8366         coding->dst_object = Qnil;
8367       else
8368         {
8369           coding->dst_object
8370             = make_unibyte_string ((char *) coding->destination,
8371                                    coding->produced);
8372           xfree (coding->destination);
8373         }
8374     }
8375
8376   if (saved_pt >= 0)
8377     {
8378       /* This is the case of:
8379          (BUFFERP (src_object) && EQ (src_object, dst_object))
8380          As we have moved PT while replacing the original buffer
8381          contents, we must recover it now.  */
8382       set_buffer_internal (XBUFFER (src_object));
8383       if (saved_pt < from)
8384         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8385       else if (saved_pt < from + chars)
8386         TEMP_SET_PT_BOTH (from, from_byte);
8387       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8388         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8389                           saved_pt_byte + (coding->produced - bytes));
8390       else
8391         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8392                           saved_pt_byte + (coding->produced - bytes));
8393
8394       if (need_marker_adjustment)
8395         {
8396           struct Lisp_Marker *tail;
8397
8398           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8399             if (tail->need_adjustment)
8400               {
8401                 tail->need_adjustment = 0;
8402                 if (tail->insertion_type)
8403                   {
8404                     tail->bytepos = from_byte;
8405                     tail->charpos = from;
8406                   }
8407                 else
8408                   {
8409                     tail->bytepos = from_byte + coding->produced;
8410                     tail->charpos
8411                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8412                          ? tail->bytepos : from + coding->produced_char);
8413                   }
8414               }
8415         }
8416     }
8417
8418   if (kill_src_buffer)
8419     Fkill_buffer (coding->src_object);
8420
8421   Vdeactivate_mark = old_deactivate_mark;
8422   dynwind_end ();
8423 }
8424
8425
8426 Lisp_Object
8427 preferred_coding_system (void)
8428 {
8429   int id = coding_categories[coding_priorities[0]].id;
8430
8431   return CODING_ID_NAME (id);
8432 }
8433
8434 #if defined (WINDOWSNT) || defined (CYGWIN)
8435
8436 Lisp_Object
8437 from_unicode (Lisp_Object str)
8438 {
8439   CHECK_STRING (str);
8440   if (!STRING_MULTIBYTE (str) &&
8441       SBYTES (str) & 1)
8442     {
8443       str = Fsubstring (str, make_number (0), make_number (-1));
8444     }
8445
8446   return code_convert_string_norecord (str, Qutf_16le, 0);
8447 }
8448
8449 Lisp_Object
8450 from_unicode_buffer (const wchar_t *wstr)
8451 {
8452     return from_unicode (
8453         make_unibyte_string (
8454             (char *) wstr,
8455             /* we get one of the two final 0 bytes for free. */
8456             1 + sizeof (wchar_t) * wcslen (wstr)));
8457 }
8458
8459 wchar_t *
8460 to_unicode (Lisp_Object str, Lisp_Object *buf)
8461 {
8462   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8463   /* We need to make another copy (in addition to the one made by
8464      code_convert_string_norecord) to ensure that the final string is
8465      _doubly_ zero terminated --- that is, that the string is
8466      terminated by two zero bytes and one utf-16le null character.
8467      Because strings are already terminated with a single zero byte,
8468      we just add one additional zero. */
8469   str = make_uninit_string (SBYTES (*buf) + 1);
8470   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8471   SDATA (str) [SBYTES (*buf)] = '\0';
8472   *buf = str;
8473   return WCSDATA (*buf);
8474 }
8475
8476 #endif /* WINDOWSNT || CYGWIN */
8477
8478 \f
8479 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
8481
8482 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8483        doc: /* Return t if OBJECT is nil or a coding-system.
8484 See the documentation of `define-coding-system' for information
8485 about coding-system objects.  */)
8486   (Lisp_Object object)
8487 {
8488   if (NILP (object)
8489       || CODING_SYSTEM_ID (object) >= 0)
8490     return Qt;
8491   if (! SYMBOLP (object)
8492       || NILP (Fget (object, Qcoding_system_define_form)))
8493     return Qnil;
8494   return Qt;
8495 }
8496
8497 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8498        Sread_non_nil_coding_system, 1, 1, 0,
8499        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8500   (Lisp_Object prompt)
8501 {
8502   Lisp_Object val;
8503   do
8504     {
8505       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8506                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8507     }
8508   while (SCHARS (val) == 0);
8509   return (Fintern (val, Qnil));
8510 }
8511
8512 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8513        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8514 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8515 Ignores case when completing coding systems (all Emacs coding systems
8516 are lower-case).  */)
8517   (Lisp_Object prompt, Lisp_Object default_coding_system)
8518 {
8519   Lisp_Object val;
8520   dynwind_begin ();
8521
8522   if (SYMBOLP (default_coding_system))
8523     default_coding_system = SYMBOL_NAME (default_coding_system);
8524   specbind (Qcompletion_ignore_case, Qt);
8525   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8526                           Qt, Qnil, Qcoding_system_history,
8527                           default_coding_system, Qnil);
8528   dynwind_end ();
8529   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8530 }
8531
8532 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8533        1, 1, 0,
8534        doc: /* Check validity of CODING-SYSTEM.
8535 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8536 It is valid if it is nil or a symbol defined as a coding system by the
8537 function `define-coding-system'.  */)
8538   (Lisp_Object coding_system)
8539 {
8540   Lisp_Object define_form;
8541
8542   define_form = Fget (coding_system, Qcoding_system_define_form);
8543   if (! NILP (define_form))
8544     {
8545       Fput (coding_system, Qcoding_system_define_form, Qnil);
8546       safe_eval (define_form);
8547     }
8548   if (!NILP (Fcoding_system_p (coding_system)))
8549     return coding_system;
8550   xsignal1 (Qcoding_system_error, coding_system);
8551 }
8552
8553 \f
8554 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8555    HIGHEST, return the coding system of the highest
8556    priority among the detected coding systems.  Otherwise return a
8557    list of detected coding systems sorted by their priorities.  If
8558    MULTIBYTEP, it is assumed that the bytes are in correct
8559    multibyte form but contains only ASCII and eight-bit chars.
8560    Otherwise, the bytes are raw bytes.
8561
8562    CODING-SYSTEM controls the detection as below:
8563
8564    If it is nil, detect both text-format and eol-format.  If the
8565    text-format part of CODING-SYSTEM is already specified
8566    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8567    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8568    detect only text-format.  */
8569
8570 Lisp_Object
8571 detect_coding_system (const unsigned char *src,
8572                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8573                       bool highest, bool multibytep,
8574                       Lisp_Object coding_system)
8575 {
8576   const unsigned char *src_end = src + src_bytes;
8577   Lisp_Object attrs, eol_type;
8578   Lisp_Object val = Qnil;
8579   struct coding_system coding;
8580   ptrdiff_t id;
8581   struct coding_detection_info detect_info;
8582   enum coding_category base_category;
8583   bool null_byte_found = 0, eight_bit_found = 0;
8584
8585   if (NILP (coding_system))
8586     coding_system = Qundecided;
8587   setup_coding_system (coding_system, &coding);
8588   attrs = CODING_ID_ATTRS (coding.id);
8589   eol_type = CODING_ID_EOL_TYPE (coding.id);
8590   coding_system = CODING_ATTR_BASE_NAME (attrs);
8591
8592   coding.source = src;
8593   coding.src_chars = src_chars;
8594   coding.src_bytes = src_bytes;
8595   coding.src_multibyte = multibytep;
8596   coding.consumed = 0;
8597   coding.mode |= CODING_MODE_LAST_BLOCK;
8598   coding.head_ascii = 0;
8599
8600   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8601
8602   /* At first, detect text-format if necessary.  */
8603   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8604   if (base_category == coding_category_undecided)
8605     {
8606       enum coding_category category IF_LINT (= 0);
8607       struct coding_system *this IF_LINT (= NULL);
8608       int c, i;
8609       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8610                                        inhibit_null_byte_detection);
8611       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8612                                        inhibit_iso_escape_detection);
8613       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8614
8615       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8616       for (; src < src_end; src++)
8617         {
8618           c = *src;
8619           if (c & 0x80)
8620             {
8621               eight_bit_found = 1;
8622               if (null_byte_found)
8623                 break;
8624             }
8625           else if (c < 0x20)
8626             {
8627               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8628                   && ! inhibit_ied
8629                   && ! detect_info.checked)
8630                 {
8631                   if (detect_coding_iso_2022 (&coding, &detect_info))
8632                     {
8633                       /* We have scanned the whole data.  */
8634                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8635                         {
8636                           /* We didn't find an 8-bit code.  We may
8637                              have found a null-byte, but it's very
8638                              rare that a binary file confirm to
8639                              ISO-2022.  */
8640                           src = src_end;
8641                           coding.head_ascii = src - coding.source;
8642                         }
8643                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8644                       break;
8645                     }
8646                 }
8647               else if (! c && !inhibit_nbd)
8648                 {
8649                   null_byte_found = 1;
8650                   if (eight_bit_found)
8651                     break;
8652                 }
8653               if (! eight_bit_found)
8654                 coding.head_ascii++;
8655             }
8656           else if (! eight_bit_found)
8657             coding.head_ascii++;
8658         }
8659
8660       if (null_byte_found || eight_bit_found
8661           || coding.head_ascii < coding.src_bytes
8662           || detect_info.found)
8663         {
8664           if (coding.head_ascii == coding.src_bytes)
8665             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8666             for (i = 0; i < coding_category_raw_text; i++)
8667               {
8668                 category = coding_priorities[i];
8669                 this = coding_categories + category;
8670                 if (detect_info.found & (1 << category))
8671                   break;
8672               }
8673           else
8674             {
8675               if (null_byte_found)
8676                 {
8677                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8678                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8679                 }
8680               else if (prefer_utf_8
8681                        && detect_coding_utf_8 (&coding, &detect_info))
8682                 {
8683                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8684                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8685                 }
8686               for (i = 0; i < coding_category_raw_text; i++)
8687                 {
8688                   category = coding_priorities[i];
8689                   this = coding_categories + category;
8690
8691                   if (this->id < 0)
8692                     {
8693                       /* No coding system of this category is defined.  */
8694                       detect_info.rejected |= (1 << category);
8695                     }
8696                   else if (category >= coding_category_raw_text)
8697                     continue;
8698                   else if (detect_info.checked & (1 << category))
8699                     {
8700                       if (highest
8701                           && (detect_info.found & (1 << category)))
8702                         break;
8703                     }
8704                   else if ((*(this->detector)) (&coding, &detect_info)
8705                            && highest
8706                            && (detect_info.found & (1 << category)))
8707                     {
8708                       if (category == coding_category_utf_16_auto)
8709                         {
8710                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8711                             category = coding_category_utf_16_le;
8712                           else
8713                             category = coding_category_utf_16_be;
8714                         }
8715                       break;
8716                     }
8717                 }
8718             }
8719         }
8720
8721       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8722           || null_byte_found)
8723         {
8724           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8725           id = CODING_SYSTEM_ID (Qno_conversion);
8726           val = list1 (make_number (id));
8727         }
8728       else if (! detect_info.rejected && ! detect_info.found)
8729         {
8730           detect_info.found = CATEGORY_MASK_ANY;
8731           id = coding_categories[coding_category_undecided].id;
8732           val = list1 (make_number (id));
8733         }
8734       else if (highest)
8735         {
8736           if (detect_info.found)
8737             {
8738               detect_info.found = 1 << category;
8739               val = list1 (make_number (this->id));
8740             }
8741           else
8742             for (i = 0; i < coding_category_raw_text; i++)
8743               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8744                 {
8745                   detect_info.found = 1 << coding_priorities[i];
8746                   id = coding_categories[coding_priorities[i]].id;
8747                   val = list1 (make_number (id));
8748                   break;
8749                 }
8750         }
8751       else
8752         {
8753           int mask = detect_info.rejected | detect_info.found;
8754           int found = 0;
8755
8756           for (i = coding_category_raw_text - 1; i >= 0; i--)
8757             {
8758               category = coding_priorities[i];
8759               if (! (mask & (1 << category)))
8760                 {
8761                   found |= 1 << category;
8762                   id = coding_categories[category].id;
8763                   if (id >= 0)
8764                     val = list1 (make_number (id));
8765                 }
8766             }
8767           for (i = coding_category_raw_text - 1; i >= 0; i--)
8768             {
8769               category = coding_priorities[i];
8770               if (detect_info.found & (1 << category))
8771                 {
8772                   id = coding_categories[category].id;
8773                   val = Fcons (make_number (id), val);
8774                 }
8775             }
8776           detect_info.found |= found;
8777         }
8778     }
8779   else if (base_category == coding_category_utf_8_auto)
8780     {
8781       if (detect_coding_utf_8 (&coding, &detect_info))
8782         {
8783           struct coding_system *this;
8784
8785           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8786             this = coding_categories + coding_category_utf_8_sig;
8787           else
8788             this = coding_categories + coding_category_utf_8_nosig;
8789           val = list1 (make_number (this->id));
8790         }
8791     }
8792   else if (base_category == coding_category_utf_16_auto)
8793     {
8794       if (detect_coding_utf_16 (&coding, &detect_info))
8795         {
8796           struct coding_system *this;
8797
8798           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8799             this = coding_categories + coding_category_utf_16_le;
8800           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8801             this = coding_categories + coding_category_utf_16_be;
8802           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8803             this = coding_categories + coding_category_utf_16_be_nosig;
8804           else
8805             this = coding_categories + coding_category_utf_16_le_nosig;
8806           val = list1 (make_number (this->id));
8807         }
8808     }
8809   else
8810     {
8811       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8812       val = list1 (make_number (coding.id));
8813     }
8814
8815   /* Then, detect eol-format if necessary.  */
8816   {
8817     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8818     Lisp_Object tail;
8819
8820     if (VECTORP (eol_type))
8821       {
8822         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8823           {
8824             if (null_byte_found)
8825               normal_eol = EOL_SEEN_LF;
8826             else
8827               normal_eol = detect_eol (coding.source, src_bytes,
8828                                        coding_category_raw_text);
8829           }
8830         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8831                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8832           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8833                                       coding_category_utf_16_be);
8834         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8835                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8836           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8837                                       coding_category_utf_16_le);
8838       }
8839     else
8840       {
8841         if (EQ (eol_type, Qunix))
8842           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8843         else if (EQ (eol_type, Qdos))
8844           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8845         else
8846           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8847       }
8848
8849     for (tail = val; CONSP (tail); tail = XCDR (tail))
8850       {
8851         enum coding_category category;
8852         int this_eol;
8853
8854         id = XINT (XCAR (tail));
8855         attrs = CODING_ID_ATTRS (id);
8856         category = XINT (CODING_ATTR_CATEGORY (attrs));
8857         eol_type = CODING_ID_EOL_TYPE (id);
8858         if (VECTORP (eol_type))
8859           {
8860             if (category == coding_category_utf_16_be
8861                 || category == coding_category_utf_16_be_nosig)
8862               this_eol = utf_16_be_eol;
8863             else if (category == coding_category_utf_16_le
8864                      || category == coding_category_utf_16_le_nosig)
8865               this_eol = utf_16_le_eol;
8866             else
8867               this_eol = normal_eol;
8868
8869             if (this_eol == EOL_SEEN_LF)
8870               XSETCAR (tail, AREF (eol_type, 0));
8871             else if (this_eol == EOL_SEEN_CRLF)
8872               XSETCAR (tail, AREF (eol_type, 1));
8873             else if (this_eol == EOL_SEEN_CR)
8874               XSETCAR (tail, AREF (eol_type, 2));
8875             else
8876               XSETCAR (tail, CODING_ID_NAME (id));
8877           }
8878         else
8879           XSETCAR (tail, CODING_ID_NAME (id));
8880       }
8881   }
8882
8883   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8884 }
8885
8886
8887 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8888        2, 3, 0,
8889        doc: /* Detect coding system of the text in the region between START and END.
8890 Return a list of possible coding systems ordered by priority.
8891 The coding systems to try and their priorities follows what
8892 the function `coding-system-priority-list' (which see) returns.
8893
8894 If only ASCII characters are found (except for such ISO-2022 control
8895 characters as ESC), it returns a list of single element `undecided'
8896 or its subsidiary coding system according to a detected end-of-line
8897 format.
8898
8899 If optional argument HIGHEST is non-nil, return the coding system of
8900 highest priority.  */)
8901   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8902 {
8903   ptrdiff_t from, to;
8904   ptrdiff_t from_byte, to_byte;
8905
8906   validate_region (&start, &end);
8907   from = XINT (start), to = XINT (end);
8908   from_byte = CHAR_TO_BYTE (from);
8909   to_byte = CHAR_TO_BYTE (to);
8910
8911   if (from < GPT && to >= GPT)
8912     move_gap_both (to, to_byte);
8913
8914   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8915                                to - from, to_byte - from_byte,
8916                                !NILP (highest),
8917                                !NILP (BVAR (current_buffer
8918                                       , enable_multibyte_characters)),
8919                                Qnil);
8920 }
8921
8922 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8923        1, 2, 0,
8924        doc: /* Detect coding system of the text in STRING.
8925 Return a list of possible coding systems ordered by priority.
8926 The coding systems to try and their priorities follows what
8927 the function `coding-system-priority-list' (which see) returns.
8928
8929 If only ASCII characters are found (except for such ISO-2022 control
8930 characters as ESC), it returns a list of single element `undecided'
8931 or its subsidiary coding system according to a detected end-of-line
8932 format.
8933
8934 If optional argument HIGHEST is non-nil, return the coding system of
8935 highest priority.  */)
8936   (Lisp_Object string, Lisp_Object highest)
8937 {
8938   CHECK_STRING (string);
8939
8940   return detect_coding_system (SDATA (string),
8941                                SCHARS (string), SBYTES (string),
8942                                !NILP (highest), STRING_MULTIBYTE (string),
8943                                Qnil);
8944 }
8945
8946
8947 static bool
8948 char_encodable_p (int c, Lisp_Object attrs)
8949 {
8950   Lisp_Object tail;
8951   struct charset *charset;
8952   Lisp_Object translation_table;
8953
8954   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8955   if (! NILP (translation_table))
8956     c = translate_char (translation_table, c);
8957   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8958        CONSP (tail); tail = XCDR (tail))
8959     {
8960       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8961       if (CHAR_CHARSET_P (c, charset))
8962         break;
8963     }
8964   return (! NILP (tail));
8965 }
8966
8967
/* Return a list of coding systems that safely encode the text between
   START and END.  START may instead be a string, in which case the
   whole string is examined and END is not used.  If EXCLUDE is
   non-nil, it is a list of coding systems not to check.  The returned
   list doesn't contain any such coding systems.  In any case, if the
   text contains only ASCII or is unibyte, return t.  */

DEFUN ("find-coding-systems-region-internal",
       Ffind_coding_systems_region_internal,
       Sfind_coding_systems_region_internal, 2, 3, 0,
       doc: /* Internal use only.  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
{
  Lisp_Object coding_attrs_list, safe_codings;
  ptrdiff_t start_byte, end_byte;
  const unsigned char *p, *pbeg, *pend;
  int c;
  Lisp_Object tail, elt, work_table;

  if (STRINGP (start))
    {
      /* A unibyte string, or a multibyte one whose character and byte
         counts agree (i.e. pure ASCII), needs no checking.  */
      if (!STRING_MULTIBYTE (start)
          || SCHARS (start) == SBYTES (start))
        return Qt;
      start_byte = 0;
      end_byte = SBYTES (start);
    }
  else
    {
      CHECK_NUMBER_COERCE_MARKER (start);
      CHECK_NUMBER_COERCE_MARKER (end);
      if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
        args_out_of_range (start, end);
      if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
        return Qt;
      start_byte = CHAR_TO_BYTE (XINT (start));
      end_byte = CHAR_TO_BYTE (XINT (end));
      /* Same number of characters and bytes: the region is all ASCII.  */
      if (XINT (end) - XINT (start) == end_byte - start_byte)
        return Qt;

      /* The scan below walks raw bytes with a single pointer, so the
         gap must not lie strictly inside the region.  Move it to
         whichever end of the region is nearer.  */
      if (XINT (start) < GPT && XINT (end) > GPT)
        {
          if ((GPT - XINT (start)) < (XINT (end) - GPT))
            move_gap_both (XINT (start), start_byte);
          else
            move_gap_both (XINT (end), end_byte);
        }
    }

  /* Collect the attribute vectors of every coding system in
     Vcoding_system_list that is not in EXCLUDE and whose name is its
     own base name.  Each collected vector gets its translation-table
     slot filled from get_translation_table (second argument 1 --
     presumably "for encoding"; confirm against its definition).  */
  coding_attrs_list = Qnil;
  for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
    if (NILP (exclude)
        || NILP (Fmemq (XCAR (tail), exclude)))
      {
        Lisp_Object attrs;

        attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
        if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
          {
            ASET (attrs, coding_attr_trans_tbl,
                  get_translation_table (attrs, 1, NULL));
            coding_attrs_list = Fcons (attrs, coding_attrs_list);
          }
      }

  if (STRINGP (start))
    p = pbeg = SDATA (start);
  else
    p = pbeg = BYTE_POS_ADDR (start_byte);
  pend = p + (end_byte - start_byte);

  /* Trim the ASCII prefix and suffix; only the span in between can
     contain characters worth checking.  */
  while (p < pend && ASCII_CHAR_P (*p)) p++;
  while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;

  /* WORK_TABLE remembers which characters have already been tested so
     that each distinct character is checked only once.  */
  work_table = Fmake_char_table (Qnil, Qnil);
  while (p < pend)
    {
      if (ASCII_CHAR_P (*p))
        p++;
      else
        {
          c = STRING_CHAR_ADVANCE (p);
          if (!NILP (char_table_ref (work_table, c)))
            /* This character was already checked.  Ignore it.  */
            continue;

          charset_map_loaded = 0;
          /* Drop from CODING_ATTRS_LIST, in place, every entry that
             cannot encode C.  TAIL advances only when the current
             cell is kept (or is the last one).  */
          for (tail = coding_attrs_list; CONSP (tail);)
            {
              elt = XCAR (tail);
              if (NILP (elt))
                tail = XCDR (tail);
              else if (char_encodable_p (c, elt))
                tail = XCDR (tail);
              else if (CONSP (XCDR (tail)))
                {
                  /* Delete this cell by copying the next cell over
                     it; the copied element is examined next.  */
                  XSETCAR (tail, XCAR (XCDR (tail)));
                  XSETCDR (tail, XCDR (XCDR (tail)));
                }
              else
                {
                  /* Last cell: it cannot be spliced out, so blank it.  */
                  XSETCAR (tail, Qnil);
                  tail = XCDR (tail);
                }
            }
          if (charset_map_loaded)
            {
              /* charset_map_loaded became non-zero during the checks
                 above.  Recompute the scan pointers from their
                 offsets -- presumably because loading a charset map
                 can relocate the text; confirm.  */
              ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;

              if (STRINGP (start))
                pbeg = SDATA (start);
              else
                pbeg = BYTE_POS_ADDR (start_byte);
              p = pbeg + p_offset;
              pend = pbeg + pend_offset;
            }
          char_table_set (work_table, c, Qt);
        }
    }

  /* raw-text and no-conversion are always reported; the base name of
     each surviving (non-nil) entry is consed in front of them.  */
  safe_codings = list2 (Qraw_text, Qno_conversion);
  for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
    if (! NILP (XCAR (tail)))
      safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);

  return safe_codings;
}
9094
9095
9096 DEFUN ("unencodable-char-position", Funencodable_char_position,
9097        Sunencodable_char_position, 3, 5, 0,
9098        doc: /* Return position of first un-encodable character in a region.
9099 START and END specify the region and CODING-SYSTEM specifies the
9100 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9101
9102 If optional 4th argument COUNT is non-nil, it specifies at most how
9103 many un-encodable characters to search.  In this case, the value is a
9104 list of positions.
9105
9106 If optional 5th argument STRING is non-nil, it is a string to search
9107 for un-encodable characters.  In that case, START and END are indexes
9108 to the string and treated as in `substring'.  */)
9109   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9110    Lisp_Object count, Lisp_Object string)
9111 {
9112   EMACS_INT n;
9113   struct coding_system coding;
9114   Lisp_Object attrs, charset_list, translation_table;
9115   Lisp_Object positions;
9116   ptrdiff_t from, to;
9117   const unsigned char *p, *stop, *pend;
9118   bool ascii_compatible;
9119
9120   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9121   attrs = CODING_ID_ATTRS (coding.id);
9122   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9123     return Qnil;
9124   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9125   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9126   translation_table = get_translation_table (attrs, 1, NULL);
9127
9128   if (NILP (string))
9129     {
9130       validate_region (&start, &end);
9131       from = XINT (start);
9132       to = XINT (end);
9133       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9134           || (ascii_compatible
9135               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9136         return Qnil;
9137       p = CHAR_POS_ADDR (from);
9138       pend = CHAR_POS_ADDR (to);
9139       if (from < GPT && to >= GPT)
9140         stop = GPT_ADDR;
9141       else
9142         stop = pend;
9143     }
9144   else
9145     {
9146       CHECK_STRING (string);
9147       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9148       if (! STRING_MULTIBYTE (string))
9149         return Qnil;
9150       p = SDATA (string) + string_char_to_byte (string, from);
9151       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9152       if (ascii_compatible && (to - from) == (pend - p))
9153         return Qnil;
9154     }
9155
9156   if (NILP (count))
9157     n = 1;
9158   else
9159     {
9160       CHECK_NATNUM (count);
9161       n = XINT (count);
9162     }
9163
9164   positions = Qnil;
9165   charset_map_loaded = 0;
9166   while (1)
9167     {
9168       int c;
9169
9170       if (ascii_compatible)
9171         while (p < stop && ASCII_CHAR_P (*p))
9172           p++, from++;
9173       if (p >= stop)
9174         {
9175           if (p >= pend)
9176             break;
9177           stop = pend;
9178           p = GAP_END_ADDR;
9179         }
9180
9181       c = STRING_CHAR_ADVANCE (p);
9182       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9183           && ! char_charset (translate_char (translation_table, c),
9184                              charset_list, NULL))
9185         {
9186           positions = Fcons (make_number (from), positions);
9187           n--;
9188           if (n == 0)
9189             break;
9190         }
9191
9192       from++;
9193       if (charset_map_loaded && NILP (string))
9194         {
9195           p = CHAR_POS_ADDR (from);
9196           pend = CHAR_POS_ADDR (to);
9197           if (from < GPT && to >= GPT)
9198             stop = GPT_ADDR;
9199           else
9200             stop = pend;
9201           charset_map_loaded = 0;
9202         }
9203     }
9204
9205   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9206 }
9207
9208
9209 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9210        Scheck_coding_systems_region, 3, 3, 0,
9211        doc: /* Check if the region is encodable by coding systems.
9212
9213 START and END are buffer positions specifying the region.
9214 CODING-SYSTEM-LIST is a list of coding systems to check.
9215
9216 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9217 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9218 whole region, POS0, POS1, ... are buffer positions where non-encodable
9219 characters are found.
9220
9221 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9222 value is nil.
9223
9224 START may be a string.  In that case, check if the string is
9225 encodable, and the value contains indices to the string instead of
9226 buffer positions.  END is ignored.
9227
9228 If the current buffer (or START if it is a string) is unibyte, the value
9229 is nil.  */)
9230   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9231 {
9232   Lisp_Object list;
9233   ptrdiff_t start_byte, end_byte;
9234   ptrdiff_t pos;
9235   const unsigned char *p, *pbeg, *pend;
9236   int c;
9237   Lisp_Object tail, elt, attrs;
9238
9239   if (STRINGP (start))
9240     {
9241       if (!STRING_MULTIBYTE (start)
9242           || SCHARS (start) == SBYTES (start))
9243         return Qnil;
9244       start_byte = 0;
9245       end_byte = SBYTES (start);
9246       pos = 0;
9247     }
9248   else
9249     {
9250       CHECK_NUMBER_COERCE_MARKER (start);
9251       CHECK_NUMBER_COERCE_MARKER (end);
9252       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9253         args_out_of_range (start, end);
9254       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9255         return Qnil;
9256       start_byte = CHAR_TO_BYTE (XINT (start));
9257       end_byte = CHAR_TO_BYTE (XINT (end));
9258       if (XINT (end) - XINT (start) == end_byte - start_byte)
9259         return Qnil;
9260
9261       if (XINT (start) < GPT && XINT (end) > GPT)
9262         {
9263           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9264             move_gap_both (XINT (start), start_byte);
9265           else
9266             move_gap_both (XINT (end), end_byte);
9267         }
9268       pos = XINT (start);
9269     }
9270
9271   list = Qnil;
9272   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9273     {
9274       elt = XCAR (tail);
9275       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9276       ASET (attrs, coding_attr_trans_tbl,
9277             get_translation_table (attrs, 1, NULL));
9278       list = Fcons (list2 (elt, attrs), list);
9279     }
9280
9281   if (STRINGP (start))
9282     p = pbeg = SDATA (start);
9283   else
9284     p = pbeg = BYTE_POS_ADDR (start_byte);
9285   pend = p + (end_byte - start_byte);
9286
9287   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9288   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9289
9290   while (p < pend)
9291     {
9292       if (ASCII_CHAR_P (*p))
9293         p++;
9294       else
9295         {
9296           c = STRING_CHAR_ADVANCE (p);
9297
9298           charset_map_loaded = 0;
9299           for (tail = list; CONSP (tail); tail = XCDR (tail))
9300             {
9301               elt = XCDR (XCAR (tail));
9302               if (! char_encodable_p (c, XCAR (elt)))
9303                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9304             }
9305           if (charset_map_loaded)
9306             {
9307               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9308
9309               if (STRINGP (start))
9310                 pbeg = SDATA (start);
9311               else
9312                 pbeg = BYTE_POS_ADDR (start_byte);
9313               p = pbeg + p_offset;
9314               pend = pbeg + pend_offset;
9315             }
9316         }
9317       pos++;
9318     }
9319
9320   tail = list;
9321   list = Qnil;
9322   for (; CONSP (tail); tail = XCDR (tail))
9323     {
9324       elt = XCAR (tail);
9325       if (CONSP (XCDR (XCDR (elt))))
9326         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9327                       list);
9328     }
9329
9330   return list;
9331 }
9332
9333
9334 static Lisp_Object
9335 code_convert_region (Lisp_Object start, Lisp_Object end,
9336                      Lisp_Object coding_system, Lisp_Object dst_object,
9337                      bool encodep, bool norecord)
9338 {
9339   struct coding_system coding;
9340   ptrdiff_t from, from_byte, to, to_byte;
9341   Lisp_Object src_object;
9342
9343   if (NILP (coding_system))
9344     coding_system = Qno_conversion;
9345   else
9346     CHECK_CODING_SYSTEM (coding_system);
9347   src_object = Fcurrent_buffer ();
9348   if (NILP (dst_object))
9349     dst_object = src_object;
9350   else if (! EQ (dst_object, Qt))
9351     CHECK_BUFFER (dst_object);
9352
9353   validate_region (&start, &end);
9354   from = XFASTINT (start);
9355   from_byte = CHAR_TO_BYTE (from);
9356   to = XFASTINT (end);
9357   to_byte = CHAR_TO_BYTE (to);
9358
9359   setup_coding_system (coding_system, &coding);
9360   coding.mode |= CODING_MODE_LAST_BLOCK;
9361
9362   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9363     {
9364       struct buffer *buf = XBUFFER (dst_object);
9365       ptrdiff_t buf_pt = BUF_PT (buf);
9366
9367       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9368     }
9369
9370   if (encodep)
9371     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9372                           dst_object);
9373   else
9374     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9375                           dst_object);
9376   if (! norecord)
9377     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9378
9379   return (BUFFERP (dst_object)
9380           ? make_number (coding.produced_char)
9381           : coding.dst_object);
9382 }
9383
9384
9385 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9386        3, 4, "r\nzCoding system: ",
9387        doc: /* Decode the current region from the specified coding system.
9388 When called from a program, takes four arguments:
9389         START, END, CODING-SYSTEM, and DESTINATION.
9390 START and END are buffer positions.
9391
9392 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9393 If nil, the region between START and END is replaced by the decoded text.
9394 If buffer, the decoded text is inserted in that buffer after point (point
9395 does not move).
9396 In those cases, the length of the decoded text is returned.
9397 If DESTINATION is t, the decoded text is returned.
9398
9399 This function sets `last-coding-system-used' to the precise coding system
9400 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9401 not fully specified.)  */)
9402   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9403 {
9404   return code_convert_region (start, end, coding_system, destination, 0, 0);
9405 }
9406
9407 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9408        3, 4, "r\nzCoding system: ",
9409        doc: /* Encode the current region by specified coding system.
9410 When called from a program, takes four arguments:
9411         START, END, CODING-SYSTEM and DESTINATION.
9412 START and END are buffer positions.
9413
9414 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9415 If nil, the region between START and END is replace by the encoded text.
9416 If buffer, the encoded text is inserted in that buffer after point (point
9417 does not move).
9418 In those cases, the length of the encoded text is returned.
9419 If DESTINATION is t, the encoded text is returned.
9420
9421 This function sets `last-coding-system-used' to the precise coding system
9422 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9423 not fully specified.)  */)
9424   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9425 {
9426   return code_convert_region (start, end, coding_system, destination, 1, 0);
9427 }
9428
9429 Lisp_Object
9430 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9431                      Lisp_Object dst_object, bool encodep, bool nocopy,
9432                      bool norecord)
9433 {
9434   struct coding_system coding;
9435   ptrdiff_t chars, bytes;
9436
9437   CHECK_STRING (string);
9438   if (NILP (coding_system))
9439     {
9440       if (! norecord)
9441         Vlast_coding_system_used = Qno_conversion;
9442       if (NILP (dst_object))
9443         return (nocopy ? Fcopy_sequence (string) : string);
9444     }
9445
9446   if (NILP (coding_system))
9447     coding_system = Qno_conversion;
9448   else
9449     CHECK_CODING_SYSTEM (coding_system);
9450   if (NILP (dst_object))
9451     dst_object = Qt;
9452   else if (! EQ (dst_object, Qt))
9453     CHECK_BUFFER (dst_object);
9454
9455   setup_coding_system (coding_system, &coding);
9456   coding.mode |= CODING_MODE_LAST_BLOCK;
9457   chars = SCHARS (string);
9458   bytes = SBYTES (string);
9459
9460   if (BUFFERP (dst_object))
9461     {
9462       struct buffer *buf = XBUFFER (dst_object);
9463       ptrdiff_t buf_pt = BUF_PT (buf);
9464
9465       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9466     }
9467
9468   if (encodep)
9469     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9470   else
9471     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9472   if (! norecord)
9473     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9474
9475   return (BUFFERP (dst_object)
9476           ? make_number (coding.produced_char)
9477           : coding.dst_object);
9478 }
9479
9480
9481 /* Encode or decode STRING according to CODING_SYSTEM.
9482    Do not set Vlast_coding_system_used.
9483
9484    This function is called only from macros DECODE_FILE and
9485    ENCODE_FILE, thus we ignore character composition.  */
9486
9487 Lisp_Object
9488 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9489                               bool encodep)
9490 {
9491   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9492 }
9493
9494 /* Encode or decode a file name, to or from a unibyte string suitable
9495    for passing to C library functions.  */
9496 Lisp_Object
9497 decode_file_name (Lisp_Object fname)
9498 {
9499 #ifdef WINDOWSNT
9500   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9501      converts the file names either to UTF-16LE or to the system ANSI
9502      codepage internally, depending on the underlying OS; see w32.c.  */
9503   if (! NILP (Fcoding_system_p (Qutf_8)))
9504     return code_convert_string_norecord (fname, Qutf_8, 0);
9505   return fname;
9506 #else  /* !WINDOWSNT */
9507   if (! NILP (Vfile_name_coding_system))
9508     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9509   else if (! NILP (Vdefault_file_name_coding_system))
9510     return code_convert_string_norecord (fname,
9511                                          Vdefault_file_name_coding_system, 0);
9512   else
9513     return fname;
9514 #endif
9515 }
9516
9517 Lisp_Object
9518 encode_file_name (Lisp_Object fname)
9519 {
9520   /* This is especially important during bootstrap and dumping, when
9521      file-name encoding is not yet known, and therefore any non-ASCII
9522      file names are unibyte strings, and could only be thrashed if we
9523      try to encode them.  */
9524   if (!STRING_MULTIBYTE (fname))
9525     return fname;
9526 #ifdef WINDOWSNT
9527   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9528      converts the file names either to UTF-16LE or to the system ANSI
9529      codepage internally, depending on the underlying OS; see w32.c.  */
9530   if (! NILP (Fcoding_system_p (Qutf_8)))
9531     return code_convert_string_norecord (fname, Qutf_8, 1);
9532   return fname;
9533 #else  /* !WINDOWSNT */
9534   if (! NILP (Vfile_name_coding_system))
9535     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9536   else if (! NILP (Vdefault_file_name_coding_system))
9537     return code_convert_string_norecord (fname,
9538                                          Vdefault_file_name_coding_system, 1);
9539   else
9540     return fname;
9541 #endif
9542 }
9543
9544 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9545        2, 4, 0,
9546        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9547
9548 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9549 if the decoding operation is trivial.
9550
9551 Optional fourth arg BUFFER non-nil means that the decoded text is
9552 inserted in that buffer after point (point does not move).  In this
9553 case, the return value is the length of the decoded text.
9554
9555 This function sets `last-coding-system-used' to the precise coding system
9556 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9557 not fully specified.)  */)
9558   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9559 {
9560   return code_convert_string (string, coding_system, buffer,
9561                               0, ! NILP (nocopy), 0);
9562 }
9563
9564 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9565        2, 4, 0,
9566        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9567
9568 Optional third arg NOCOPY non-nil means it is OK to return STRING
9569 itself if the encoding operation is trivial.
9570
9571 Optional fourth arg BUFFER non-nil means that the encoded text is
9572 inserted in that buffer after point (point does not move).  In this
9573 case, the return value is the length of the encoded text.
9574
9575 This function sets `last-coding-system-used' to the precise coding system
9576 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9577 not fully specified.)  */)
9578   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9579 {
9580   return code_convert_string (string, coding_system, buffer,
9581                               1, ! NILP (nocopy), 0);
9582 }
9583
9584 \f
9585 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9586        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9587 Return the corresponding character.  */)
9588   (Lisp_Object code)
9589 {
9590   Lisp_Object spec, attrs, val;
9591   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9592   EMACS_INT ch;
9593   int c;
9594
9595   CHECK_NATNUM (code);
9596   ch = XFASTINT (code);
9597   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9598   attrs = AREF (spec, 0);
9599
9600   if (ASCII_CHAR_P (ch)
9601       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9602     return code;
9603
9604   val = CODING_ATTR_CHARSET_LIST (attrs);
9605   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9606   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9607   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9608
9609   if (ch <= 0x7F)
9610     {
9611       c = ch;
9612       charset = charset_roman;
9613     }
9614   else if (ch >= 0xA0 && ch < 0xDF)
9615     {
9616       c = ch - 0x80;
9617       charset = charset_kana;
9618     }
9619   else
9620     {
9621       EMACS_INT c1 = ch >> 8;
9622       int c2 = ch & 0xFF;
9623
9624       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9625           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9626         error ("Invalid code: %"pI"d", ch);
9627       c = ch;
9628       SJIS_TO_JIS (c);
9629       charset = charset_kanji;
9630     }
9631   c = DECODE_CHAR (charset, c);
9632   if (c < 0)
9633     error ("Invalid code: %"pI"d", ch);
9634   return make_number (c);
9635 }
9636
9637
9638 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9639        doc: /* Encode a Japanese character CH to shift_jis encoding.
9640 Return the corresponding code in SJIS.  */)
9641   (Lisp_Object ch)
9642 {
9643   Lisp_Object spec, attrs, charset_list;
9644   int c;
9645   struct charset *charset;
9646   unsigned code;
9647
9648   CHECK_CHARACTER (ch);
9649   c = XFASTINT (ch);
9650   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9651   attrs = AREF (spec, 0);
9652
9653   if (ASCII_CHAR_P (c)
9654       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9655     return ch;
9656
9657   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9658   charset = char_charset (c, charset_list, &code);
9659   if (code == CHARSET_INVALID_CODE (charset))
9660     error ("Can't encode by shift_jis encoding: %c", c);
9661   JIS_TO_SJIS (code);
9662
9663   return make_number (code);
9664 }
9665
9666 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9667        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9668 Return the corresponding character.  */)
9669   (Lisp_Object code)
9670 {
       /* CODE is a non-negative integer.  Values up to 0x7F are decoded
          through the first charset of `Vbig5_coding_system'; larger
          values are taken as (lead byte << 8 | trail byte) and decoded
          through the second charset.  An error is signaled when the
          bytes are out of range or the charset cannot decode them.  */
9671   Lisp_Object spec, attrs, val;
9672   struct charset *charset_roman, *charset_big5, *charset;
9673   EMACS_INT ch;
9674   int c;
9675
9676   CHECK_NATNUM (code);
9677   ch = XFASTINT (code);
9678   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9679   attrs = AREF (spec, 0);
9680
       /* An ASCII code is returned as is when the coding system's
          ascii-compat attribute is non-nil.  */
9681   if (ASCII_CHAR_P (ch)
9682       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9683     return code;
9684
       /* The charset list holds the one-byte charset first and the
          two-byte Big5 charset second.  */
9685   val = CODING_ATTR_CHARSET_LIST (attrs);
9686   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9687   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9688
9689   if (ch <= 0x7F)
9690     {
9691       c = ch;
9692       charset = charset_roman;
9693     }
9694   else
9695     {
9696       EMACS_INT b1 = ch >> 8;
           /* Keep all eight bits of the trail byte.  The range test
              below expects a full byte (0x40..0x7E or 0xA1..0xFE); a
              7-bit mask would fold 0xA1..0xBF onto 0x21..0x3F and make
              those valid trail bytes fail the test.  */
9697       int b2 = ch & 0xFF;
9698       if (b1 < 0xA1 || b1 > 0xFE
9699           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9700         error ("Invalid code: %"pI"d", ch);
9701       c = ch;
9702       charset = charset_big5;
9703     }
       /* A negative result means CHARSET has no character for the code.  */
9704   c = DECODE_CHAR (charset, c);
9705   if (c < 0)
9706     error ("Invalid code: %"pI"d", ch);
9707   return make_number (c);
9708 }
9709
9710 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9711        doc: /* Encode the Big5 character CH to BIG5 coding system.
9712 Return the corresponding character code in Big5.  */)
9713   (Lisp_Object ch)
9714 {
       /* Look CH up in the charsets of `Vbig5_coding_system' and return
          its code point as an integer; signal an error when the code
          obtained is the charset's invalid-code marker.  */
9715   Lisp_Object spec, attrs, charset_list;
9716   struct charset *charset;
9717   int c;
9718   unsigned code;
9719
9720   CHECK_CHARACTER (ch);
9721   c = XFASTINT (ch);
9722   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9723   attrs = AREF (spec, 0);
       /* An ASCII character is returned as is when the coding system's
          ascii-compat attribute is non-nil.  */
9724   if (ASCII_CHAR_P (c)
9725       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9726     return ch;
9727
       /* char_charset presumably picks a charset from CHARSET_LIST that
          contains C and stores C's code point in CODE.
          NOTE(review): if char_charset can return NULL (and leave CODE
          unset) when no listed charset contains C, the test below
          dereferences a null pointer -- confirm against charset.c.  */
9728   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9729   charset = char_charset (c, charset_list, &code);
9730   if (code == CHARSET_INVALID_CODE (charset))
9731     error ("Can't encode by Big5 encoding: %c", c);
9732
9733   return make_number (code);
9734 }
9735
9736 \f
9737 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9738        Sset_terminal_coding_system_internal, 1, 2, 0,
9739        doc: /* Internal use only.  */)
9740   (Lisp_Object coding_system, Lisp_Object terminal)
9741 {
       /* Install CODING_SYSTEM as the terminal coding system of
          TERMINAL and refresh that terminal's charset list to match.
          Always returns nil.  */
9742   struct terminal *term = get_terminal (terminal, 1);
9743   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9744   CHECK_SYMBOL (coding_system);
       /* Fcheck_coding_system validates the symbol; its value is what
          gets installed.  */
9745   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9746   /* We had better not send unsafe characters to terminal.  */
9747   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9748   /* Character composition should be disabled.  */
9749   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
       /* This coding system is used with a multibyte source and a
          unibyte destination.  */
9750   terminal_coding->src_multibyte = 1;
9751   terminal_coding->dst_multibyte = 0;
       /* Record the coding system's charsets on the terminal when the
          coding system requires encoding; otherwise record ASCII only.  */
9752   tset_charset_list
9753     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9754             ? coding_charset_list (terminal_coding)
9755             : list1 (make_number (charset_ascii))));
9756   return Qnil;
9757 }
9758
9759 DEFUN ("set-safe-terminal-coding-system-internal",
9760        Fset_safe_terminal_coding_system_internal,
9761        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9762        doc: /* Internal use only.  */)
9763   (Lisp_Object coding_system)
9764 {
       /* Set up the file-level `safe_terminal_coding' object from
          CODING_SYSTEM, which must be a symbol naming a valid coding
          system.  Always returns nil.  */
9765   CHECK_SYMBOL (coding_system);
9766   setup_coding_system (Fcheck_coding_system (coding_system),
9767                        &safe_terminal_coding);
9768   /* Character composition should be disabled.  */
9769   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
       /* Multibyte source, unibyte destination.  */
9770   safe_terminal_coding.src_multibyte = 1;
9771   safe_terminal_coding.dst_multibyte = 0;
9772   return Qnil;
9773 }
9774
9775 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9776        Sterminal_coding_system, 0, 1, 0,
9777        doc: /* Return coding system specified for terminal output on the given terminal.
9778 TERMINAL may be a terminal object, a frame, or nil for the selected
9779 frame's terminal device.  */)
9780   (Lisp_Object terminal)
9781 {
       /* Fetch the terminal's output coding object and map its id back
          to the coding system's name.  */
9782   struct coding_system *terminal_coding
9783     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9784   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9785
9786   /* For backward compatibility, return nil if it is `undecided'.  */
9787   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9788 }
9789
9790 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9791        Sset_keyboard_coding_system_internal, 1, 2, 0,
9792        doc: /* Internal use only.  */)
9793   (Lisp_Object coding_system, Lisp_Object terminal)
9794 {
       /* Install CODING_SYSTEM as the keyboard coding system of
          TERMINAL.  Always returns nil.  */
9795   struct terminal *t = get_terminal (terminal, 1);
9796   CHECK_SYMBOL (coding_system);
       /* nil selects `Qno_conversion'; any other symbol must name a
          valid coding system.  */
9797   if (NILP (coding_system))
9798     coding_system = Qno_conversion;
9799   else
9800     Fcheck_coding_system (coding_system);
9801   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9802   /* Character composition should be disabled.  */
9803   TERMINAL_KEYBOARD_CODING (t)->common_flags
9804     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9805   return Qnil;
9806 }
9807
9808 DEFUN ("keyboard-coding-system",
9809        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9810        doc: /* Return coding system specified for decoding keyboard input.  */)
9811   (Lisp_Object terminal)
9812 {
       /* Map the id stored in TERMINAL's keyboard coding object back to
          the coding system's name.  */
9813   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9814                          (get_terminal (terminal, 1))->id);
9815 }
9816
9817 \f
9818 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9819        Sfind_operation_coding_system,  1, MANY, 0,
9820        doc: /* Choose a coding system for an operation based on the target name.
9821 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9822 DECODING-SYSTEM is the coding system to use for decoding
9823 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9824 for encoding (in case OPERATION does encoding).
9825
9826 The first argument OPERATION specifies an I/O primitive:
9827   For file I/O, `insert-file-contents' or `write-region'.
9828   For process I/O, `call-process', `call-process-region', or `start-process'.
9829   For network I/O, `open-network-stream'.
9830
9831 The remaining arguments should be the same arguments that were passed
9832 to the primitive.  Depending on which primitive, one of those arguments
9833 is selected as the TARGET.  For example, if OPERATION does file I/O,
9834 whichever argument specifies the file name is TARGET.
9835
9836 TARGET has a meaning which depends on OPERATION:
9837   For file I/O, TARGET is a file name (except for the special case below).
9838   For process I/O, TARGET is a process name.
9839   For network I/O, TARGET is a service name or a port number.
9840
9841 This function looks up what is specified for TARGET in
9842 `file-coding-system-alist', `process-coding-system-alist',
9843 or `network-coding-system-alist' depending on OPERATION.
9844 They may specify a coding system, a cons of coding systems,
9845 or a function symbol to call.
9846 In the last case, we call the function with one argument,
9847 which is a list of all the arguments given to this function.
9848 If the function can't decide a coding system, it can return
9849 `undecided' so that the normal code-detection is performed.
9850
9851 If OPERATION is `insert-file-contents', the argument corresponding to
9852 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9853 file name to look up, and BUFFER is a buffer that contains the file's
9854 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9855 function to call for FILENAME, that function should examine the
9856 contents of BUFFER instead of reading the file.
9857
9858 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9859   (ptrdiff_t nargs, Lisp_Object *args)
9860 {
9861   Lisp_Object operation, target_idx, target, val;
9862   register Lisp_Object chain;
9863
       /* OPERATION alone is not enough; at least one more argument must
          follow it.  */
9864   if (nargs < 2)
9865     error ("Too few arguments");
9866   operation = args[0];
       /* OPERATION must be a symbol whose Qtarget_idx property is a
          non-negative integer.  That integer is the position of TARGET
          among the arguments that follow OPERATION.  */
9867   if (!SYMBOLP (operation)
9868       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9869     error ("Invalid first argument");
9870   if (nargs <= 1 + XFASTINT (target_idx))
9871     error ("Too few arguments for operation `%s'",
9872            SDATA (SYMBOL_NAME (operation)));
9873   target = args[XFASTINT (target_idx) + 1];
       /* TARGET must be a string, or a (STRING . BUFFER) cons for
          insert-file-contents, or an integer for open-network-stream.  */
9874   if (!(STRINGP (target)
9875         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9876             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9877         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9878     error ("Invalid argument %"pI"d of operation `%s'",
9879            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
       /* For the cons form only the file name takes part in matching.  */
9880   if (CONSP (target))
9881     target = XCAR (target);
9882
       /* Pick the alist by operation: file alist for the two file
          operations, network alist for open-network-stream, and the
          process alist for everything else.  */
9883   chain = ((EQ (operation, Qinsert_file_contents)
9884             || EQ (operation, Qwrite_region))
9885            ? Vfile_coding_system_alist
9886            : (EQ (operation, Qopen_network_stream)
9887               ? Vnetwork_coding_system_alist
9888               : Vprocess_coding_system_alist));
9889   if (NILP (chain))
9890     return Qnil;
9891
       /* Scan the alist in order.  The first entry whose key matches
          TARGET decides the result, because every path inside the
          matching branch returns.  A string key is matched with
          fast_string_match; an integer key is compared with EQ.  */
9892   for (; CONSP (chain); chain = XCDR (chain))
9893     {
9894       Lisp_Object elt;
9895
9896       elt = XCAR (chain);
9897       if (CONSP (elt)
9898           && ((STRINGP (target)
9899                && STRINGP (XCAR (elt))
9900                && fast_string_match (XCAR (elt), target) >= 0)
9901               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9902         {
9903           val = XCDR (elt);
9904           /* Here, if VAL is both a valid coding system and a valid
9905              function symbol, we return VAL as a coding system.  */
               /* A cons is returned as is; a non-symbol yields nil; a
                  coding system symbol is returned as (VAL . VAL).  */
9906           if (CONSP (val))
9907             return val;
9908           if (! SYMBOLP (val))
9909             return Qnil;
9910           if (! NILP (Fcoding_system_p (val)))
9911             return Fcons (val, val);
9912           if (! NILP (Ffboundp (val)))
9913             {
9914               /* We use call1 rather than safe_call1
9915                  so as to get bug reports about functions called here
9916                  which don't handle the current interface.  */
9917               val = call1 (val, Flist (nargs, args));
                   /* The function's result gets the same treatment: a
                      cons as is, a coding system symbol as a pair.  */
9918               if (CONSP (val))
9919                 return val;
9920               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9921                 return Fcons (val, val);
9922             }
9923           return Qnil;
9924         }
9925     }
9926   return Qnil;
9927 }
9928
9929 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9930        Sset_coding_system_priority, 0, MANY, 0,
9931        doc: /* Assign higher priority to the coding systems given as arguments.
9932 If multiple coding systems belong to the same category,
9933 all but the first one are ignored.
9934
9935 usage: (set-coding-system-priority &rest coding-systems)  */)
9936   (ptrdiff_t nargs, Lisp_Object *args)
9937 {
9938   ptrdiff_t i, j;
       /* CHANGED[C] is set once category C has been given a priority by
          this call.  PRIORITIES is the new ordering being built; it is
          copied over `coding_priorities' at the end.  */
9939   bool changed[coding_category_max];
9940   enum coding_category priorities[coding_category_max];
9941
9942   memset (changed, 0, sizeof changed);
9943
       /* First pass: the categories of the arguments, in argument
          order, take the leading slots.  J counts the slots filled.  */
9944   for (i = j = 0; i < nargs; i++)
9945     {
9946       enum coding_category category;
9947       Lisp_Object spec, attrs;
9948
9949       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9950       attrs = AREF (spec, 0);
9951       category = XINT (CODING_ATTR_CATEGORY (attrs));
9952       if (changed[category])
9953         /* Ignore this coding system because a coding system of the
9954            same category already had a higher priority.  */
9955         continue;
9956       changed[category] = 1;
9957       priorities[j++] = category;
           /* Re-run setup only when the category already has a coding
              system (id >= 0) and it is not this one; the category
              symbol's value is set to the argument in every case.  */
9958       if (coding_categories[category].id >= 0
9959           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9960         setup_coding_system (args[i], &coding_categories[category]);
9961       Fset (AREF (Vcoding_category_table, category), args[i]);
9962     }
9963
9964   /* Now we have decided top J priorities.  Reflect the order of the
9965      original priorities to the remaining priorities.  */
9966
       /* From here I is the next free slot in PRIORITIES and J walks
          the old `coding_priorities', skipping categories already
          placed by the first pass.  */
9967   for (i = j, j = 0; i < coding_category_max; i++, j++)
9968     {
9969       while (j < coding_category_max
9970              && changed[coding_priorities[j]])
9971         j++;
           /* Running out of old entries while slots remain is treated
              as an internal error.  */
9972       if (j == coding_category_max)
9973         emacs_abort ();
9974       priorities[i] = coding_priorities[j];
9975     }
9976
9977   memcpy (coding_priorities, priorities, sizeof priorities);
9978
9979   /* Update `coding-category-list'.  */
       /* Consing from the last category backwards leaves the list in
          priority order.  */
9980   Vcoding_category_list = Qnil;
9981   for (i = coding_category_max; i-- > 0; )
9982     Vcoding_category_list
9983       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9984                Vcoding_category_list);
9985
9986   return Qnil;
9987 }
9988
9989 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9990        Scoding_system_priority_list, 0, 1, 0,
9991        doc: /* Return a list of coding systems ordered by their priorities.
9992 The list contains a subset of coding systems; i.e. coding systems
9993 assigned to each coding category (see `coding-category-list').
9994
9995 HIGHESTP non-nil means just return the highest priority one.  */)
9996   (Lisp_Object highestp)
9997 {
9998   int i;
9999   Lisp_Object val;
10000
        /* Visit the categories in priority order and collect the base
           name of each category's coding system.  */
10001   for (i = 0, val = Qnil; i < coding_category_max; i++)
10002     {
10003       enum coding_category category = coding_priorities[i];
10004       int id = coding_categories[category].id;
10005       Lisp_Object attrs;
10006
            /* A negative id is skipped; presumably it marks a category
               with no coding system assigned -- confirm.  */
10007       if (id < 0)
10008         continue;
10009       attrs = CODING_ID_ATTRS (id);
            /* With HIGHESTP, the first usable category is the answer.  */
10010       if (! NILP (highestp))
10011         return CODING_ATTR_BASE_NAME (attrs);
10012       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10013     }
        /* VAL was built by consing at the front, so reverse it to get
           highest priority first.  */
10014   return Fnreverse (val);
10015 }
10016
10017 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
      /* The suffixes above are appended to a base name by
         make_subsidiaries, in this order.  */
10018
10019 static Lisp_Object
10020 make_subsidiaries (Lisp_Object base)
10021 {
        /* Return a 3-element vector of interned symbols named BASE-unix,
           BASE-dos and BASE-mac, where BASE is a symbol.  */
10022   Lisp_Object subsidiaries;
10023   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
        /* 6 extra bytes hold the longest suffix ("-unix", 5 bytes) plus
           the terminating NUL written by strcpy.  */
10024   char *buf = alloca (base_name_len + 6);
10025   int i;
10026
10027   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10028   subsidiaries = make_uninit_vector (3);
10029   for (i = 0; i < 3; i++)
10030     {
            /* Overwrite the previous suffix in place, then intern the
               full name.  */
10031       strcpy (buf + base_name_len, suffixes[i]);
10032       ASET (subsidiaries, i, intern (buf));
10033     }
10034   return subsidiaries;
10035 }
10036
10037
10038 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10039        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10040        doc: /* For internal use only.
10041 usage: (define-coding-system-internal ...)  */)
10042   (ptrdiff_t nargs, Lisp_Object *args)
10043 {
10044   Lisp_Object name;
10045   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10046   Lisp_Object attrs;            /* Vector of attributes.  */
10047   Lisp_Object eol_type;
10048   Lisp_Object aliases;
10049   Lisp_Object coding_type, charset_list, safe_charsets;
10050   enum coding_category category;
10051   Lisp_Object tail, val;
10052   int max_charset_id = 0;
10053   int i;
10054
10055   if (nargs < coding_arg_max)
10056     goto short_args;
10057
10058   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10059
10060   name = args[coding_arg_name];
10061   CHECK_SYMBOL (name);
10062   ASET (attrs, coding_attr_base_name, name);
10063
10064   val = args[coding_arg_mnemonic];
10065   if (! STRINGP (val))
10066     CHECK_CHARACTER (val);
10067   ASET (attrs, coding_attr_mnemonic, val);
10068
10069   coding_type = args[coding_arg_coding_type];
10070   CHECK_SYMBOL (coding_type);
10071   ASET (attrs, coding_attr_type, coding_type);
10072
10073   charset_list = args[coding_arg_charset_list];
10074   if (SYMBOLP (charset_list))
10075     {
10076       if (EQ (charset_list, Qiso_2022))
10077         {
10078           if (! EQ (coding_type, Qiso_2022))
10079             error ("Invalid charset-list");
10080           charset_list = Viso_2022_charset_list;
10081         }
10082       else if (EQ (charset_list, Qemacs_mule))
10083         {
10084           if (! EQ (coding_type, Qemacs_mule))
10085             error ("Invalid charset-list");
10086           charset_list = Vemacs_mule_charset_list;
10087         }
10088       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10089         {
10090           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10091             error ("Invalid charset-list");
10092           if (max_charset_id < XFASTINT (XCAR (tail)))
10093             max_charset_id = XFASTINT (XCAR (tail));
10094         }
10095     }
10096   else
10097     {
10098       charset_list = Fcopy_sequence (charset_list);
10099       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10100         {
10101           struct charset *charset;
10102
10103           val = XCAR (tail);
10104           CHECK_CHARSET_GET_CHARSET (val, charset);
10105           if (EQ (coding_type, Qiso_2022)
10106               ? CHARSET_ISO_FINAL (charset) < 0
10107               : EQ (coding_type, Qemacs_mule)
10108               ? CHARSET_EMACS_MULE_ID (charset) < 0
10109               : 0)
10110             error ("Can't handle charset `%s'",
10111                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10112
10113           XSETCAR (tail, make_number (charset->id));
10114           if (max_charset_id < charset->id)
10115             max_charset_id = charset->id;
10116         }
10117     }
10118   ASET (attrs, coding_attr_charset_list, charset_list);
10119
10120   safe_charsets = make_uninit_string (max_charset_id + 1);
10121   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10122   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10123     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10124   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10125
10126   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10127
10128   val = args[coding_arg_decode_translation_table];
10129   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10130     CHECK_SYMBOL (val);
10131   ASET (attrs, coding_attr_decode_tbl, val);
10132
10133   val = args[coding_arg_encode_translation_table];
10134   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10135     CHECK_SYMBOL (val);
10136   ASET (attrs, coding_attr_encode_tbl, val);
10137
10138   val = args[coding_arg_post_read_conversion];
10139   CHECK_SYMBOL (val);
10140   ASET (attrs, coding_attr_post_read, val);
10141
10142   val = args[coding_arg_pre_write_conversion];
10143   CHECK_SYMBOL (val);
10144   ASET (attrs, coding_attr_pre_write, val);
10145
10146   val = args[coding_arg_default_char];
10147   if (NILP (val))
10148     ASET (attrs, coding_attr_default_char, make_number (' '));
10149   else
10150     {
10151       CHECK_CHARACTER (val);
10152       ASET (attrs, coding_attr_default_char, val);
10153     }
10154
10155   val = args[coding_arg_for_unibyte];
10156   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10157
10158   val = args[coding_arg_plist];
10159   CHECK_LIST (val);
10160   ASET (attrs, coding_attr_plist, val);
10161
10162   if (EQ (coding_type, Qcharset))
10163     {
10164       /* Generate a lisp vector of 256 elements.  Each element is nil,
10165          integer, or a list of charset IDs.
10166
10167          If Nth element is nil, the byte code N is invalid in this
10168          coding system.
10169
10170          If Nth element is a number NUM, N is the first byte of a
10171          charset whose ID is NUM.
10172
10173          If Nth element is a list of charset IDs, N is the first byte
10174          of one of them.  The list is sorted by dimensions of the
10175          charsets.  A charset of smaller dimension comes first. */
10176       val = Fmake_vector (make_number (256), Qnil);
10177
10178       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10179         {
10180           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10181           int dim = CHARSET_DIMENSION (charset);
10182           int idx = (dim - 1) * 4;
10183
10184           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10185             ASET (attrs, coding_attr_ascii_compat, Qt);
10186
10187           for (i = charset->code_space[idx];
10188                i <= charset->code_space[idx + 1]; i++)
10189             {
10190               Lisp_Object tmp, tmp2;
10191               int dim2;
10192
10193               tmp = AREF (val, i);
10194               if (NILP (tmp))
10195                 tmp = XCAR (tail);
10196               else if (NUMBERP (tmp))
10197                 {
10198                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10199                   if (dim < dim2)
10200                     tmp = list2 (XCAR (tail), tmp);
10201                   else
10202                     tmp = list2 (tmp, XCAR (tail));
10203                 }
10204               else
10205                 {
10206                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10207                     {
10208                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10209                       if (dim < dim2)
10210                         break;
10211                     }
10212                   if (NILP (tmp2))
10213                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10214                   else
10215                     {
10216                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10217                       XSETCAR (tmp2, XCAR (tail));
10218                     }
10219                 }
10220               ASET (val, i, tmp);
10221             }
10222         }
10223       ASET (attrs, coding_attr_charset_valids, val);
10224       category = coding_category_charset;
10225     }
10226   else if (EQ (coding_type, Qccl))
10227     {
10228       Lisp_Object valids;
10229
10230       if (nargs < coding_arg_ccl_max)
10231         goto short_args;
10232
10233       val = args[coding_arg_ccl_decoder];
10234       CHECK_CCL_PROGRAM (val);
10235       if (VECTORP (val))
10236         val = Fcopy_sequence (val);
10237       ASET (attrs, coding_attr_ccl_decoder, val);
10238
10239       val = args[coding_arg_ccl_encoder];
10240       CHECK_CCL_PROGRAM (val);
10241       if (VECTORP (val))
10242         val = Fcopy_sequence (val);
10243       ASET (attrs, coding_attr_ccl_encoder, val);
10244
10245       val = args[coding_arg_ccl_valids];
10246       valids = Fmake_string (make_number (256), make_number (0));
10247       for (tail = val; CONSP (tail); tail = XCDR (tail))
10248         {
10249           int from, to;
10250
10251           val = XCAR (tail);
10252           if (INTEGERP (val))
10253             {
10254               if (! (0 <= XINT (val) && XINT (val) <= 255))
10255                 args_out_of_range_3 (val, make_number (0), make_number (255));
10256               from = to = XINT (val);
10257             }
10258           else
10259             {
10260               CHECK_CONS (val);
10261               CHECK_NATNUM_CAR (val);
10262               CHECK_NUMBER_CDR (val);
10263               if (XINT (XCAR (val)) > 255)
10264                 args_out_of_range_3 (XCAR (val),
10265                                      make_number (0), make_number (255));
10266               from = XINT (XCAR (val));
10267               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10268                 args_out_of_range_3 (XCDR (val),
10269                                      XCAR (val), make_number (255));
10270               to = XINT (XCDR (val));
10271             }
10272           for (i = from; i <= to; i++)
10273             SSET (valids, i, 1);
10274         }
10275       ASET (attrs, coding_attr_ccl_valids, valids);
10276
10277       category = coding_category_ccl;
10278     }
10279   else if (EQ (coding_type, Qutf_16))
10280     {
10281       Lisp_Object bom, endian;
10282
10283       ASET (attrs, coding_attr_ascii_compat, Qnil);
10284
10285       if (nargs < coding_arg_utf16_max)
10286         goto short_args;
10287
10288       bom = args[coding_arg_utf16_bom];
10289       if (! NILP (bom) && ! EQ (bom, Qt))
10290         {
10291           CHECK_CONS (bom);
10292           val = XCAR (bom);
10293           CHECK_CODING_SYSTEM (val);
10294           val = XCDR (bom);
10295           CHECK_CODING_SYSTEM (val);
10296         }
10297       ASET (attrs, coding_attr_utf_bom, bom);
10298
10299       endian = args[coding_arg_utf16_endian];
10300       CHECK_SYMBOL (endian);
10301       if (NILP (endian))
10302         endian = Qbig;
10303       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10304         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10305       ASET (attrs, coding_attr_utf_16_endian, endian);
10306
10307       category = (CONSP (bom)
10308                   ? coding_category_utf_16_auto
10309                   : NILP (bom)
10310                   ? (EQ (endian, Qbig)
10311                      ? coding_category_utf_16_be_nosig
10312                      : coding_category_utf_16_le_nosig)
10313                   : (EQ (endian, Qbig)
10314                      ? coding_category_utf_16_be
10315                      : coding_category_utf_16_le));
10316     }
10317   else if (EQ (coding_type, Qiso_2022))
10318     {
10319       Lisp_Object initial, reg_usage, request, flags;
10320
10321       if (nargs < coding_arg_iso2022_max)
10322         goto short_args;
10323
10324       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10325       CHECK_VECTOR (initial);
10326       for (i = 0; i < 4; i++)
10327         {
10328           val = AREF (initial, i);
10329           if (! NILP (val))
10330             {
10331               struct charset *charset;
10332
10333               CHECK_CHARSET_GET_CHARSET (val, charset);
10334               ASET (initial, i, make_number (CHARSET_ID (charset)));
10335               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10336                 ASET (attrs, coding_attr_ascii_compat, Qt);
10337             }
10338           else
10339             ASET (initial, i, make_number (-1));
10340         }
10341
10342       reg_usage = args[coding_arg_iso2022_reg_usage];
10343       CHECK_CONS (reg_usage);
10344       CHECK_NUMBER_CAR (reg_usage);
10345       CHECK_NUMBER_CDR (reg_usage);
10346
10347       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10348       for (tail = request; CONSP (tail); tail = XCDR (tail))
10349         {
10350           int id;
10351           Lisp_Object tmp1;
10352
10353           val = XCAR (tail);
10354           CHECK_CONS (val);
10355           tmp1 = XCAR (val);
10356           CHECK_CHARSET_GET_ID (tmp1, id);
10357           CHECK_NATNUM_CDR (val);
10358           if (XINT (XCDR (val)) >= 4)
10359             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10360           XSETCAR (val, make_number (id));
10361         }
10362
10363       flags = args[coding_arg_iso2022_flags];
10364       CHECK_NATNUM (flags);
10365       i = XINT (flags) & INT_MAX;
10366       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10367         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10368       flags = make_number (i);
10369
10370       ASET (attrs, coding_attr_iso_initial, initial);
10371       ASET (attrs, coding_attr_iso_usage, reg_usage);
10372       ASET (attrs, coding_attr_iso_request, request);
10373       ASET (attrs, coding_attr_iso_flags, flags);
10374       setup_iso_safe_charsets (attrs);
10375
10376       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10377         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10378                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10379                     ? coding_category_iso_7_else
10380                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10381                     ? coding_category_iso_7
10382                     : coding_category_iso_7_tight);
10383       else
10384         {
10385           int id = XINT (AREF (initial, 1));
10386
10387           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10388                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10389                        || id < 0)
10390                       ? coding_category_iso_8_else
10391                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10392                       ? coding_category_iso_8_1
10393                       : coding_category_iso_8_2);
10394         }
10395       if (category != coding_category_iso_8_1
10396           && category != coding_category_iso_8_2)
10397         ASET (attrs, coding_attr_ascii_compat, Qnil);
10398     }
10399   else if (EQ (coding_type, Qemacs_mule))
10400     {
10401       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10402         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10403       ASET (attrs, coding_attr_ascii_compat, Qt);
10404       category = coding_category_emacs_mule;
10405     }
10406   else if (EQ (coding_type, Qshift_jis))
10407     {
10408
10409       struct charset *charset;
10410
10411       if (XINT (Flength (charset_list)) != 3
10412           && XINT (Flength (charset_list)) != 4)
10413         error ("There should be three or four charsets");
10414
10415       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10416       if (CHARSET_DIMENSION (charset) != 1)
10417         error ("Dimension of charset %s is not one",
10418                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10419       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10420         ASET (attrs, coding_attr_ascii_compat, Qt);
10421
10422       charset_list = XCDR (charset_list);
10423       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10424       if (CHARSET_DIMENSION (charset) != 1)
10425         error ("Dimension of charset %s is not one",
10426                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10427
10428       charset_list = XCDR (charset_list);
10429       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10430       if (CHARSET_DIMENSION (charset) != 2)
10431         error ("Dimension of charset %s is not two",
10432                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10433
10434       charset_list = XCDR (charset_list);
10435       if (! NILP (charset_list))
10436         {
10437           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10438           if (CHARSET_DIMENSION (charset) != 2)
10439             error ("Dimension of charset %s is not two",
10440                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10441         }
10442
10443       category = coding_category_sjis;
10444       Vsjis_coding_system = name;
10445     }
10446   else if (EQ (coding_type, Qbig5))
10447     {
10448       struct charset *charset;
10449
10450       if (XINT (Flength (charset_list)) != 2)
10451         error ("There should be just two charsets");
10452
10453       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10454       if (CHARSET_DIMENSION (charset) != 1)
10455         error ("Dimension of charset %s is not one",
10456                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10457       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10458         ASET (attrs, coding_attr_ascii_compat, Qt);
10459
10460       charset_list = XCDR (charset_list);
10461       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10462       if (CHARSET_DIMENSION (charset) != 2)
10463         error ("Dimension of charset %s is not two",
10464                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10465
10466       category = coding_category_big5;
10467       Vbig5_coding_system = name;
10468     }
10469   else if (EQ (coding_type, Qraw_text))
10470     {
10471       category = coding_category_raw_text;
10472       ASET (attrs, coding_attr_ascii_compat, Qt);
10473     }
10474   else if (EQ (coding_type, Qutf_8))
10475     {
10476       Lisp_Object bom;
10477
10478       if (nargs < coding_arg_utf8_max)
10479         goto short_args;
10480
10481       bom = args[coding_arg_utf8_bom];
10482       if (! NILP (bom) && ! EQ (bom, Qt))
10483         {
10484           CHECK_CONS (bom);
10485           val = XCAR (bom);
10486           CHECK_CODING_SYSTEM (val);
10487           val = XCDR (bom);
10488           CHECK_CODING_SYSTEM (val);
10489         }
10490       ASET (attrs, coding_attr_utf_bom, bom);
10491       if (NILP (bom))
10492         ASET (attrs, coding_attr_ascii_compat, Qt);
10493
10494       category = (CONSP (bom) ? coding_category_utf_8_auto
10495                   : NILP (bom) ? coding_category_utf_8_nosig
10496                   : coding_category_utf_8_sig);
10497     }
10498   else if (EQ (coding_type, Qundecided))
10499     {
10500       if (nargs < coding_arg_undecided_max)
10501         goto short_args;
10502       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10503             args[coding_arg_undecided_inhibit_null_byte_detection]);
10504       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10505             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10506       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10507             args[coding_arg_undecided_prefer_utf_8]);
10508       category = coding_category_undecided;
10509     }
10510   else
10511     error ("Invalid coding system type: %s",
10512            SDATA (SYMBOL_NAME (coding_type)));
10513
10514   ASET (attrs, coding_attr_category, make_number (category));
10515   ASET (attrs, coding_attr_plist,
10516         Fcons (QCcategory,
10517                Fcons (AREF (Vcoding_category_table, category),
10518                       CODING_ATTR_PLIST (attrs))));
10519   ASET (attrs, coding_attr_plist,
10520         Fcons (QCascii_compatible_p,
10521                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10522                       CODING_ATTR_PLIST (attrs))));
10523
10524   eol_type = args[coding_arg_eol_type];
10525   if (! NILP (eol_type)
10526       && ! EQ (eol_type, Qunix)
10527       && ! EQ (eol_type, Qdos)
10528       && ! EQ (eol_type, Qmac))
10529     error ("Invalid eol-type");
10530
10531   aliases = list1 (name);
10532
10533   if (NILP (eol_type))
10534     {
10535       eol_type = make_subsidiaries (name);
10536       for (i = 0; i < 3; i++)
10537         {
10538           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10539
10540           this_name = AREF (eol_type, i);
10541           this_aliases = list1 (this_name);
10542           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10543           this_spec = make_uninit_vector (3);
10544           ASET (this_spec, 0, attrs);
10545           ASET (this_spec, 1, this_aliases);
10546           ASET (this_spec, 2, this_eol_type);
10547           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10548           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10549           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10550           if (NILP (val))
10551             Vcoding_system_alist
10552               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10553                        Vcoding_system_alist);
10554         }
10555     }
10556
10557   spec_vec = make_uninit_vector (3);
10558   ASET (spec_vec, 0, attrs);
10559   ASET (spec_vec, 1, aliases);
10560   ASET (spec_vec, 2, eol_type);
10561
10562   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10563   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10564   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10565   if (NILP (val))
10566     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10567                                   Vcoding_system_alist);
10568
10569   {
10570     int id = coding_categories[category].id;
10571
10572     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10573       setup_coding_system (name, &coding_categories[category]);
10574   }
10575
10576   return Qnil;
10577
10578  short_args:
10579   return Fsignal (Qwrong_number_of_arguments,
10580                   Fcons (intern ("define-coding-system-internal"),
10581                          make_number (nargs)));
10582 }
10583
10584
10585 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10586        3, 3, 0,
10587        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10588   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10589 {
10590   Lisp_Object spec, attrs;
10591
10592   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10593   attrs = AREF (spec, 0);
10594   if (EQ (prop, QCmnemonic))
10595     {
10596       if (! STRINGP (val))
10597         CHECK_CHARACTER (val);
10598       ASET (attrs, coding_attr_mnemonic, val);
10599     }
10600   else if (EQ (prop, QCdefault_char))
10601     {
10602       if (NILP (val))
10603         val = make_number (' ');
10604       else
10605         CHECK_CHARACTER (val);
10606       ASET (attrs, coding_attr_default_char, val);
10607     }
10608   else if (EQ (prop, QCdecode_translation_table))
10609     {
10610       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10611         CHECK_SYMBOL (val);
10612       ASET (attrs, coding_attr_decode_tbl, val);
10613     }
10614   else if (EQ (prop, QCencode_translation_table))
10615     {
10616       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10617         CHECK_SYMBOL (val);
10618       ASET (attrs, coding_attr_encode_tbl, val);
10619     }
10620   else if (EQ (prop, QCpost_read_conversion))
10621     {
10622       CHECK_SYMBOL (val);
10623       ASET (attrs, coding_attr_post_read, val);
10624     }
10625   else if (EQ (prop, QCpre_write_conversion))
10626     {
10627       CHECK_SYMBOL (val);
10628       ASET (attrs, coding_attr_pre_write, val);
10629     }
10630   else if (EQ (prop, QCascii_compatible_p))
10631     {
10632       ASET (attrs, coding_attr_ascii_compat, val);
10633     }
10634
10635   ASET (attrs, coding_attr_plist,
10636         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10637   return val;
10638 }
10639
10640
10641 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10642        Sdefine_coding_system_alias, 2, 2, 0,
10643        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10644   (Lisp_Object alias, Lisp_Object coding_system)
10645 {
10646   Lisp_Object spec, aliases, eol_type, val;
10647
10648   CHECK_SYMBOL (alias);
10649   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10650   aliases = AREF (spec, 1);
10651   /* ALIASES should be a list of length more than zero, and the first
10652      element is a base coding system.  Append ALIAS at the tail of the
10653      list.  */
10654   while (!NILP (XCDR (aliases)))
10655     aliases = XCDR (aliases);
10656   XSETCDR (aliases, list1 (alias));
10657
10658   eol_type = AREF (spec, 2);
10659   if (VECTORP (eol_type))
10660     {
10661       Lisp_Object subsidiaries;
10662       int i;
10663
10664       subsidiaries = make_subsidiaries (alias);
10665       for (i = 0; i < 3; i++)
10666         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10667                                      AREF (eol_type, i));
10668     }
10669
10670   Fputhash (alias, spec, Vcoding_system_hash_table);
10671   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10672   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10673   if (NILP (val))
10674     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10675                                   Vcoding_system_alist);
10676
10677   return Qnil;
10678 }
10679
10680 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10681        1, 1, 0,
10682        doc: /* Return the base of CODING-SYSTEM.
10683 Any alias or subsidiary coding system is not a base coding system.  */)
10684   (Lisp_Object coding_system)
10685 {
10686   Lisp_Object spec, attrs;
10687
10688   if (NILP (coding_system))
10689     return (Qno_conversion);
10690   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10691   attrs = AREF (spec, 0);
10692   return CODING_ATTR_BASE_NAME (attrs);
10693 }
10694
10695 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10696        1, 1, 0,
10697        doc: "Return the property list of CODING-SYSTEM.")
10698   (Lisp_Object coding_system)
10699 {
10700   Lisp_Object spec, attrs;
10701
10702   if (NILP (coding_system))
10703     coding_system = Qno_conversion;
10704   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10705   attrs = AREF (spec, 0);
10706   return CODING_ATTR_PLIST (attrs);
10707 }
10708
10709
10710 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10711        1, 1, 0,
10712        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10713   (Lisp_Object coding_system)
10714 {
10715   Lisp_Object spec;
10716
10717   if (NILP (coding_system))
10718     coding_system = Qno_conversion;
10719   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10720   return AREF (spec, 1);
10721 }
10722
10723 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10724        Scoding_system_eol_type, 1, 1, 0,
10725        doc: /* Return eol-type of CODING-SYSTEM.
10726 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10727
10728 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10729 and CR respectively.
10730
10731 A vector value indicates that a format of end-of-line should be
10732 detected automatically.  Nth element of the vector is the subsidiary
10733 coding system whose eol-type is N.  */)
10734   (Lisp_Object coding_system)
10735 {
10736   Lisp_Object spec, eol_type;
10737   int n;
10738
10739   if (NILP (coding_system))
10740     coding_system = Qno_conversion;
10741   if (! CODING_SYSTEM_P (coding_system))
10742     return Qnil;
10743   spec = CODING_SYSTEM_SPEC (coding_system);
10744   eol_type = AREF (spec, 2);
10745   if (VECTORP (eol_type))
10746     return Fcopy_sequence (eol_type);
10747   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10748   return make_number (n);
10749 }
10750
10751 #endif /* emacs */
10752
10753 \f
10754 /*** 9. Post-amble ***/
10755
10756 void
10757 init_coding_once (void)
10758 {
10759   int i;
10760
10761   for (i = 0; i < coding_category_max; i++)
10762     {
10763       coding_categories[i].id = -1;
10764       coding_priorities[i] = i;
10765     }
10766
10767   /* ISO2022 specific initialize routine.  */
10768   for (i = 0; i < 0x20; i++)
10769     iso_code_class[i] = ISO_control_0;
10770   for (i = 0x21; i < 0x7F; i++)
10771     iso_code_class[i] = ISO_graphic_plane_0;
10772   for (i = 0x80; i < 0xA0; i++)
10773     iso_code_class[i] = ISO_control_1;
10774   for (i = 0xA1; i < 0xFF; i++)
10775     iso_code_class[i] = ISO_graphic_plane_1;
10776   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10777   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10778   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10779   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10780   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10781   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10782   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10783   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10784   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10785
10786   for (i = 0; i < 256; i++)
10787     {
10788       emacs_mule_bytes[i] = 1;
10789     }
10790   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10791   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10792   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10793   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10794 }
10795
10796 #ifdef emacs
10797
10798 void
10799 syms_of_coding (void)
10800 {
10801 #include "coding.x"
10802
10803   staticpro (&Vcoding_system_hash_table);
10804   {
10805     Lisp_Object args[2];
10806     args[0] = QCtest;
10807     args[1] = Qeq;
10808     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10809   }
10810
10811   staticpro (&Vsjis_coding_system);
10812   Vsjis_coding_system = Qnil;
10813
10814   staticpro (&Vbig5_coding_system);
10815   Vbig5_coding_system = Qnil;
10816
10817   staticpro (&Vcode_conversion_reused_workbuf);
10818   Vcode_conversion_reused_workbuf = Qnil;
10819
10820   staticpro (&Vcode_conversion_workbuf_name);
10821   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10822
10823   reused_workbuf_in_use = 0;
10824
10825   DEFSYM (Qcharset, "charset");
10826   DEFSYM (Qtarget_idx, "target-idx");
10827   DEFSYM (Qcoding_system_history, "coding-system-history");
10828   Fset (Qcoding_system_history, Qnil);
10829
10830   /* Target FILENAME is the first argument.  */
10831   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10832   /* Target FILENAME is the third argument.  */
10833   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10834
10835   DEFSYM (Qcall_process, "call-process");
10836   /* Target PROGRAM is the first argument.  */
10837   Fput (Qcall_process, Qtarget_idx, make_number (0));
10838
10839   DEFSYM (Qcall_process_region, "call-process-region");
10840   /* Target PROGRAM is the third argument.  */
10841   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10842
10843   DEFSYM (Qstart_process, "start-process");
10844   /* Target PROGRAM is the third argument.  */
10845   Fput (Qstart_process, Qtarget_idx, make_number (2));
10846
10847   DEFSYM (Qopen_network_stream, "open-network-stream");
10848   /* Target SERVICE is the fourth argument.  */
10849   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10850
10851   DEFSYM (Qcoding_system, "coding-system");
10852   DEFSYM (Qcoding_aliases, "coding-aliases");
10853
10854   DEFSYM (Qeol_type, "eol-type");
10855   DEFSYM (Qunix, "unix");
10856   DEFSYM (Qdos, "dos");
10857   DEFSYM (Qmac, "mac");
10858
10859   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10860   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10861   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10862   DEFSYM (Qdefault_char, "default-char");
10863   DEFSYM (Qundecided, "undecided");
10864   DEFSYM (Qno_conversion, "no-conversion");
10865   DEFSYM (Qraw_text, "raw-text");
10866
10867   DEFSYM (Qiso_2022, "iso-2022");
10868
10869   DEFSYM (Qutf_8, "utf-8");
10870   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10871
10872 #if defined (WINDOWSNT) || defined (CYGWIN)
10873   /* No, not utf-16-le: that one has a BOM.  */
10874   DEFSYM (Qutf_16le, "utf-16le");
10875 #endif
10876
10877   DEFSYM (Qutf_16, "utf-16");
10878   DEFSYM (Qbig, "big");
10879   DEFSYM (Qlittle, "little");
10880
10881   DEFSYM (Qshift_jis, "shift-jis");
10882   DEFSYM (Qbig5, "big5");
10883
10884   DEFSYM (Qcoding_system_p, "coding-system-p");
10885
10886   DEFSYM (Qcoding_system_error, "coding-system-error");
10887   Fput (Qcoding_system_error, Qerror_conditions,
10888         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10889   Fput (Qcoding_system_error, Qerror_message,
10890         build_pure_c_string ("Invalid coding system"));
10891
10892   DEFSYM (Qtranslation_table, "translation-table");
10893   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10894   DEFSYM (Qtranslation_table_id, "translation-table-id");
10895   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10896   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10897
10898   DEFSYM (Qvalid_codes, "valid-codes");
10899
10900   DEFSYM (Qemacs_mule, "emacs-mule");
10901
10902   DEFSYM (QCcategory, ":category");
10903   DEFSYM (QCmnemonic, ":mnemonic");
10904   DEFSYM (QCdefault_char, ":default-char");
10905   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10906   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10907   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10908   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10909   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10910
10911   Vcoding_category_table
10912     = Fmake_vector (make_number (coding_category_max), Qnil);
10913   staticpro (&Vcoding_category_table);
10914   /* Followings are target of code detection.  */
10915   ASET (Vcoding_category_table, coding_category_iso_7,
10916         intern_c_string ("coding-category-iso-7"));
10917   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10918         intern_c_string ("coding-category-iso-7-tight"));
10919   ASET (Vcoding_category_table, coding_category_iso_8_1,
10920         intern_c_string ("coding-category-iso-8-1"));
10921   ASET (Vcoding_category_table, coding_category_iso_8_2,
10922         intern_c_string ("coding-category-iso-8-2"));
10923   ASET (Vcoding_category_table, coding_category_iso_7_else,
10924         intern_c_string ("coding-category-iso-7-else"));
10925   ASET (Vcoding_category_table, coding_category_iso_8_else,
10926         intern_c_string ("coding-category-iso-8-else"));
10927   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10928         intern_c_string ("coding-category-utf-8-auto"));
10929   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10930         intern_c_string ("coding-category-utf-8"));
10931   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10932         intern_c_string ("coding-category-utf-8-sig"));
10933   ASET (Vcoding_category_table, coding_category_utf_16_be,
10934         intern_c_string ("coding-category-utf-16-be"));
10935   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10936         intern_c_string ("coding-category-utf-16-auto"));
10937   ASET (Vcoding_category_table, coding_category_utf_16_le,
10938         intern_c_string ("coding-category-utf-16-le"));
10939   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10940         intern_c_string ("coding-category-utf-16-be-nosig"));
10941   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10942         intern_c_string ("coding-category-utf-16-le-nosig"));
10943   ASET (Vcoding_category_table, coding_category_charset,
10944         intern_c_string ("coding-category-charset"));
10945   ASET (Vcoding_category_table, coding_category_sjis,
10946         intern_c_string ("coding-category-sjis"));
10947   ASET (Vcoding_category_table, coding_category_big5,
10948         intern_c_string ("coding-category-big5"));
10949   ASET (Vcoding_category_table, coding_category_ccl,
10950         intern_c_string ("coding-category-ccl"));
10951   ASET (Vcoding_category_table, coding_category_emacs_mule,
10952         intern_c_string ("coding-category-emacs-mule"));
10953   /* Followings are NOT target of code detection.  */
10954   ASET (Vcoding_category_table, coding_category_raw_text,
10955         intern_c_string ("coding-category-raw-text"));
10956   ASET (Vcoding_category_table, coding_category_undecided,
10957         intern_c_string ("coding-category-undecided"));
10958
10959   DEFSYM (Qinsufficient_source, "insufficient-source");
10960   DEFSYM (Qinvalid_source, "invalid-source");
10961   DEFSYM (Qinterrupted, "interrupted");
10962   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10963
10964   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10965                doc: /* List of coding systems.
10966
10967 Do not alter the value of this variable manually.  This variable should be
10968 updated by the functions `define-coding-system' and
10969 `define-coding-system-alias'.  */);
10970   Vcoding_system_list = Qnil;
10971
10972   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10973                doc: /* Alist of coding system names.
10974 Each element is one element list of coding system name.
10975 This variable is given to `completing-read' as COLLECTION argument.
10976
10977 Do not alter the value of this variable manually.  This variable should be
10978 updated by the functions `make-coding-system' and
10979 `define-coding-system-alias'.  */);
10980   Vcoding_system_alist = Qnil;
10981
10982   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10983                doc: /* List of coding-categories (symbols) ordered by priority.
10984
10985 On detecting a coding system, Emacs tries code detection algorithms
10986 associated with each coding-category one by one in this order.  When
10987 one algorithm agrees with a byte sequence of source text, the coding
10988 system bound to the corresponding coding-category is selected.
10989
10990 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10991   {
10992     int i;
10993
10994     Vcoding_category_list = Qnil;
10995     for (i = coding_category_max - 1; i >= 0; i--)
10996       Vcoding_category_list
10997         = Fcons (AREF (Vcoding_category_table, i),
10998                  Vcoding_category_list);
10999   }
11000
11001   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11002                doc: /* Specify the coding system for read operations.
11003 It is useful to bind this variable with `let', but do not set it globally.
11004 If the value is a coding system, it is used for decoding on read operation.
11005 If not, an appropriate element is used from one of the coding system alists.
11006 There are three such tables: `file-coding-system-alist',
11007 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11008   Vcoding_system_for_read = Qnil;
11009
11010   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11011                doc: /* Specify the coding system for write operations.
11012 Programs bind this variable with `let', but you should not set it globally.
11013 If the value is a coding system, it is used for encoding of output,
11014 when writing it to a file and when sending it to a file or subprocess.
11015
11016 If this does not specify a coding system, an appropriate element
11017 is used from one of the coding system alists.
11018 There are three such tables: `file-coding-system-alist',
11019 `process-coding-system-alist', and `network-coding-system-alist'.
11020 For output to files, if the above procedure does not specify a coding system,
11021 the value of `buffer-file-coding-system' is used.  */);
11022   Vcoding_system_for_write = Qnil;
11023
11024   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11025                doc: /*
11026 Coding system used in the latest file or process I/O.  */);
11027   Vlast_coding_system_used = Qnil;
11028
11029   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11030                doc: /*
11031 Error status of the last code conversion.
11032
11033 When an error was detected in the last code conversion, this variable
11034 is set to one of the following symbols.
11035   `insufficient-source'
11036   `inconsistent-eol'
11037   `invalid-source'
11038   `interrupted'
11039   `insufficient-memory'
11040 When no error was detected, the value doesn't change.  So, to check
11041 the error status of a code conversion by this variable, you must
11042 explicitly set this variable to nil before performing code
11043 conversion.  */);
11044   Vlast_code_conversion_error = Qnil;
11045
11046   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11047                doc: /*
11048 *Non-nil means always inhibit code conversion of end-of-line format.
11049 See info node `Coding Systems' and info node `Text and Binary' concerning
11050 such conversion.  */);
11051   inhibit_eol_conversion = 0;
11052
11053   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11054                doc: /*
11055 Non-nil means process buffer inherits coding system of process output.
11056 Bind it to t if the process output is to be treated as if it were a file
11057 read from some filesystem.  */);
11058   inherit_process_coding_system = 0;
11059
11060   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11061                doc: /*
11062 Alist to decide a coding system to use for a file I/O operation.
11063 The format is ((PATTERN . VAL) ...),
11064 where PATTERN is a regular expression matching a file name,
11065 VAL is a coding system, a cons of coding systems, or a function symbol.
11066 If VAL is a coding system, it is used for both decoding and encoding
11067 the file contents.
11068 If VAL is a cons of coding systems, the car part is used for decoding,
11069 and the cdr part is used for encoding.
11070 If VAL is a function symbol, the function must return a coding system
11071 or a cons of coding systems which are used as above.  The function is
11072 called with an argument that is a list of the arguments with which
11073 `find-operation-coding-system' was called.  If the function can't decide
11074 a coding system, it can return `undecided' so that the normal
11075 code-detection is performed.
11076
11077 See also the function `find-operation-coding-system'
11078 and the variable `auto-coding-alist'.  */);
11079   Vfile_coding_system_alist = Qnil;
11080
11081   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11082                doc: /*
11083 Alist to decide a coding system to use for a process I/O operation.
11084 The format is ((PATTERN . VAL) ...),
11085 where PATTERN is a regular expression matching a program name,
11086 VAL is a coding system, a cons of coding systems, or a function symbol.
11087 If VAL is a coding system, it is used for both decoding what received
11088 from the program and encoding what sent to the program.
11089 If VAL is a cons of coding systems, the car part is used for decoding,
11090 and the cdr part is used for encoding.
11091 If VAL is a function symbol, the function must return a coding system
11092 or a cons of coding systems which are used as above.
11093
11094 See also the function `find-operation-coding-system'.  */);
11095   Vprocess_coding_system_alist = Qnil;
11096
11097   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11098                doc: /*
11099 Alist to decide a coding system to use for a network I/O operation.
11100 The format is ((PATTERN . VAL) ...),
11101 where PATTERN is a regular expression matching a network service name
11102 or is a port number to connect to,
11103 VAL is a coding system, a cons of coding systems, or a function symbol.
11104 If VAL is a coding system, it is used for both decoding what received
11105 from the network stream and encoding what sent to the network stream.
11106 If VAL is a cons of coding systems, the car part is used for decoding,
11107 and the cdr part is used for encoding.
11108 If VAL is a function symbol, the function must return a coding system
11109 or a cons of coding systems which are used as above.
11110
11111 See also the function `find-operation-coding-system'.  */);
11112   Vnetwork_coding_system_alist = Qnil;
11113
11114   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11115                doc: /* Coding system to use with system messages.
11116 Also used for decoding keyboard input on X Window system.  */);
11117   Vlocale_coding_system = Qnil;
11118
11119   /* The eol mnemonics are reset in startup.el system-dependently.  */
11120   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11121                doc: /*
11122 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11123   eol_mnemonic_unix = build_pure_c_string (":");
11124
11125   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11126                doc: /*
11127 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11128   eol_mnemonic_dos = build_pure_c_string ("\\");
11129
11130   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11131                doc: /*
11132 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11133   eol_mnemonic_mac = build_pure_c_string ("/");
11134
11135   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11136                doc: /*
11137 *String displayed in mode line when end-of-line format is not yet determined.  */);
11138   eol_mnemonic_undecided = build_pure_c_string (":");
11139
11140   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11141                doc: /*
11142 *Non-nil enables character translation while encoding and decoding.  */);
11143   Venable_character_translation = Qt;
11144
11145   DEFVAR_LISP ("standard-translation-table-for-decode",
11146                Vstandard_translation_table_for_decode,
11147                doc: /* Table for translating characters while decoding.  */);
11148   Vstandard_translation_table_for_decode = Qnil;
11149
11150   DEFVAR_LISP ("standard-translation-table-for-encode",
11151                Vstandard_translation_table_for_encode,
11152                doc: /* Table for translating characters while encoding.  */);
11153   Vstandard_translation_table_for_encode = Qnil;
11154
11155   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11156                doc: /* Alist of charsets vs revision numbers.
11157 While encoding, if a charset (car part of an element) is found,
11158 designate it with the escape sequence identifying revision (cdr part
11159 of the element).  */);
11160   Vcharset_revision_table = Qnil;
11161
11162   DEFVAR_LISP ("default-process-coding-system",
11163                Vdefault_process_coding_system,
11164                doc: /* Cons of coding systems used for process I/O by default.
11165 The car part is used for decoding a process output,
11166 the cdr part is used for encoding a text to be sent to a process.  */);
11167   Vdefault_process_coding_system = Qnil;
11168
11169   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11170                doc: /*
11171 Table of extra Latin codes in the range 128..159 (inclusive).
11172 This is a vector of length 256.
11173 If Nth element is non-nil, the existence of code N in a file
11174 \(or output of subprocess) doesn't prevent it to be detected as
11175 a coding system of ISO 2022 variant which has a flag
11176 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11177 or reading output of a subprocess.
11178 Only 128th through 159th elements have a meaning.  */);
11179   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11180
11181   DEFVAR_LISP ("select-safe-coding-system-function",
11182                Vselect_safe_coding_system_function,
11183                doc: /*
11184 Function to call to select safe coding system for encoding a text.
11185
11186 If set, this function is called to force a user to select a proper
11187 coding system which can encode the text in the case that a default
11188 coding system used in each operation can't encode the text.  The
11189 function should take care that the buffer is not modified while
11190 the coding system is being selected.
11191
11192 The default value is `select-safe-coding-system' (which see).  */);
11193   Vselect_safe_coding_system_function = Qnil;
11194
11195   DEFVAR_BOOL ("coding-system-require-warning",
11196                coding_system_require_warning,
11197                doc: /* Internal use only.
11198 If non-nil, on writing a file, `select-safe-coding-system-function' is
11199 called even if `coding-system-for-write' is non-nil.  The command
11200 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11201   coding_system_require_warning = 0;
11202
11203
11204   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11205                inhibit_iso_escape_detection,
11206                doc: /*
11207 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11208
11209 When Emacs reads text, it tries to detect how the text is encoded.
11210 This code detection is sensitive to escape sequences.  If Emacs sees
11211 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11212 of the ISO2022 encodings, and decodes text by the corresponding coding
11213 system (e.g. `iso-2022-7bit').
11214
11215 However, there may be a case that you want to read escape sequences in
11216 a file as is.  In such a case, you can set this variable to non-nil.
11217 Then the code detection will ignore any escape sequences, and no text is
11218 detected as encoded in some ISO-2022 encoding.  The result is that all
11219 escape sequences become visible in a buffer.
11220
11221 The default value is nil, and it is strongly recommended not to change
11222 it.  That is because many Emacs Lisp source files that contain
11223 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11224 in Emacs's distribution, and they won't be decoded correctly on
11225 reading if you suppress escape sequence detection.
11226
11227 The other way to read escape sequences in a file without decoding is
11228 to explicitly specify some coding system that doesn't use ISO-2022
11229 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11230   inhibit_iso_escape_detection = 0;
11231
11232   DEFVAR_BOOL ("inhibit-null-byte-detection",
11233                inhibit_null_byte_detection,
11234                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11235 By default, Emacs treats it as binary data, and does not attempt to
11236 decode it.  The effect is as if you specified `no-conversion' for
11237 reading that text.
11238
11239 Set this to non-nil when a regular text happens to include null bytes.
11240 Examples are Index nodes of Info files and null-byte delimited output
11241 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11242 decode text as usual.  */);
11243   inhibit_null_byte_detection = 0;
11244
11245   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11246                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11247 Internal use only.  Removed after the experimental optimizer gets stable. */);
11248   disable_ascii_optimization = 0;
11249
11250   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11251                doc: /* Char table for translating self-inserting characters.
11252 This is applied to the result of input methods, not their input.
11253 See also `keyboard-translate-table'.
11254
11255 Use of this variable for character code unification was rendered
11256 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11257 internal character representation.  */);
11258     Vtranslation_table_for_input = Qnil;
11259
11260   {
11261     Lisp_Object args[coding_arg_undecided_max];
11262     Lisp_Object plist[16];
11263     int i;
11264
11265     for (i = 0; i < coding_arg_undecided_max; i++)
11266       args[i] = Qnil;
11267
11268     plist[0] = intern_c_string (":name");
11269     plist[1] = args[coding_arg_name] = Qno_conversion;
11270     plist[2] = intern_c_string (":mnemonic");
11271     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11272     plist[4] = intern_c_string (":coding-type");
11273     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11274     plist[6] = intern_c_string (":ascii-compatible-p");
11275     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11276     plist[8] = intern_c_string (":default-char");
11277     plist[9] = args[coding_arg_default_char] = make_number (0);
11278     plist[10] = intern_c_string (":for-unibyte");
11279     plist[11] = args[coding_arg_for_unibyte] = Qt;
11280     plist[12] = intern_c_string (":docstring");
11281     plist[13] = build_pure_c_string ("Do no conversion.\n\
11282 \n\
11283 When you visit a file with this coding, the file is read into a\n\
11284 unibyte buffer as is, thus each byte of a file is treated as a\n\
11285 character.");
11286     plist[14] = intern_c_string (":eol-type");
11287     plist[15] = args[coding_arg_eol_type] = Qunix;
11288     args[coding_arg_plist] = Flist (16, plist);
11289     Fdefine_coding_system_internal (coding_arg_max, args);
11290
11291     plist[1] = args[coding_arg_name] = Qundecided;
11292     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11293     plist[5] = args[coding_arg_coding_type] = Qundecided;
11294     /* This is already set.
11295        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11296     plist[8] = intern_c_string (":charset-list");
11297     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11298     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11299     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11300     plist[15] = args[coding_arg_eol_type] = Qnil;
11301     args[coding_arg_plist] = Flist (16, plist);
11302     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11303     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11304     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11305   }
11306
11307   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11308
11309   {
11310     int i;
11311
11312     for (i = 0; i < coding_category_max; i++)
11313       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11314   }
11315 #if defined (DOS_NT)
11316   system_eol_type = Qdos;
11317 #else
11318   system_eol_type = Qunix;
11319 #endif
11320   staticpro (&system_eol_type);
11321 }
11322
11323 char *
11324 emacs_strerror (int error_number)
11325 {
11326   char *str;
11327
11328   synchronize_system_messages_locale ();
11329   str = strerror (error_number);
11330
11331   if (! NILP (Vlocale_coding_system))
11332     {
11333       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11334                                                       Vlocale_coding_system,
11335                                                       0);
11336       str = SSDATA (dec);
11337     }
11338
11339   return str;
11340 }
11341
11342 #endif /* emacs */