src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2014 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from coding system symbol to attributes vector is done by looking up
   62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  254   DST_BYTES zero means that source area and destination area are
  255   overlapped, which means that we can produce encoded text until it
  256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
  304 Lisp_Object Vcoding_system_hash_table;
  305
      /* File-scope Lisp objects referenced by the coding-system code.
         NOTE(review): the Q-prefixed objects are presumably symbols that
         get their values during initialization elsewhere in this file --
         confirm.  Objects declared `static' are private to this file; the
         others have external linkage.  */
  306 static Lisp_Object Qcoding_system, Qeol_type;
  307 static Lisp_Object Qcoding_aliases;
  308 Lisp_Object Qunix, Qdos;
  309 static Lisp_Object Qmac;
  310 Lisp_Object Qbuffer_file_coding_system;
  311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  312 static Lisp_Object Qdefault_char;
  313 Lisp_Object Qno_conversion, Qundecided;
  314 Lisp_Object Qcharset, Qutf_8;
  315 static Lisp_Object Qiso_2022;
  316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
  317 static Lisp_Object Qbig, Qlittle;
  318 static Lisp_Object Qcoding_system_history;
  319 static Lisp_Object Qvalid_codes;
  320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
  321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
  322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
  323 static Lisp_Object QCascii_compatible_p;
  324
  325 Lisp_Object Qcall_process, Qcall_process_region;
  326 Lisp_Object Qstart_process, Qopen_network_stream;
  327 static Lisp_Object Qtarget_idx;
  328
  329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
  330
  331 /* If a symbol has this property, evaluate the value to define the
  332    symbol as a coding system.  */
  333 static Lisp_Object Qcoding_system_define_form;
  334
  335 /* Format of end-of-line decided by system.  This is Qunix on
  336    Unix and Mac, Qdos on DOS/Windows.
  337    This has an effect only for external encoding (i.e. for output to
  338    file and process), not for in-buffer or Lisp string encoding.  */
  339 static Lisp_Object system_eol_type;
 340
  341 #ifdef emacs
  342
      /* The declarations down to the matching #endif are compiled only
         when `emacs' is defined.  */
  343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  344
  345 /* Coding system emacs-mule and raw-text are for converting only
  346    end-of-line format.  */
  347 Lisp_Object Qemacs_mule, Qraw_text;
  348 Lisp_Object Qutf_8_emacs;
  349
  350 #if defined (WINDOWSNT) || defined (CYGWIN)
  351 static Lisp_Object Qutf_16le;
  352 #endif
  353
  354 /* Coding-systems are handed between Emacs Lisp programs and C internal
  355    routines by the following three variables.
         NOTE(review): only one variable, safe_terminal_coding, is declared
         below; the "three variables" wording looks stale -- confirm.  */
  356 /* Coding system to be used to encode text for terminal display when
  357    terminal coding system is nil.  */
  358 struct coding_system safe_terminal_coding;
  359
  360 #endif /* emacs */
  361
  362 Lisp_Object Qtranslation_table;
  363 Lisp_Object Qtranslation_table_id;
  364 static Lisp_Object Qtranslation_table_for_decode;
  365 static Lisp_Object Qtranslation_table_for_encode;
  366
  367 /* Two special coding systems.  */
  368 static Lisp_Object Vsjis_coding_system;
  369 static Lisp_Object Vbig5_coding_system;
 370
  371 /* ISO2022 section */
  372
      /* Element REG of the coding_attr_iso_initial attribute of CODING's
         attribute vector, converted to a C integer with XINT.  */
  373 #define CODING_ISO_INITIAL(coding, reg)                 \
  374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  375                      coding_attr_iso_initial),          \
  376                reg)))
  377
  378
      /* CODING->safe_charsets[CHARSET_ID], or -1 when CHARSET_ID is greater
         than CODING->max_charset_id or the stored entry is 255.  CHARSET_ID
         is evaluated more than once.  */
  379 #define CODING_ISO_REQUEST(coding, charset_id)          \
  380   (((charset_id) <= (coding)->max_charset_id            \
  381     ? ((coding)->safe_charsets[charset_id] != 255       \
  382        ? (coding)->safe_charsets[charset_id]            \
  383        : -1)                                            \
  384     : -1))
  385
  386
      /* Accessors for the members of CODING->spec.iso_2022.
         CODING_ISO_INVOKED_CHARSET combines two of them: it yields the
         designation of the register currently invoked on PLANE.
         CODING_ISO_CMP_STATUS yields the address of the cmp_status member
         rather than its value.  */
  387 #define CODING_ISO_FLAGS(coding)        \
  388   ((coding)->spec.iso_2022.flags)
  389 #define CODING_ISO_DESIGNATION(coding, reg)     \
  390   ((coding)->spec.iso_2022.current_designation[reg])
  391 #define CODING_ISO_INVOCATION(coding, plane)    \
  392   ((coding)->spec.iso_2022.current_invocation[plane])
  393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  394   ((coding)->spec.iso_2022.single_shifting)
  395 #define CODING_ISO_BOL(coding)  \
  396   ((coding)->spec.iso_2022.bol)
  397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
  399 #define CODING_ISO_CMP_STATUS(coding)   \
  400   (&(coding)->spec.iso_2022.cmp_status)
  401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
  403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
  406 /* Control characters of ISO2022.  */
  407                         /* code */      /* function */
  408 #define ISO_CODE_SO     0x0E            /* shift-out */
  409 #define ISO_CODE_SI     0x0F            /* shift-in */
  410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  411 #define ISO_CODE_ESC    0x1B            /* escape */
  412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
  415
  416 /* Every 1-byte code of ISO2022 is classified into one of the
  417    following classes.  */
  418 enum iso_code_class_type
  419   {
  420     ISO_control_0,              /* Control codes in the range
  421                                    0x00..0x1F and 0x7F, except for the
  422                                    following 4 codes.  */
  423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  427     ISO_control_1,              /* Control codes in the range
  428                                    0x80..0x9F, except for the
  429                                    following 3 codes.  */
  430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
                                      /* NOTE(review): 0x7F is named both
                                         here and under ISO_control_0 --
                                         confirm which class it gets.  */
  434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  437   };
 438
  439 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
  440     `iso-flags' attribute of an iso2022 coding system.  */
  441
  442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  443    instead of the correct short-form sequence (e.g. ESC $ A).  */
  444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  445
  446 /* If set, reset graphic planes and registers at end-of-line to the
  447    initial state.  */
  448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  449
  450 /* If set, reset graphic planes and registers before any control
  451    characters to the initial state.  */
  452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  453
  454 /* If set, encode by 7-bit environment.  */
  455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  456
  457 /* If set, use locking-shift function.  */
  458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  459
  460 /* If set, use single-shift function.  Overwrite
  461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  463
  464 /* If set, use designation escape sequence.  */
  465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  466
  467 /* If set, produce revision number sequence.  */
  468 #define CODING_ISO_FLAG_REVISION        0x0080
  469
  470 /* If set, produce ISO6429's direction specifying sequence.  */
  471 #define CODING_ISO_FLAG_DIRECTION       0x0100
  472
  473 /* If set, assume designation states are reset at beginning of line on
  474    output.  */
  475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  476
  477 /* If set, designation sequence should be placed at beginning of line
  478    on output.  */
  479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  480
  481 /* If set, do not encode unsafe characters on output.  */
  482 #define CODING_ISO_FLAG_SAFE            0x0800
  483
  484 /* If set, extra latin codes (128..159) are accepted as a valid code
  485    on input.  */
  486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  487
      /* NOTE(review): the remaining flags carry no description here; their
         meaning has to be taken from their users -- confirm before relying
         on the names alone.  Bit 0x4000 is reserved by the commented-out
         definition below, and bits 0x40000 and 0x80000 are not assigned in
         this list.  */
  488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  489
  490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
  491
  492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  493
  494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  495
  496 #define CODING_ISO_FLAG_LEVEL_4         0x20000
  497
  498 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  499
  500 /* A character to be produced on output if encoding of the original
  501    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  502 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 503
  504 /* UTF-8 section: accessor for the utf_8_bom member of CODING->spec.  */
  505 #define CODING_UTF_8_BOM(coding)        \
  506   ((coding)->spec.utf_8_bom)
  507
  508 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
  509 #define CODING_UTF_16_BOM(coding)       \
  510   ((coding)->spec.utf_16.bom)
  511
  512 #define CODING_UTF_16_ENDIAN(coding)    \
  513   ((coding)->spec.utf_16.endian)
  514
  515 #define CODING_UTF_16_SURROGATE(coding) \
  516   ((coding)->spec.utf_16.surrogate)
  517
  518
  519 /* CCL section: each macro fetches one attribute from the attribute
         vector of CODING (CODING_ID_ATTRS of CODING->id).
         CODING_CCL_VALIDS additionally applies SDATA to the attribute.  */
  520 #define CODING_CCL_DECODER(coding)      \
  521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
  522 #define CODING_CCL_ENCODER(coding)      \
  523   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
  524 #define CODING_CCL_VALIDS(coding)                                          \
  525   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 526
  527 /* Index for each coding category in `coding_categories'.  The
         enumerator values also serve as the bit positions of the
         CATEGORY_MASK_XXX flags, and coding_category_max gives the number
         of categories.  */
  528
  529 enum coding_category
  530   {
  531     coding_category_iso_7,
  532     coding_category_iso_7_tight,
  533     coding_category_iso_8_1,
  534     coding_category_iso_8_2,
  535     coding_category_iso_7_else,
  536     coding_category_iso_8_else,
  537     coding_category_utf_8_auto,
  538     coding_category_utf_8_nosig,
  539     coding_category_utf_8_sig,
  540     coding_category_utf_16_auto,
  541     coding_category_utf_16_be,
  542     coding_category_utf_16_le,
  543     coding_category_utf_16_be_nosig,
  544     coding_category_utf_16_le_nosig,
  545     coding_category_charset,
  546     coding_category_sjis,
  547     coding_category_big5,
  548     coding_category_ccl,
  549     coding_category_emacs_mule,
  550     /* All above are targets of code detection.  */
  551     coding_category_raw_text,
  552     coding_category_undecided,
  553     coding_category_max
  554   };
 555
  556 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
         1 shifted left by the matching enum coding_category value.  */
  557 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  558 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  559 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  560 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  561 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  562 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  563 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
  564 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
  565 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
  566 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  567 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  568 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  569 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  570 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  571 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  572 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  573 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  574 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  575 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  576 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
  577
  578 /* This value is returned if detect_coding_mask () finds nothing other
  579    than ASCII characters.  It is the union of the masks of all the
         categories that are targets of detection, so
         CATEGORY_MASK_RAW_TEXT is not part of it.  */
  580 #define CATEGORY_MASK_ANY               \
  581   (CATEGORY_MASK_ISO_7                  \
  582    | CATEGORY_MASK_ISO_7_TIGHT          \
  583    | CATEGORY_MASK_ISO_8_1              \
  584    | CATEGORY_MASK_ISO_8_2              \
  585    | CATEGORY_MASK_ISO_7_ELSE           \
  586    | CATEGORY_MASK_ISO_8_ELSE           \
  587    | CATEGORY_MASK_UTF_8_AUTO           \
  588    | CATEGORY_MASK_UTF_8_NOSIG          \
  589    | CATEGORY_MASK_UTF_8_SIG            \
  590    | CATEGORY_MASK_UTF_16_AUTO          \
  591    | CATEGORY_MASK_UTF_16_BE            \
  592    | CATEGORY_MASK_UTF_16_LE            \
  593    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  594    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  595    | CATEGORY_MASK_CHARSET              \
  596    | CATEGORY_MASK_SJIS                 \
  597    | CATEGORY_MASK_BIG5                 \
  598    | CATEGORY_MASK_CCL                  \
  599    | CATEGORY_MASK_EMACS_MULE)
  600
  601
      /* Unions of related category masks.  */
  602 #define CATEGORY_MASK_ISO_7BIT \
  603   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  604
  605 #define CATEGORY_MASK_ISO_8BIT \
  606   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  607
  608 #define CATEGORY_MASK_ISO_ELSE \
  609   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  610
  611 #define CATEGORY_MASK_ISO_ESCAPE        \
  612   (CATEGORY_MASK_ISO_7                  \
  613    | CATEGORY_MASK_ISO_7_TIGHT          \
  614    | CATEGORY_MASK_ISO_7_ELSE           \
  615    | CATEGORY_MASK_ISO_8_ELSE)
  616
  617 #define CATEGORY_MASK_ISO       \
  618   (  CATEGORY_MASK_ISO_7BIT     \
  619      | CATEGORY_MASK_ISO_8BIT   \
  620      | CATEGORY_MASK_ISO_ELSE)
  621
  622 #define CATEGORY_MASK_UTF_16            \
  623   (CATEGORY_MASK_UTF_16_AUTO            \
  624    | CATEGORY_MASK_UTF_16_BE            \
  625    | CATEGORY_MASK_UTF_16_LE            \
  626    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  627    | CATEGORY_MASK_UTF_16_LE_NOSIG)
  628
  629 #define CATEGORY_MASK_UTF_8     \
  630   (CATEGORY_MASK_UTF_8_AUTO     \
  631    | CATEGORY_MASK_UTF_8_NOSIG  \
  632    | CATEGORY_MASK_UTF_8_SIG)
 633
  634 /* Table of coding categories (Lisp symbols).  This variable is for
  635    internal use only.  */
  636 static Lisp_Object Vcoding_category_table;
  637
  638 /* Table of coding-categories ordered by priority.  It has
         coding_category_max elements.  */
  639 static enum coding_category coding_priorities[coding_category_max];
  640
  641 /* Nth element is a coding context for the coding system bound to the
  642    Nth coding category.  The array has coding_category_max elements,
         so it also has slots for coding_category_raw_text and
         coding_category_undecided.  */
  643 static struct coding_system coding_categories[coding_category_max];
 644
 645 /*** Commonly used macros and functions ***/
 646
 647 #ifndef min
 648 #define min(a, b) ((a) < (b) ? (a) : (b))
 649 #endif
 650 #ifndef max
 651 #define max(a, b) ((a) > (b) ? (a) : (b))
 652 #endif
 653
 654 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 655
 656 static int
 657 encode_inhibit_flag (Lisp_Object flag)
 658 {
 659   return NILP (flag) ? -1 : EQ (flag, Qt);
 660 }
 661
 662 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 663    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 664
 665 static bool
 666 inhibit_flag (int encoded_flag, bool var)
 667 {
 668   return 0 < encoded_flag + var;
 669 }
 670
 671 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 672   do {                                                  \
 673     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 674     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 675   } while (0)
 676
 677 static void
 678 CHECK_NATNUM_CAR (Lisp_Object x)
 679 {
 680   Lisp_Object tmp = XCAR (x);
 681   CHECK_NATNUM (tmp);
 682   XSETCAR (x, tmp);
 683 }
 684
 685 static void
 686 CHECK_NATNUM_CDR (Lisp_Object x)
 687 {
 688   Lisp_Object tmp = XCDR (x);
 689   CHECK_NATNUM (tmp);
 690   XSETCDR (x, tmp);
 691 }
 692
 693
  694 /* Safely get one byte from the source text pointed by SRC which ends
  695    at SRC_END, and set C to that byte.  If there are not enough bytes
  696    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
  697    and a multibyte character is found at SRC, set C to the
  698    negative value of the character code.  The caller should declare
  699    and set these variables appropriately in advance:
  700         src, src_end, src_base, multibytep, coding, consumed_chars

         When the source is exhausted and SRC_BASE is before SRC,
         CODING_RESULT_INSUFFICIENT_SRC is recorded on CODING before the
         jump.  In the multibyte case a lead byte of 0xC0 or 0xC1 is
         merged with the byte that follows it into one byte value; any
         other non-ASCII lead byte is decoded with string_char, C receives
         the negated character, and CODING_RESULT_INVALID_SRC is recorded.
         CONSUMED_CHARS is incremented for every value delivered in C.

         NOTE(review): the byte following a 0xC0/0xC1 lead byte is read
         without comparing SRC against SRC_END; presumably multibyte
         source text never ends in the middle of such a pair -- confirm.  */
  701
  702 #define ONE_MORE_BYTE(c)                                \
  703   do {                                                  \
  704     if (src == src_end)                                 \
  705       {                                                 \
  706         if (src_base < src)                             \
  707           record_conversion_result                      \
  708             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  709         goto no_more_source;                            \
  710       }                                                 \
  711     c = *src++;                                         \
  712     if (multibytep && (c & 0x80))                       \
  713       {                                                 \
  714         if ((c & 0xFE) == 0xC0)                         \
  715           c = ((c & 1) << 6) | *src++;                  \
  716         else                                            \
  717           {                                             \
  718             src--;                                      \
  719             c = - string_char (src, &src, NULL);        \
  720             record_conversion_result                    \
  721               (coding, CODING_RESULT_INVALID_SRC);      \
  722           }                                             \
  723       }                                                 \
  724     consumed_chars++;                                   \
  725   } while (0)
 726
  727 /* Safely get two bytes from the source text pointed by SRC which ends
  728    at SRC_END, and set C1 and C2 to those bytes while skipping the
  729    heading multibyte characters.  If there are not enough bytes in the
  730    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
  731    a multibyte character is found for C2, set C2 to -1 (the remaining
  732    bytes of that character are not skipped).  The caller should declare and set these
  733    variables appropriately in advance:
  734         src, src_end, multibytep
  735    It is intended that this macro is used in detect_coding_utf_16.
         Unlike ONE_MORE_BYTE, this macro neither counts consumed
         characters nor records a conversion result.  */
  736
  737 #define TWO_MORE_BYTES(c1, c2)                          \
  738   do {                                                  \
  739     do {                                                \
  740       if (src == src_end)                               \
  741         goto no_more_source;                            \
  742       c1 = *src++;                                      \
  743       if (multibytep && (c1 & 0x80))                    \
  744         {                                               \
  745           if ((c1 & 0xFE) == 0xC0)                      \
  746             c1 = ((c1 & 1) << 6) | *src++;              \
  747           else                                          \
  748             {                                           \
  749               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  750               c1 = -1;                                  \
  751             }                                           \
  752         }                                               \
  753     } while (c1 < 0);                                   \
  754     if (src == src_end)                                 \
  755       goto no_more_source;                              \
  756     c2 = *src++;                                        \
  757     if (multibytep && (c2 & 0x80))                      \
  758       {                                                 \
  759         if ((c2 & 0xFE) == 0xC0)                        \
  760           c2 = ((c2 & 1) << 6) | *src++;                \
  761         else                                            \
  762           c2 = -1;                                      \
  763       }                                                 \
  764   } while (0)
 765
 766
 767 /* Store a byte C in the place pointed by DST and increment DST to the
 768    next free point, and increment PRODUCED_CHARS.  The caller should
 769    assure that C is 0..127, and declare and set the variable `dst'
 770    appropriately in advance.
 771 */
 772
 773
 774 #define EMIT_ONE_ASCII_BYTE(c)  \
 775   do {                          \
 776     produced_chars++;           \
 777     *dst++ = (c);               \
 778   } while (0)
 779
 780
 781 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 782
 783 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 784   do {                                  \
 785     produced_chars += 2;                \
 786     *dst++ = (c1), *dst++ = (c2);       \
 787   } while (0)
 788
 789
 790 /* Store a byte C in the place pointed by DST and increment DST to the
 791    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 792    store in an appropriate multibyte form.  The caller should
 793    declare and set the variables `dst' and `multibytep' appropriately
 794    in advance.  */
 795
 796 #define EMIT_ONE_BYTE(c)                \
 797   do {                                  \
 798     produced_chars++;                   \
 799     if (multibytep)                     \
 800       {                                 \
 801         unsigned ch = (c);              \
 802         if (ch >= 0x80)                 \
 803           ch = BYTE8_TO_CHAR (ch);      \
 804         CHAR_STRING_ADVANCE (ch, dst);  \
 805       }                                 \
 806     else                                \
 807       *dst++ = (c);                     \
 808   } while (0)
 809
 810
 811 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 812
 813 #define EMIT_TWO_BYTES(c1, c2)          \
 814   do {                                  \
 815     produced_chars += 2;                \
 816     if (multibytep)                     \
 817       {                                 \
 818         unsigned ch;                    \
 819                                         \
 820         ch = (c1);                      \
 821         if (ch >= 0x80)                 \
 822           ch = BYTE8_TO_CHAR (ch);      \
 823         CHAR_STRING_ADVANCE (ch, dst);  \
 824         ch = (c2);                      \
 825         if (ch >= 0x80)                 \
 826           ch = BYTE8_TO_CHAR (ch);      \
 827         CHAR_STRING_ADVANCE (ch, dst);  \
 828       }                                 \
 829     else                                \
 830       {                                 \
 831         *dst++ = (c1);                  \
 832         *dst++ = (c2);                  \
 833       }                                 \
 834   } while (0)
 835
 836
 837 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 838   do {                                  \
 839     EMIT_ONE_BYTE (c1);                 \
 840     EMIT_TWO_BYTES (c2, c3);            \
 841   } while (0)
 842
 843
 844 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 845   do {                                          \
 846     EMIT_TWO_BYTES (c1, c2);                    \
 847     EMIT_TWO_BYTES (c3, c4);                    \
 848   } while (0)
 849
 850
 851 static void
 852 record_conversion_result (struct coding_system *coding,
 853                           enum coding_result_code result)
 854 {
 855   coding->result = result;
 856   switch (result)
 857     {
 858     case CODING_RESULT_INSUFFICIENT_SRC:
 859       Vlast_code_conversion_error = Qinsufficient_source;
 860       break;
 861     case CODING_RESULT_INVALID_SRC:
 862       Vlast_code_conversion_error = Qinvalid_source;
 863       break;
 864     case CODING_RESULT_INTERRUPT:
 865       Vlast_code_conversion_error = Qinterrupted;
 866       break;
 867     case CODING_RESULT_INSUFFICIENT_DST:
 868       /* Don't record this error in Vlast_code_conversion_error
 869          because it happens just temporarily and is resolved when the
 870          whole conversion is finished.  */
 871       break;
 872     case CODING_RESULT_SUCCESS:
 873       break;
 874     default:
 875       Vlast_code_conversion_error = intern ("Unknown error");
 876     }
 877 }
 878
/* These wrapper macros are used to preserve validity of pointers into
   buffer text across calls to decode_char, encode_char, etc, which
   could cause relocation of buffers if it loads a charset map,
   because loading a charset map allocates large structures.  */

/* Set C to DECODE_CHAR (CHARSET, CODE).  If charset_map_loaded became
   nonzero during that call and coding_change_source reports that the
   source of CODING moved, shift SRC, SRC_BASE and SRC_END by the same
   number of bytes.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    ptrdiff_t offset;                                                        \
                                                                             \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded                                                   \
        && (offset = coding_change_source (coding)))                         \
      {                                                                      \
        src += offset;                                                       \
        src_base += offset;                                                  \
        src_end += offset;                                                   \
      }                                                                      \
  } while (0)
 898
/* Set CODE to ENCODE_CHAR (CHARSET, C).  If charset_map_loaded became
   nonzero during that call and coding_change_destination reports that
   the destination of CODING moved, shift DST and DST_END by the same
   number of bytes.  */

#define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
  do {                                                                  \
    ptrdiff_t offset;                                                   \
                                                                        \
    charset_map_loaded = 0;                                             \
    code = ENCODE_CHAR (charset, c);                                    \
    if (charset_map_loaded                                              \
        && (offset = coding_change_destination (coding)))               \
      {                                                                 \
        dst += offset;                                                  \
        dst_end += offset;                                              \
      }                                                                 \
  } while (0)
 912
/* Set CHARSET to char_charset (C, CHARSET_LIST, CODE_RETURN).  If
   charset_map_loaded became nonzero during that call and
   coding_change_destination reports that the destination of CODING
   moved, shift DST and DST_END by the same number of bytes.  */

#define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
  do {                                                                  \
    ptrdiff_t offset;                                                   \
                                                                        \
    charset_map_loaded = 0;                                             \
    charset = char_charset (c, charset_list, code_return);              \
    if (charset_map_loaded                                              \
        && (offset = coding_change_destination (coding)))               \
      {                                                                 \
        dst += offset;                                                  \
        dst_end += offset;                                              \
      }                                                                 \
  } while (0)
 926
/* Set RESULT to CHAR_CHARSET_P (C, CHARSET).  If charset_map_loaded
   became nonzero during that call and coding_change_destination
   reports that the destination of CODING moved, shift DST and DST_END
   by the same number of bytes.  */

#define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
  do {                                                                  \
    ptrdiff_t offset;                                                   \
                                                                        \
    charset_map_loaded = 0;                                             \
    result = CHAR_CHARSET_P (c, charset);                               \
    if (charset_map_loaded                                              \
        && (offset = coding_change_destination (coding)))               \
      {                                                                 \
        dst += offset;                                                  \
        dst_end += offset;                                              \
      }                                                                 \
  } while (0)
 940
 941
/* Unless more than BYTES bytes of room remain at dst (that is, when
   dst + BYTES >= dst_end), allocate more memory for
   coding->destination and update dst and dst_end.  The amount
   requested is BYTES plus the number of elements still unread in
   charbuf.  We don't have to take care of coding->source which will
   be relocated.  It is handled by calling coding_set_source in
   encode_coding.

   The caller provides `coding', `dst', `dst_end', `charbuf' and
   `charbuf_end'.  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst + (bytes) >= dst_end)                               \
      {                                                         \
        ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
 957
 958
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
   without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
   MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE, so this is now a plain
   alias for CHAR_STRING_ADVANCE.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 965
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This used to be
   like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
   nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR, so this
   is now a plain alias for STRING_CHAR_ADVANCE.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 972
 973 /* Set coding->source from coding->src_object.  */
 974
 975 static void
 976 coding_set_source (struct coding_system *coding)
 977 {
 978   if (BUFFERP (coding->src_object))
 979     {
 980       struct buffer *buf = XBUFFER (coding->src_object);
 981
 982       if (coding->src_pos < 0)
 983         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 984       else
 985         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 986     }
 987   else if (STRINGP (coding->src_object))
 988     {
 989       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 990     }
 991   else
 992     {
 993       /* Otherwise, the source is C string and is never relocated
 994          automatically.  Thus we don't have to update anything.  */
 995     }
 996 }
 997
 998
 999 /* Set coding->source from coding->src_object, and return how many
1000    bytes coding->source was changed.  */
1001
1002 static ptrdiff_t
1003 coding_change_source (struct coding_system *coding)
1004 {
1005   const unsigned char *orig = coding->source;
1006   coding_set_source (coding);
1007   return coding->source - orig;
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object.  */
1012
1013 static void
1014 coding_set_destination (struct coding_system *coding)
1015 {
1016   if (BUFFERP (coding->dst_object))
1017     {
1018       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1019         {
1020           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1021           coding->dst_bytes = (GAP_END_ADDR
1022                                - (coding->src_bytes - coding->consumed)
1023                                - coding->destination);
1024         }
1025       else
1026         {
1027           /* We are sure that coding->dst_pos_byte is before the gap
1028              of the buffer. */
1029           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1030                                  + coding->dst_pos_byte - BEG_BYTE);
1031           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1032                                - coding->destination);
1033         }
1034     }
1035   else
1036     {
1037       /* Otherwise, the destination is C string and is never relocated
1038          automatically.  Thus we don't have to update anything.  */
1039     }
1040 }
1041
1042
1043 /* Set coding->destination from coding->dst_object, and return how
1044    many bytes coding->destination was changed.  */
1045
1046 static ptrdiff_t
1047 coding_change_destination (struct coding_system *coding)
1048 {
1049   const unsigned char *orig = coding->destination;
1050   coding_set_destination (coding);
1051   return coding->destination - orig;
1052 }
1053
1054
1055 static void
1056 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1057 {
1058   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1059     string_overflow ();
1060   coding->destination = xrealloc (coding->destination,
1061                                   coding->dst_bytes + bytes);
1062   coding->dst_bytes += bytes;
1063 }
1064
/* Enlarge the gap of the destination buffer of CODING by BYTES bytes.
   GAP_HEAD_USED is the number of bytes at the head of the gap that
   are already in use; it matters only when the source and destination
   objects are the same.  */

static void
coding_alloc_by_making_gap (struct coding_system *coding,
                            ptrdiff_t gap_head_used, ptrdiff_t bytes)
{
  if (EQ (coding->src_object, coding->dst_object))
    {
      /* The gap may contain the produced data at the head and not-yet
         consumed data at the tail.  To preserve those data, we at
         first make the gap size to zero, then increase the gap
         size.  */
      ptrdiff_t add = GAP_SIZE;

      /* Move the gap start past the produced data, and account for
         the old gap as if it were text, before calling make_gap.  */
      GPT += gap_head_used, GPT_BYTE += gap_head_used;
      GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
      make_gap (bytes);
      /* Undo the temporary adjustments; the gap is now BYTES larger.  */
      GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
      GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
    }
  else
    make_gap_1 (XBUFFER (coding->dst_object), bytes);
}
1086
1087
1088 static unsigned char *
1089 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1090                    unsigned char *dst)
1091 {
1092   ptrdiff_t offset = dst - coding->destination;
1093
1094   if (BUFFERP (coding->dst_object))
1095     {
1096       struct buffer *buf = XBUFFER (coding->dst_object);
1097
1098       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1099     }
1100   else
1101     coding_alloc_by_realloc (coding, nbytes);
1102   coding_set_destination (coding);
1103   dst = coding->destination + offset;
1104   return dst;
1105 }
1106
1107 /** Macros for annotations.  */
1108
1109 /* An annotation data is stored in the array coding->charbuf in this
1110    format:
1111      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1112    LENGTH is the number of elements in the annotation.
1113    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1114    NCHARS is the number of characters in the text annotated.
1115
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1120      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1121
1122    NBYTES is the number of bytes specified in the header part of
1123    old-style emacs-mule encoding, or 0 for the other kind of
1124    composition.
1125
1126    METHOD is one of enum composition_method.
1127
1128    Optional COMPOSITION-COMPONENTS are characters and composition
1129    rules.
1130
1131    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1132    follows.
1133
1134    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1135    recover from an invalid annotation, and should be skipped by
1136    produce_annotation.  */
1137
/* Maximum length of the header of annotation data.  The longest
   header is the one written by ADD_COMPOSITION_DATA, which passes 5
   as the length.  */
#define MAX_ANNOTATION_LENGTH 5
1140
/* Append the three common header elements of an annotation to BUF,
   advancing BUF past them: -LEN, MASK and NCHARS.  Also mark the
   caller's `coding' as annotated.  BUF must be a modifiable pointer
   lvalue; it is evaluated three times.

   The expansion deliberately does not end with a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is a single statement and can be used
   as the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1148
/* Append the header of a composition annotation to BUF, advancing
   BUF past it: the common header (length 5, mask
   CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by NBYTES and
   METHOD.  BUF must be a modifiable pointer lvalue.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1155
1156
/* Append a charset annotation to BUF, advancing BUF past it: the
   common header (length 4, mask CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  BUF must be a modifiable pointer
   lvalue.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1162
1163
/* Bitmasks for coding->eol_seen.  Each end-of-line format has its own
   bit, so several of them can be recorded at once.  */

#define EOL_SEEN_NONE   0       /* No end-of-line sequence seen.  */
#define EOL_SEEN_LF     1       /* A lone LF was seen.  */
#define EOL_SEEN_CR     2       /* A lone CR was seen.  */
#define EOL_SEEN_CRLF   4       /* A CR LF pair was seen.  */
1170
1171 \f
1172 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1173
1174
1175
1176 \f
1177 /*** 3. UTF-8 ***/
1178
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in UTF-8.  */

/* Predicates classifying a byte C by its high bits.  */

/* C is below 0x80, i.e. a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the form 10xxxxxx, i.e. a trailing octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the form 110xxxxx: leading octet of a 2-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the form 1110xxxx: leading octet of a 3-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the form 11110xxx: leading octet of a 4-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the form 111110xx: leading octet of a 5-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 signature (BOM), in order.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1192
/* Check whether the source of CODING can be UTF-8 and record the
   verdict in DETECT_INFO.  Return false, after marking
   CATEGORY_MASK_UTF_8 as rejected, when a malformed sequence is found
   or when the last block ends in the middle of a sequence.  Otherwise
   return true once the source is exhausted.

   Unlike the other detect_coding_XXX, this function counts number of
   characters and check EOL format.  On success the number of bytes
   and characters scanned are stored in CODING->detected_utf8_bytes
   and CODING->detected_utf8_chars.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  ptrdiff_t nchars = coding->head_ascii;
  /* NOTE(review): eol_seen is updated below but is not stored back
     into CODING within this function -- confirm that is intended.  */
  int eol_seen = coding->eol_seen;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += nchars;

  /* Note that the test below requires at least one byte to follow
     the three BOM bytes.  */
  if (src == coding->source     /* BOM should be at the head.  */
      && src + 3 < src_end      /* BOM is 3-byte long.  */
      && src[0] == UTF_8_BOM_1
      && src[1] == UTF_8_BOM_2
      && src[2] == UTF_8_BOM_3)
    {
      bom_found = 1;
      src += 3;
      nchars++;
    }

  /* Scan one character per iteration.  Leaving the loop with `break'
     means a malformed sequence; running out of source jumps to
     no_more_source from inside ONE_MORE_BYTE.  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        {
          nchars++;
          if (c == '\r')
            {
              /* Count CR LF as two characters but one EOL.  */
              if (src < src_end && *src == '\n')
                {
                  eol_seen |= EOL_SEEN_CRLF;
                  src++;
                  nchars++;
                }
              else
                eol_seen |= EOL_SEEN_CR;
            }
          else if (c == '\n')
            eol_seen |= EOL_SEEN_LF;
          continue;
        }
      /* C is not a one-octet sequence: each further octet must be a
         trailing octet, and C must be the matching leading octet.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      break;
    }
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* src_base < src means the source ended inside a sequence; that is
     an error only if no further block will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF doesn't necessarily mean a BOM, so the
         categories without signature remain candidates too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (nchars < src_end - coding->source)
        /* The found characters are fewer than the source bytes, which
           means that we found a valid non-ASCII character.  */
        detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
    }
  coding->detected_utf8_bytes = src_base - coding->source;
  coding->detected_utf8_chars = nchars;
  return 1;
}
1307
1308
/* Decode the UTF-8 bytes of the source of CODING, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf.
   Decoding stops when the source is exhausted or charbuf is full;
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are then updated.  A byte that does not start a valid sequence is
   stored by itself (as BYTE8_TO_CHAR of the byte unless it is ASCII)
   and counted in CODING->errors.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* When decoding DOS-style EOLs, the byte read ahead after a CR, or
     -1 if there is none pending.  */
  int byte_after_cr = -1;

  /* Unless the coding system is known to have no BOM, look for the
     signature EF BB BF at the current position.  It is consumed if
     present; otherwise SRC is rewound to where it was.  */
  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whether or not a BOM was found, don't look for one again.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Remember where this character starts, so that we can rewind
         to it on an invalid sequence or on running out of source.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The byte read ahead after a CR has not been decoded yet;
             leave it unconsumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* In the simple case, rapidly handle ordinary characters */
      if (multibytep && ! eol_dos
          && charbuf < charbuf_end - 6 && src < src_end - 6)
        {
          /* Copy up to four ASCII bytes per iteration, leaving the
             loop at the first byte with the high bit set.  */
          while (charbuf < charbuf_end - 6 && src < src_end - 6)
            {
              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;
            }
          /* If we handled at least one character, restart the main loop.  */
          if (src != src_base)
            continue;
        }

      /* NOTE(review): when c1 is taken from byte_after_cr, src_base
         already points past that byte, so the rewinds at invalid_code
         and no_more_source do not cover it -- confirm this case is
         handled as intended.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value is stored with its sign flipped.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* Read the byte following a CR now and decode it in the
             next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Reject codes beyond MAX_CHAR as well as
                             overlong forms.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Rewind to the start of the bad sequence and store just its
         first byte; decoding resumes with the byte after it.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report only what was consumed up to the start of the last,
     possibly incomplete, character.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1497
1498
1499 static bool
1500 encode_coding_utf_8 (struct coding_system *coding)
1501 {
1502   bool multibytep = coding->dst_multibyte;
1503   int *charbuf = coding->charbuf;
1504   int *charbuf_end = charbuf + coding->charbuf_used;
1505   unsigned char *dst = coding->destination + coding->produced;
1506   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1507   ptrdiff_t produced_chars = 0;
1508   int c;
1509
1510   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1511     {
1512       ASSURE_DESTINATION (3);
1513       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1514       CODING_UTF_8_BOM (coding) = utf_without_bom;
1515     }
1516
1517   if (multibytep)
1518     {
1519       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1520
1521       while (charbuf < charbuf_end)
1522         {
1523           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1524
1525           ASSURE_DESTINATION (safe_room);
1526           c = *charbuf++;
1527           if (CHAR_BYTE8_P (c))
1528             {
1529               c = CHAR_TO_BYTE8 (c);
1530               EMIT_ONE_BYTE (c);
1531             }
1532           else
1533             {
1534               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1535               for (p = str; p < pend; p++)
1536                 EMIT_ONE_BYTE (*p);
1537             }
1538         }
1539     }
1540   else
1541     {
1542       int safe_room = MAX_MULTIBYTE_LENGTH;
1543
1544       while (charbuf < charbuf_end)
1545         {
1546           ASSURE_DESTINATION (safe_room);
1547           c = *charbuf++;
1548           if (CHAR_BYTE8_P (c))
1549             *dst++ = CHAR_TO_BYTE8 (c);
1550           else
1551             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1552         }
1553       produced_chars = dst - (coding->destination + coding->produced);
1554     }
1555   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1556   coding->produced_char += produced_chars;
1557   coding->produced = dst - coding->destination;
1558   return 0;
1559 }
1560
1561
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in one of UTF-16 based coding systems.  */

/* True if the low 16 bits of VAL are in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the low 16 bits of VAL are in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1570
1571
/* Check whether the source of CODING can be one of the UTF-16
   variants and record the verdict in DETECT_INFO.  Return true if the
   source ran out while fetching a pair of bytes, or right after a BOM
   was recognized; return false otherwise.  */

static bool
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* A final block with an odd src_chars count is rejected outright.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Bytes FF FE: a little-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Bytes FE FF: a big-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      /* TWO_MORE_BYTES could not deliver a byte for C2.  */
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
         O is odd.  If both are high, we assume binary data.*/
      /* e[B] and o[B] are nonzero once byte value B has been seen as
         the first or the second byte of a pair, respectively; e_num
         and o_num count the distinct values seen so far.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* No BOM was found, so only the variants without signature
         remain candidates.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Scan until every UTF-16 category has been rejected.  Running
         out of source jumps to no_more_source instead.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              /* 128 or more distinct first bytes: not big-endian.  */
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              /* 128 or more distinct second bytes: not little-endian.  */
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  return 1;
}
1654
/* Decode the UTF-16 source text of CODING, starting at
   coding->consumed, and append the resulting character codes to
   coding->charbuf.

   A byte-order mark is checked for only when the coding system says
   one is present.  A pending high surrogate is kept in
   CODING_UTF_16_SURROGATE (coding), so it is read back at the start
   of the next call.  When DOS end-of-line conversion is in effect,
   the two bytes following a CR are fetched ahead of time and handed
   to the next loop iteration.

   On exit, coding->consumed_char, coding->consumed and
   coding->charbuf_used are updated; only input up to the start of
   the last completely processed unit (SRC_BASE) counts as
   consumed.  */

static void
decode_coding_utf_16 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce at most 3 chars in one loop iteration.  */
  int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
  enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
  /* Nonzero while a high surrogate awaits its low partner.  */
  int surrogate = CODING_UTF_16_SURROGATE (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The two bytes read ahead after a CR, or -1 when none is pending.  */
  int byte_after_cr1 = -1, byte_after_cr2 = -1;

  if (bom == utf_with_bom)
    {
      int c, c1, c2;

      src_base = src;
      ONE_MORE_BYTE (c1);
      ONE_MORE_BYTE (c2);
      /* The two bytes are always combined first-byte-high here; the
         expected value below is chosen according to ENDIAN.  */
      c = (c1 << 8) | c2;

      if (endian == utf_16_big_endian
          ? c != 0xFEFF : c != 0xFFFE)
        {
          /* The first two bytes are not BOM.  Treat them as bytes
             for a normal character.  */
          src = src_base;
          coding->errors++;
        }
      /* Whether or not a BOM was found, do not look for one again.  */
      CODING_UTF_16_BOM (coding) = utf_without_bom;
    }
  else if (bom == utf_detect_bom)
    {
      /* We have already tried to detect BOM and failed in
         detect_coding.  */
      CODING_UTF_16_BOM (coding) = utf_without_bom;
    }

  while (1)
    {
      int c, c1, c2;

      /* Remember where this unit starts so that the bookkeeping at
         `no_more_source' can exclude a partially read unit.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* Output buffer is full.  If two bytes were read ahead
             after a CR, give them back.  */
          if (byte_after_cr1 >= 0)
            src_base -= 2;
          break;
        }

      /* Take the first byte of the unit, preferring the read-ahead
         byte if there is one.  */
      if (byte_after_cr1 >= 0)
        c1 = byte_after_cr1, byte_after_cr1 = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* ONE_MORE_BYTE produced a negative value (presumably a
             negated multibyte character code -- see that macro);
             store its negation as is.  */
          *charbuf++ = -c1;
          continue;
        }
      /* Likewise for the second byte of the unit.  */
      if (byte_after_cr2 >= 0)
        c2 = byte_after_cr2, byte_after_cr2 = -1;
      else
        ONE_MORE_BYTE (c2);
      if (c2 < 0)
        {
          /* The unit is broken: emit the first byte on its own (as a
             raw 8-bit character unless it is ASCII), then the negated
             second value.  */
          *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
          *charbuf++ = -c2;
          continue;
        }
      /* Assemble the 16-bit code unit in the configured byte order.  */
      c = (endian == utf_16_big_endian
           ? ((c1 << 8) | c2) : ((c2 << 8) | c1));

      if (surrogate)
        {
          if (! UTF_16_LOW_SURROGATE_P (c))
            {
              /* The pending high surrogate is not followed by a low
                 one.  Emit its two bytes in source order and count
                 an error.  */
              if (endian == utf_16_big_endian)
                c1 = surrogate >> 8, c2 = surrogate & 0xFF;
              else
                c1 = surrogate & 0xFF, c2 = surrogate >> 8;
              *charbuf++ = c1;
              *charbuf++ = c2;
              coding->errors++;
              /* The current unit either starts a new pair or is
                 emitted unchanged.  Note that SURROGATE is not
                 cleared in the latter case.  */
              if (UTF_16_HIGH_SURROGATE_P (c))
                CODING_UTF_16_SURROGATE (coding) = surrogate = c;
              else
                *charbuf++ = c;
            }
          else
            {
              /* Valid pair: combine the two 10-bit halves into a
                 code point at or above 0x10000.  */
              c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
              CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
              *charbuf++ = 0x10000 + c;
            }
        }
      else
        {
          if (UTF_16_HIGH_SURROGATE_P (c))
            /* Hold the high surrogate until the next unit arrives.  */
            CODING_UTF_16_SURROGATE (coding) = surrogate = c;
          else
            {
              if (eol_dos && c == '\r')
                {
                  /* Fetch the unit after the CR before storing the
                     CR itself.  */
                  ONE_MORE_BYTE (byte_after_cr1);
                  ONE_MORE_BYTE (byte_after_cr2);
                }
              *charbuf++ = c;
            }
        }
    }

 no_more_source:
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1779
1780 static bool
1781 encode_coding_utf_16 (struct coding_system *coding)
1782 {
1783   bool multibytep = coding->dst_multibyte;
1784   int *charbuf = coding->charbuf;
1785   int *charbuf_end = charbuf + coding->charbuf_used;
1786   unsigned char *dst = coding->destination + coding->produced;
1787   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1788   int safe_room = 8;
1789   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1790   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1791   ptrdiff_t produced_chars = 0;
1792   int c;
1793
1794   if (bom != utf_without_bom)
1795     {
1796       ASSURE_DESTINATION (safe_room);
1797       if (big_endian)
1798         EMIT_TWO_BYTES (0xFE, 0xFF);
1799       else
1800         EMIT_TWO_BYTES (0xFF, 0xFE);
1801       CODING_UTF_16_BOM (coding) = utf_without_bom;
1802     }
1803
1804   while (charbuf < charbuf_end)
1805     {
1806       ASSURE_DESTINATION (safe_room);
1807       c = *charbuf++;
1808       if (c > MAX_UNICODE_CHAR)
1809         c = coding->default_char;
1810
1811       if (c < 0x10000)
1812         {
1813           if (big_endian)
1814             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1815           else
1816             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1817         }
1818       else
1819         {
1820           int c1, c2;
1821
1822           c -= 0x10000;
1823           c1 = (c >> 10) + 0xD800;
1824           c2 = (c & 0x3FF) + 0xDC00;
1825           if (big_endian)
1826             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1827           else
1828             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1829         }
1830     }
1831   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1832   coding->produced = dst - coding->destination;
1833   coding->produced_char += produced_chars;
1834   return 0;
1835 }
1836
1837 \f
1838 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1839
1840 /* Emacs' internal format for representation of multiple character
1841    sets is a kind of multi-byte encoding, i.e. characters are
1842    represented by variable-length sequences of one-byte codes.
1843
1844    ASCII characters and control characters (e.g. `tab', `newline') are
1845    represented by one-byte sequences which are their ASCII codes, in
1846    the range 0x00 through 0x7F.
1847
1848    8-bit characters of the range 0x80..0x9F are represented by
1849    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1850    code + 0x20).
1851
1852    8-bit characters of the range 0xA0..0xFF are represented by
1853    one-byte sequences which are their 8-bit code.
1854
1855    The other characters are represented by a sequence of `base
1856    leading-code', optional `extended leading-code', and one or two
1857    `position-code's.  The length of the sequence is determined by the
1858    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1859    whereas extended leading-code and position-code take the range 0xA0
1860    through 0xFF.  See `charset.h' for more details about leading-code
1861    and position-code.
1862
1863    --- CODE RANGE of Emacs' internal format ---
1864    character set        range
1865    -------------        -----
1866    ascii                0x00..0x7F
1867    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
1869    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1870    ---------------------------------------------
1871
1872    As this is the internal character representation, the format is
1873    usually not used externally (i.e. in a file or in a data sent to a
1874    process).  But, it is possible to have a text externally in this
1875    format (i.e. by encoding by the coding system `emacs-mule').
1876
1877    In that case, a sequence of one-byte codes has a slightly different
1878    form.
1879
1880    At first, all characters in eight-bit-control are represented by
1881    one-byte sequences which are their 8-bit code.
1882
1883    Next, character composition data are represented by the byte
1884    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1885    where,
1886         METHOD is 0xF2 plus one of composition method (enum
1887         composition_method),
1888
1889         BYTES is 0xA0 plus a byte length of this composition data,
1890
1891         CHARS is 0xA0 plus a number of characters composed by this
1892         data,
1893
1894         COMPONENTs are characters of multibyte form or composition
1895         rules encoded by two-byte of ASCII codes.
1896
1897    In addition, for backward compatibility, the following formats are
1898    also recognized as composition data on decoding.
1899
1900    0x80 MSEQ ...
1901    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1902
1903    Here,
        MSEQ is a multibyte form but in one of these special formats:
1905           ASCII: 0xA0 ASCII_CODE+0x80,
1906           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1907         RULE is a one byte code of the range 0xA0..0xF0 that
1908         represents a composition rule.
1909   */
1910
/* Table indexed by a byte value.  emacs_mule_bytes[C] is the total
   number of bytes in the emacs-mule sequence that starts with byte C;
   the decoder in this file handles the values 1 through 4 and aborts
   on anything else.  The table is not filled in here -- presumably it
   is initialized elsewhere before use.  */
char emacs_mule_bytes[256];
1912
1913
1914 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1915    Return true if a text is encoded in 'emacs-mule'.  */
1916
1917 static bool
1918 detect_coding_emacs_mule (struct coding_system *coding,
1919                           struct coding_detection_info *detect_info)
1920 {
1921   const unsigned char *src = coding->source, *src_base;
1922   const unsigned char *src_end = coding->source + coding->src_bytes;
1923   bool multibytep = coding->src_multibyte;
1924   ptrdiff_t consumed_chars = 0;
1925   int c;
1926   int found = 0;
1927
1928   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1929   /* A coding system of this category is always ASCII compatible.  */
1930   src += coding->head_ascii;
1931
1932   while (1)
1933     {
1934       src_base = src;
1935       ONE_MORE_BYTE (c);
1936       if (c < 0)
1937         continue;
1938       if (c == 0x80)
1939         {
1940           /* Perhaps the start of composite character.  We simply skip
1941              it because analyzing it is too heavy for detecting.  But,
1942              at least, we check that the composite character
1943              constitutes of more than 4 bytes.  */
1944           const unsigned char *src_start;
1945
1946         repeat:
1947           src_start = src;
1948           do
1949             {
1950               ONE_MORE_BYTE (c);
1951             }
1952           while (c >= 0xA0);
1953
1954           if (src - src_start <= 4)
1955             break;
1956           found = CATEGORY_MASK_EMACS_MULE;
1957           if (c == 0x80)
1958             goto repeat;
1959         }
1960
1961       if (c < 0x80)
1962         {
1963           if (c < 0x20
1964               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1965             break;
1966         }
1967       else
1968         {
1969           int more_bytes = emacs_mule_bytes[c] - 1;
1970
1971           while (more_bytes > 0)
1972             {
1973               ONE_MORE_BYTE (c);
1974               if (c < 0xA0)
1975                 {
1976                   src--;        /* Unread the last byte.  */
1977                   break;
1978                 }
1979               more_bytes--;
1980             }
1981           if (more_bytes != 0)
1982             break;
1983           found = CATEGORY_MASK_EMACS_MULE;
1984         }
1985     }
1986   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1987   return 0;
1988
1989  no_more_source:
1990   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1991     {
1992       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1993       return 0;
1994     }
1995   detect_info->found |= found;
1996   return 1;
1997 }
1998
1999
/* Parse emacs-mule multibyte sequence at SRC and return the decoded
   character.  If CMP_STATUS indicates that we must expect MSEQ or
   RULE described above, decode it and return the negative value of
   the decoded character or rule.  If an invalid byte is found, return
   -1.  If SRC is too short, return -2.

   On success, *NBYTES is set to the number of source bytes consumed
   and *NCHARS to the number of source characters consumed.  If ID is
   non-null, *ID is set to the ID of the charset of the character;
   this is not done when a RULE byte is returned.  None of the output
   arguments is written when -1 or -2 is returned.  */

static int
emacs_mule_char (struct coding_system *coding, const unsigned char *src,
                 int *nbytes, int *nchars, int *id,
                 struct composition_status *cmp_status)
{
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base = src;
  bool multibytep = coding->src_multibyte;
  int charset_ID;
  unsigned code;
  int c;
  ptrdiff_t consumed_chars = 0;
  /* True if the sequence was read in the old-style MSEQ form.  */
  bool mseq_found = 0;

  ONE_MORE_BYTE (c);
  if (c < 0)
    {
      /* ONE_MORE_BYTE yielded a negative value; its negation is
         returned as the character, with the charset recorded for
         byte 0.  */
      c = -c;
      charset_ID = emacs_mule_charset[0];
    }
  else
    {
      if (c >= 0xA0)
        {
          /* A byte in this range can start a sequence only inside an
             old-form composition.  */
          if (cmp_status->state != COMPOSING_NO
              && cmp_status->old_form)
            {
              if (cmp_status->state == COMPOSING_CHAR)
                {
                  /* MSEQ: undo the special encoding to recover the
                     ordinary first byte.  */
                  if (c == 0xA0)
                    {
                      /* ASCII: 0xA0 followed by ASCII_CODE+0x80.  */
                      ONE_MORE_BYTE (c);
                      c -= 0x80;
                      if (c < 0)
                        goto invalid_code;
                    }
                  else
                    /* Other: LEADING_CODE+0x20.  */
                    c -= 0x20;
                  mseq_found = 1;
                }
              else
                {
                  /* A rule byte: hand it back negated, without
                     decoding a character.  */
                  *nbytes = src - src_base;
                  *nchars = consumed_chars;
                  return -c;
                }
            }
          else
            goto invalid_code;
        }

      /* Dispatch on the total length of the sequence led by C.  */
      switch (emacs_mule_bytes[c])
        {
        case 2:
          /* Leading code plus one position code.  */
          if ((charset_ID = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = c & 0x7F;
          break;

        case 3:
          if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
              || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
            {
              /* Private leading code, then the byte that selects the
                 charset, then one position code.  */
              ONE_MORE_BYTE (c);
              if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = c & 0x7F;
            }
          else
            {
              /* Leading code plus two position codes.  */
              if ((charset_ID = emacs_mule_charset[c]) < 0)
                goto invalid_code;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code = (c & 0x7F) << 8;
              ONE_MORE_BYTE (c);
              if (c < 0xA0)
                goto invalid_code;
              code |= c & 0x7F;
            }
          break;

        case 4:
          /* Leading code, the byte that selects the charset, then
             two position codes.  */
          ONE_MORE_BYTE (c);
          if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
            goto invalid_code;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code = (c & 0x7F) << 8;
          ONE_MORE_BYTE (c);
          if (c < 0xA0)
            goto invalid_code;
          code |= c & 0x7F;
          break;

        case 1:
          /* A single byte stands for itself.  */
          code = c;
          charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
          break;

        default:
          emacs_abort ();
        }
      /* Map (charset, code) to a character; a negative result means
         the code is not valid in that charset.  */
      CODING_DECODE_CHAR (coding, src, src_base, src_end,
                          CHARSET_FROM_ID (charset_ID), code, c);
      if (c < 0)
        goto invalid_code;
    }
  *nbytes = src - src_base;
  *nchars = consumed_chars;
  if (id)
    *id = charset_ID;
  return (mseq_found ? -c : c);

 no_more_source:
  return -2;

 invalid_code:
  return -1;
}
2134
2135
2136 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2137
2138 /* Handle these composition sequence ('|': the end of header elements,
2139    BYTES and CHARS >= 0xA0):
2140
2141    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2142    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2143    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2144
2145    and these old form:
2146
2147    (4) relative composition: 0x80 | MSEQ ... MSEQ
2148    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2149
2150    When the starter 0x80 and the following header elements are found,
2151    this annotation header is produced.
2152
2153         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2154
2155    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2156    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2157
2158    Then, upon reading the following elements, these codes are produced
2159    until the composition end is found:
2160
2161    (1) CHAR ... CHAR
2162    (2) ALT ... ALT CHAR ... CHAR
2163    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2164    (4) CHAR ... CHAR
2165    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2166
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2169
2170    (1) LENGTH: unchanged, NCHARS: unchanged
2171    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2172    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2173    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2174    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2175
2176    If an error is found while composing, the annotation header is
2177    changed to the original composition header (plus filler -1s) as
2178    below:
2179
2180    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2181    (5)          [ 0x80 0xFF -1 -1- -1 ]
2182
2183    and the sequence [ -2 DECODED-RULE ] is changed to the original
2184    byte sequence as below:
2185         o the original byte sequence is B: [ B -1 ]
2186         o the original byte sequence is B1 B2: [ B1 B2 ]
2187
   Most of the routines are implemented by macros because many
   variables and labels in the caller decode_coding_emacs_mule must be
   accessible, and they are usually called just once (thus they don't
   increase the size of the compiled object).  */
2192
/* Decode a composition rule represented by C as a component of
   composition sequence of Emacs 20 style.  Set RULE to the decoded
   rule.

   C is modified in place: 0xA0 is subtracted from it, and the result
   must lie in 0..80, otherwise control jumps to the label
   `invalid_code' of the enclosing function.  The value is then split
   into GREF = C / 9 and NREF = C % 9, a value of 4 in either one is
   replaced by 10, and the pair is packed with
   COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (c < 0 || c >= 81)                               \
      goto invalid_code;                                \
    gref = c / 9, nref = c % 9;                         \
    if (gref == 4) gref = 10;                           \
    if (nref == 4) nref = 10;                           \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2209
2210
/* Decode a composition rule represented by C and the following byte
   at SRC as a component of composition sequence of Emacs 21 style.
   Set RULE to the decoded rule.

   GREF is C - 0x20.  The next byte is then read into C with
   ONE_MORE_BYTE, and NREF is that byte - 0x20.  Each of the two must
   lie in 0..80, otherwise control jumps to the label `invalid_code'
   of the enclosing function.  C holds the second byte afterwards.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = c - 0x20;                                    \
    if (gref < 0 || gref >= 81)                         \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (nref < 0 || nref >= 81)                         \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2228
2229
/* Start of Emacs 21 style format.  On entry the variable C of the
   enclosing function holds the METHOD byte (0xF2 plus the
   composition method).  The next two bytes at SRC are BYTES and
   CHARS, each stored with 0xA0 added, where BYTES is the byte length
   of this composition information and CHARS is the number of
   characters composed by this composition.

   Control jumps to `invalid_code' if a byte cannot be read as such,
   if BYTES - 0xA0 is less than 3 (or differs from 4 for a relative
   composition), or if CHARS - 0xA0 is not positive or not below
   MAX_COMPOSITION_COMPONENTS.  Otherwise *CMP_STATUS is set up for
   the new composition and the annotation header is added to CHARBUF
   with ADD_COMPOSITION_DATA.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    if (method == COMPOSITION_RELATIVE)                                 \
      cmp_status->state = COMPOSING_CHAR;                               \
    else                                                                \
      cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2261
2262
/* Start of Emacs 20 style format for relative composition.

   Marks *CMP_STATUS as an old-form relative composition that expects
   a character next, with zero characters and components so far, and
   adds an annotation header whose NCHARS and NBYTES are both 0 to
   CHARBUF.  Reads no source bytes.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2274
2275
/* Start of Emacs 20 style format for rule-base composition.

   Same as DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that the
   method recorded in *CMP_STATUS and in the annotation header is
   COMPOSITION_WITH_RULE.  Reads no source bytes.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2287
2288
/* Begin decoding a composition.  The caller invokes this after
   reading the starter byte 0x80.  The next byte is read into C and
   selects the format:

     0xF2 + method, for methods from COMPOSITION_RELATIVE through
       COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header;
     0xA0..0xBF: Emacs 20 style relative composition; the byte is
       itself the first component, so SRC is moved back to re-read it;
     0xFF: Emacs 20 style rule-base composition.

   Any other byte makes control jump to the label `invalid_code' of
   the enclosing function.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c < 0xA0)                                  \
      goto invalid_code;                                \
    else if (c < 0xC0)                                  \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2312
/* Finish the composition being decoded.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition, the NCHARS slot (header element 2) is set to the
   number of characters counted in CMP_STATUS.  For a new-form
   composition whose method is beyond COMPOSITION_RELATIVE, header
   element 0 is set to NCHARS minus CMP_STATUS->length.  In every
   case the state returns to COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int idx = - cmp_status->length;                             \
                                                                \
    if (cmp_status->old_form)                                   \
      charbuf[idx + 2] = cmp_status->nchars;                    \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2323
2324
2325 static int
2326 emacs_mule_finish_composition (int *charbuf,
2327                                struct composition_status *cmp_status)
2328 {
2329   int idx = - cmp_status->length;
2330   int new_chars;
2331
2332   if (cmp_status->old_form && cmp_status->nchars > 0)
2333     {
2334       charbuf[idx + 2] = cmp_status->nchars;
2335       new_chars = 0;
2336       if (cmp_status->method == COMPOSITION_WITH_RULE
2337           && cmp_status->state == COMPOSING_CHAR)
2338         {
2339           /* The last rule was invalid.  */
2340           int rule = charbuf[-1] + 0xA0;
2341
2342           charbuf[-2] = BYTE8_TO_CHAR (rule);
2343           charbuf[-1] = -1;
2344           new_chars = 1;
2345         }
2346     }
2347   else
2348     {
2349       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2350
2351       if (cmp_status->method == COMPOSITION_WITH_RULE)
2352         {
2353           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2354           charbuf[idx++] = -3;
2355           charbuf[idx++] = 0;
2356           new_chars = 1;
2357         }
2358       else
2359         {
2360           int nchars = charbuf[idx + 1] + 0xA0;
2361           int nbytes = charbuf[idx + 2] + 0xA0;
2362
2363           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2364           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2365           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2366           charbuf[idx++] = -1;
2367           new_chars = 4;
2368         }
2369     }
2370   cmp_status->state = COMPOSING_NO;
2371   return new_chars;
2372 }
2373
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and add the value that function
   returns to CHAR_OFFSET.  Uses the variables charbuf, cmp_status
   and char_offset of the enclosing function.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state != COMPOSING_NO)                                \
      char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
  } while (0)
2379
2380
/* Decode the emacs-mule byte sequence found in CODING->source, starting
   CODING->consumed bytes in, and append the result to CODING->charbuf
   after the CODING->charbuf_used entries already there.

   Ordinary characters are stored as character codes.  A change of
   charset is recorded with ADD_CHARSET_DATA, and composition sequences
   are tracked in CODING->spec.emacs_mule.cmp_status.  A byte sequence
   that cannot be decoded is emitted one byte at a time (as ASCII, or
   through BYTE8_TO_CHAR) and counted in CODING->errors.

   On exit CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe the position reached.  */
2381 static void
2382 decode_coding_emacs_mule (struct coding_system *coding)
2383 {
2384   const unsigned char *src = coding->source + coding->consumed;
2385   const unsigned char *src_end = coding->source + coding->src_bytes;
2386   const unsigned char *src_base;
2387   int *charbuf = coding->charbuf + coding->charbuf_used;
2388   /* We may produce two annotations (charset and composition) in one
2389      loop and one more charset annotation at the end.  */
2390   int *charbuf_end
2391     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2392       /* We can produce up to 2 characters in a loop.  */
2393       - 1;
2394   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2395   bool multibytep = coding->src_multibyte;
2396   ptrdiff_t char_offset = coding->produced_char;
2397   ptrdiff_t last_offset = char_offset;
2398   int last_id = charset_ascii;
2399   bool eol_dos
2400     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2401   int byte_after_cr = -1;
2402   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2403
       /* A composition is still open in CMP_STATUS (presumably left over
          from an earlier call on the preceding block): put its saved
          entries back at the front of this call's output.  */
2404   if (cmp_status->state != COMPOSING_NO)
2405     {
2406       int i;
2407
2408       if (charbuf_end - charbuf < cmp_status->length)
2409         emacs_abort ();
2410       for (i = 0; i < cmp_status->length; i++)
2411         *charbuf++ = cmp_status->carryover[i];
2412       coding->annotated = 1;
2413     }
2414
2415   while (1)
2416     {
2417       int c, id IF_LINT (= 0);
2418
           /* Remember where this iteration starts; the exit code and
              `invalid_code' below roll back to this point.  */
2419       src_base = src;
2420       consumed_chars_base = consumed_chars;
2421
           /* Stop once the output buffer has no more guaranteed room.  A
              byte that was pre-read after CR is given back so that it is
              read again later.  */
2422       if (charbuf >= charbuf_end)
2423         {
2424           if (byte_after_cr >= 0)
2425             src_base--;
2426           break;
2427         }
2428
2429       if (byte_after_cr >= 0)
2430         c = byte_after_cr, byte_after_cr = -1;
2431       else
2432         ONE_MORE_BYTE (c);
2433
           /* A composition in progress is finished first in both cases.
              A negative C is then stored negated as one character; 0x80
              is handed to DECODE_EMACS_MULE_COMPOSITION_START.
              NOTE(review): presumably ONE_MORE_BYTE uses the sign to flag
              a byte that must be passed through as-is -- confirm in the
              macro.  */
2434       if (c < 0 || c == 0x80)
2435         {
2436           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2437           if (c < 0)
2438             {
2439               *charbuf++ = -c;
2440               char_offset++;
2441             }
2442           else
2443             DECODE_EMACS_MULE_COMPOSITION_START ();
2444           continue;
2445         }
2446
2447       if (c < 0x80)
2448         {
               /* ASCII.  With DOS end-of-line conversion, pre-read the byte
                  after a CR; it is consumed at the top of the next
                  iteration instead of reading a new byte.  */
2449           if (eol_dos && c == '\r')
2450             ONE_MORE_BYTE (byte_after_cr);
2451           id = charset_ascii;
2452           if (cmp_status->state != COMPOSING_NO)
2453             {
2454               if (cmp_status->old_form)
2455                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2457                 cmp_status->ncomps--;
2458             }
2459         }
2460       else
2461         {
2462           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2463           /* emacs_mule_char can load a charset map from a file, which
2464              allocates a large structure and might cause buffer text
2465              to be relocated as result.  Thus, we need to remember the
2466              original pointer to buffer text, and fix up all related
2467              pointers after the call.  */
2468           const unsigned char *orig = coding->source;
2469           ptrdiff_t offset;
2470
2471           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2472                                cmp_status);
2473           offset = coding->source - orig;
2474           if (offset)
2475             {
2476               src += offset;
2477               src_base += offset;
2478               src_end += offset;
2479             }
2480           if (c < 0)
2481             {
                   /* -1: not a valid sequence.  -2: stop decoding here; the
                      loop is left with SRC_BASE still at the start of this
                      sequence.  Other negative values fall through (see
                      the comment below).  */
2482               if (c == -1)
2483                 goto invalid_code;
2484               if (c == -2)
2485                 break;
2486             }
2487           src = src_base + nbytes;
2488           consumed_chars = consumed_chars_base + nchars;
2489           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2490             cmp_status->ncomps -= nchars;
2491         }
2492
2493       /* Now if C >= 0, we found a normally encoded character, if C <
2494          0, we found an old-style composition component character or
2495          rule.  */
2496
2497       if (cmp_status->state == COMPOSING_NO)
2498         {
               /* The charset changed: close the run of the previous
                  charset with an annotation (ASCII runs get none).  */
2499           if (last_id != id)
2500             {
2501               if (last_id != charset_ascii)
2502                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2503                                   last_id);
2504               last_id = id;
2505               last_offset = char_offset;
2506             }
2507           *charbuf++ = c;
2508           char_offset++;
2509         }
2510       else if (cmp_status->state == COMPOSING_CHAR)
2511         {
2512           if (cmp_status->old_form)
2513             {
2514               if (c >= 0)
2515                 {
2516                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2517                   *charbuf++ = c;
2518                   char_offset++;
2519                 }
2520               else
2521                 {
2522                   *charbuf++ = -c;
2523                   cmp_status->nchars++;
2524                   cmp_status->length++;
2525                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2526                     EMACS_MULE_COMPOSITION_END ();
2527                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2528                     cmp_status->state = COMPOSING_RULE;
2529                 }
2530             }
2531           else
2532             {
                   /* New form: NCHARS counts down the characters that are
                      still expected; the composition ends at zero.  */
2533               *charbuf++ = c;
2534               cmp_status->length++;
2535               cmp_status->nchars--;
2536               if (cmp_status->nchars == 0)
2537                 EMACS_MULE_COMPOSITION_END ();
2538             }
2539         }
2540       else if (cmp_status->state == COMPOSING_RULE)
2541         {
2542           int rule;
2543
2544           if (c >= 0)
2545             {
2546               EMACS_MULE_COMPOSITION_END ();
2547               *charbuf++ = c;
2548               char_offset++;
2549             }
2550           else
2551             {
2552               c = -c;
2553               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2554               if (rule < 0)
2555                 goto invalid_code;
                   /* A rule is stored as the pair -2, RULE.  */
2556               *charbuf++ = -2;
2557               *charbuf++ = rule;
2558               cmp_status->length += 2;
2559               cmp_status->state = COMPOSING_CHAR;
2560             }
2561         }
2562       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2563         {
2564           *charbuf++ = c;
2565           cmp_status->length++;
2566           if (cmp_status->ncomps == 0)
2567             cmp_status->state = COMPOSING_CHAR;
2568           else if (cmp_status->ncomps > 0)
2569             {
2570               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2571                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2572             }
2573           else
2574             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2575         }
2576       else                      /* COMPOSING_COMPONENT_RULE */
2577         {
2578           int rule;
2579
2580           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2581           if (rule < 0)
2582             goto invalid_code;
2583           *charbuf++ = -2;
2584           *charbuf++ = rule;
2585           cmp_status->length += 2;
2586           cmp_status->ncomps--;
2587           if (cmp_status->ncomps > 0)
2588             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2589           else
2590             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2591         }
2592       continue;
2593
           /* Give up on the sequence that started at SRC_BASE: rewind,
              emit just its first byte as a character, and count an
              error.  Decoding resumes at the following byte.  */
2594     invalid_code:
2595       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2596       src = src_base;
2597       consumed_chars = consumed_chars_base;
2598       ONE_MORE_BYTE (c);
2599       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2600       char_offset++;
2601       coding->errors++;
2602     }
2603
      /* Reached by breaking out of the loop.  No `goto' to this label is
         visible in this function; presumably ONE_MORE_BYTE jumps here when
         the input is exhausted -- confirm in the macro.  */
2604  no_more_source:
2605   if (cmp_status->state != COMPOSING_NO)
2606     {
2607       if (coding->mode & CODING_MODE_LAST_BLOCK)
2608         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2609       else
2610         {
2611           int i;
2612
               /* This is not the last block: take the unfinished
                  composition back out of CHARBUF and keep it in
                  CARRYOVER.  */
2613           charbuf -= cmp_status->length;
2614           for (i = 0; i < cmp_status->length; i++)
2615             cmp_status->carryover[i] = charbuf[i];
2616         }
2617     }
2618   if (last_id != charset_ascii)
2619     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report progress only up to the start of the last iteration.  */
2620   coding->consumed_char += consumed_chars_base;
2621   coding->consumed = src_base - coding->source;
2622   coding->charbuf_used = charbuf - coding->charbuf;
2623 }
2624
2625
/* Store in CODES[0] and CODES[1] the leading code(s) for the charset
   whose emacs-mule id is ID.  An id below 0xA0 is its own single
   leading code and CODES[1] is set to 0.  A larger id is stored in
   CODES[1], preceded in CODES[0] by 0x9A (id < 0xE0), 0x9B (id < 0xF0),
   0x9C (id < 0xF5) or 0x9D (anything else).

   ID is evaluated more than once, so it must have no side effects.
   The expansion deliberately has no trailing semicolon: the caller
   writes it, which keeps the macro usable as the body of an `if' that
   has an `else'.  */
2626 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2627   do {                                          \
2628     if ((id) < 0xA0)                            \
2629       (codes)[0] = (id), (codes)[1] = 0;        \
2630     else if ((id) < 0xE0)                       \
2631       (codes)[0] = 0x9A, (codes)[1] = (id);     \
2632     else if ((id) < 0xF0)                       \
2633       (codes)[0] = 0x9B, (codes)[1] = (id);     \
2634     else if ((id) < 0xF5)                       \
2635       (codes)[0] = 0x9C, (codes)[1] = (id);     \
2636     else                                        \
2637       (codes)[0] = 0x9D, (codes)[1] = (id);     \
2638   } while (0)
2639
2640
/* Encode the CODING->charbuf_used entries of CODING->charbuf as
   emacs-mule bytes, written to CODING->destination after the
   CODING->produced bytes already there.

   ASCII characters and raw bytes are emitted as single bytes.  Any
   other character is emitted as the leading code(s) of its charset
   followed by its code, one or two bytes according to the charset's
   dimension, with the high bit of each byte set.  A charset annotation
   makes its charset the preferred one for the characters that follow,
   provided it is a member of the coding system's charset list.

   Records CODING_RESULT_SUCCESS, updates CODING->produced_char and
   CODING->produced, and returns 0.
   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   elsewhere and may leave the function earlier -- confirm there.  */
2641 static bool
2642 encode_coding_emacs_mule (struct coding_system *coding)
2643 {
       /* NOTE(review): MULTIBYTEP and PRODUCED_CHARS are not modified
          directly below; presumably the EMIT_* macros use them.  */
2644   bool multibytep = coding->dst_multibyte;
2645   int *charbuf = coding->charbuf;
2646   int *charbuf_end = charbuf + coding->charbuf_used;
2647   unsigned char *dst = coding->destination + coding->produced;
2648   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each charbuf entry.  */
2649   int safe_room = 8;
2650   ptrdiff_t produced_chars = 0;
2651   Lisp_Object attrs, charset_list;
2652   int c;
       /* Charset id taken from the latest charset annotation, or -1.  */
2653   int preferred_charset_id = -1;
2654
       /* This coder always works with Vemacs_mule_charset_list; store it
          in the coding system's attributes if something else is there.  */
2655   CODING_GET_INFO (coding, attrs, charset_list);
2656   if (! EQ (charset_list, Vemacs_mule_charset_list))
2657     {
2658       charset_list = Vemacs_mule_charset_list;
2659       ASET (attrs, coding_attr_charset_list, charset_list);
2660     }
2661
2662   while (charbuf < charbuf_end)
2663     {
2664       ASSURE_DESTINATION (safe_room);
2665       c = *charbuf++;
2666
2667       if (c < 0)
2668         {
2669           /* Handle an annotation.  */
2670           switch (*charbuf)
2671             {
2672             case CODING_ANNOTATE_COMPOSITION_MASK:
2673               /* Not yet implemented.  */
2674               break;
2675             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): after the increment above, CHARBUF[0]
                      is the mask and only -C - 1 entries of this
                      annotation remain (see the skip below), so index 3
                      lies inside the annotation only if its length is at
                      least 5.  Confirm against the layout that
                      ADD_CHARSET_DATA writes.  */
2676               preferred_charset_id = charbuf[3];
2677               if (preferred_charset_id >= 0
2678                   && NILP (Fmemq (make_number (preferred_charset_id),
2679                                   charset_list)))
2680                 preferred_charset_id = -1;
2681               break;
2682             default:
2683               emacs_abort ();
2684             }
               /* Skip the rest of the annotation: it is -C entries long
                  and the first one (C itself) is already consumed.  */
2685           charbuf += -c - 1;
2686           continue;
2687         }
2688
2689       if (ASCII_CHAR_P (c))
2690         EMIT_ONE_ASCII_BYTE (c);
2691       else if (CHAR_BYTE8_P (c))
2692         {
2693           c = CHAR_TO_BYTE8 (c);
2694           EMIT_ONE_BYTE (c);
2695         }
2696       else
2697         {
2698           struct charset *charset;
2699           unsigned code;
2700           int dimension;
2701           int emacs_mule_id;
2702           unsigned char leading_codes[2];
2703
               /* Try the preferred charset first, then the whole list.  */
2704           if (preferred_charset_id >= 0)
2705             {
2706               bool result;
2707
2708               charset = CHARSET_FROM_ID (preferred_charset_id);
2709               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2710               if (result)
2711                 code = ENCODE_CHAR (charset, c);
2712               else
2713                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2714                                      &code, charset);
2715             }
2716           else
2717             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2718                                  &code, charset);
               /* No charset was found for C: substitute the coding
                  system's default character.
                  NOTE(review): CHARSET is dereferenced below without a
                  second null check; presumably the default character is
                  always encodable -- confirm.  */
2719           if (! charset)
2720             {
2721               c = coding->default_char;
2722               if (ASCII_CHAR_P (c))
2723                 {
2724                   EMIT_ONE_ASCII_BYTE (c);
2725                   continue;
2726                 }
2727               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2728                                    &code, charset);
2729             }
2730           dimension = CHARSET_DIMENSION (charset);
2731           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2732           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2733           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is none.  */
2734           if (leading_codes[1])
2735             EMIT_ONE_BYTE (leading_codes[1]);
2736           if (dimension == 1)
2737             EMIT_ONE_BYTE (code | 0x80);
2738           else
2739             {
                   /* Two code bytes, high byte first, each with its high
                      bit set.  */
2740               code |= 0x8080;
2741               EMIT_ONE_BYTE (code >> 8);
2742               EMIT_ONE_BYTE (code & 0xFF);
2743             }
2744         }
2745     }
2746   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2747   coding->produced_char += produced_chars;
2748   coding->produced = dst - coding->destination;
2749   return 0;
2750 }
2751
2752 \f
2753 /*** 7. ISO2022 handlers ***/
2754
2755 /* The following note describes the coding system ISO2022 briefly.
2756    Since the intention of this note is to help understand the
2757    functions in this file, some parts are NOT ACCURATE or are OVERLY
2758    SIMPLIFIED.  For thorough understanding, please refer to the
2759    original document of ISO2022.  This is equivalent to the standard
2760    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2761
2762    ISO2022 provides many mechanisms to encode several character sets
2763    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2764    is encoded using bytes less than 128.  This may make the encoded
2765    text a little bit longer, but the text passes more easily through
2766    several types of gateway, some of which strip off the MSB (Most
2767    Significant Bit).
2768
2769    There are two kinds of character sets: control character sets and
2770    graphic character sets.  The former contain control characters such
2771    as `newline' and `escape' to provide control functions (control
2772    functions are also provided by escape sequences).  The latter
2773    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2774    two control character sets and many graphic character sets.
2775
2776    Graphic character sets are classified into one of the following
2777    four classes, according to the number of bytes (DIMENSION) and
2778    number of characters in one dimension (CHARS) of the set:
2779    - DIMENSION1_CHARS94
2780    - DIMENSION1_CHARS96
2781    - DIMENSION2_CHARS94
2782    - DIMENSION2_CHARS96
2783
2784    In addition, each character set is assigned an identification tag,
2785    unique for each set, called the "final character" (denoted as <F>
2786    hereafter).  The <F> of each character set is decided by ECMA(*)
2787    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2788    (0x30..0x3F are for private use only).
2789
2790    Note (*): ECMA = European Computer Manufacturers Association
2791
2792    Here are examples of graphic character sets [NAME(<F>)]:
2793         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2794         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2795         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2796         o DIMENSION2_CHARS96 -- none for the moment
2797
2798    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2799         C0 [0x00..0x1F] -- control character plane 0
2800         GL [0x20..0x7F] -- graphic character plane 0
2801         C1 [0x80..0x9F] -- control character plane 1
2802         GR [0xA0..0xFF] -- graphic character plane 1
2803
2804    A control character set is directly designated and invoked to C0 or
2805    C1 by an escape sequence.  The most common case is that:
2806    - ISO646's  control character set is designated/invoked to C0, and
2807    - ISO6429's control character set is designated/invoked to C1,
2808    and usually these designations/invocations are omitted in encoded
2809    text.  In a 7-bit environment, only C0 can be used, and a control
2810    character for C1 is encoded by an appropriate escape sequence to
2811    fit into the environment.  All control characters for C1 are
2812    defined to have corresponding escape sequences.
2813
2814    A graphic character set is at first designated to one of four
2815    graphic registers (G0 through G3), then these graphic registers are
2816    invoked to GL or GR.  These designations and invocations can be
2817    done independently.  The most common case is that G0 is invoked to
2818    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2819    these invocations and designations are omitted in encoded text.
2820    In a 7-bit environment, only GL can be used.
2821
2822    When a graphic character set of CHARS94 is invoked to GL, codes
2823    0x20 and 0x7F of the GL area work as control characters SPACE and
2824    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2825    be used.
2826
2827    There are two ways of invocation: locking-shift and single-shift.
2828    With locking-shift, the invocation lasts until the next different
2829    invocation, whereas with single-shift, the invocation affects the
2830    following character only and doesn't affect the locking-shift
2831    state.  Invocations are done by the following control characters or
2832    escape sequences:
2833
2834    ----------------------------------------------------------------------
2835    abbrev  function                  cntrl escape seq   description
2836    ----------------------------------------------------------------------
2837    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2838    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2839    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2840    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2841    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2842    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2843    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2844    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2845    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2846    ----------------------------------------------------------------------
2847    (*) These are not used by any known coding system.
2848
2849    Control characters for these functions are defined by macros
2850    ISO_CODE_XXX in `coding.h'.
2851
2852    Designations are done by the following escape sequences:
2853    ----------------------------------------------------------------------
2854    escape sequence      description
2855    ----------------------------------------------------------------------
2856    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2857    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2858    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2859    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2860    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2861    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2862    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2863    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2864    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2865    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2866    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2867    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2868    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2869    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2870    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2871    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2872    ----------------------------------------------------------------------
2873
2874    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2875    of dimension 1, chars 94, and final character <F>, etc...
2876
2877    Note (*): Although these designations are not allowed in ISO2022,
2878    Emacs accepts them on decoding, and produces them on encoding
2879    CHARS96 character sets in a coding system which is characterized as
2880    7-bit environment, non-locking-shift, and non-single-shift.
2881
2882    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2883    '(' must be omitted.  We refer to this as "short-form" hereafter.
2884
2885    Now you may notice that there are a lot of ways of encoding the
2886    same multilingual text in ISO2022.  Actually, there exist many
2887    coding systems such as Compound Text (used in X11's inter-client
2888    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2889    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2890    localized platforms), and all of these are variants of ISO2022.
2891
2892    In addition to the above, Emacs handles two more kinds of escape
2893    sequences: ISO6429's direction specification and Emacs' private
2894    sequence for specifying character composition.
2895
2896    ISO6429's direction specification takes the following form:
2897         o CSI ']'      -- end of the current direction
2898         o CSI '0' ']'  -- end of the current direction
2899         o CSI '1' ']'  -- start of left-to-right text
2900         o CSI '2' ']'  -- start of right-to-left text
2901    The control character CSI (0x9B: control sequence introducer) is
2902    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2903
2904    Character composition specification takes the following form:
2905         o ESC '0' -- start relative composition
2906         o ESC '1' -- end composition
2907         o ESC '2' -- start rule-base composition (*)
2908         o ESC '3' -- start relative composition with alternate chars  (**)
2909         o ESC '4' -- start rule-base composition with alternate chars  (**)
2910   Since these are not standard escape sequences of any ISO standard,
2911   the use of them with these meanings is restricted to Emacs only.
2912
2913   (*) This form is used only in Emacs 20.7 and older versions,
2914   but newer versions can safely decode it.
2915   (**) This form is used only in Emacs 21.1 and newer versions,
2916   and older versions can't decode it.
2917
2918   Here's a list of example usages of these composition escape
2919   sequences (categorized by `enum composition_method').
2920
2921   COMPOSITION_RELATIVE:
2922         ESC 0 CHAR [ CHAR ] ESC 1
2923   COMPOSITION_WITH_RULE:
2924         ESC 2 CHAR [ RULE CHAR ] ESC 1
2925   COMPOSITION_WITH_ALTCHARS:
2926         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2927   COMPOSITION_WITH_RULE_ALTCHARS:
2928         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2929
/* Table of 256 `enum iso_code_class_type' values.
   NOTE(review): presumably indexed by byte value to classify each byte
   for the ISO-2022 handlers; it is not referenced in this part of the
   file -- confirm where it is filled in and read.  */
2930 static enum iso_code_class_type iso_code_class[256];
2931
/* True if the charset with id ID may be used with CODING: ID must lie
   in 0..CODING->max_charset_id and its byte in CODING->safe_charsets
   must not be 255, the filler value setup_iso_safe_charsets leaves for
   charsets that were given no register.

   A negative ID yields false instead of reading before the start of
   the array; iso_charset_table lookups can produce negative ids, and
   not every caller tests for that first.  ID is evaluated more than
   once, so it must have no side effects.  */
2932 #define SAFE_CHARSET_P(coding, id)                    \
2933   (0 <= (id) && (id) <= (coding)->max_charset_id      \
2934    && (coding)->safe_charsets[id] != 255)
2935
/* Make sure the coding_attr_safe_charsets slot of ATTRS (the attribute
   vector of an ISO-2022 coding system) holds a string.

   The string has one byte per charset id, from 0 up to the largest id
   in the charset list.  The byte of a charset in the list is the
   register requested for it in the coding_attr_iso_request alist or,
   failing that, the default register for 94- or 96-character sets from
   coding_attr_iso_usage when that default is below 4.  Every other
   byte is 255.

   If CODING_ISO_FLAG_FULL_SUPPORT is set and the charset list is not
   Viso_2022_charset_list, the list is first replaced by
   Viso_2022_charset_list and the string is rebuilt.  Otherwise nothing
   is done when the slot already holds a string.  */
2936 static void
2937 setup_iso_safe_charsets (Lisp_Object attrs)
2938 {
2939   Lisp_Object charset_list, safe_charsets;
2940   Lisp_Object request;
2941   Lisp_Object reg_usage;
2942   Lisp_Object tail;
2943   EMACS_INT reg94, reg96;
2944   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2945   int max_charset_id;
2946
2947   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2948   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2949       && ! EQ (charset_list, Viso_2022_charset_list))
2950     {
           /* Clearing the slot forces the rebuild below.  */
2951       charset_list = Viso_2022_charset_list;
2952       ASET (attrs, coding_attr_charset_list, charset_list);
2953       ASET (attrs, coding_attr_safe_charsets, Qnil);
2954     }
2955
2956   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2957     return;
2958
       /* First pass: find the largest charset id, which fixes the length
          of the string.  */
2959   max_charset_id = 0;
2960   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2961     {
2962       int id = XINT (XCAR (tail));
2963       if (max_charset_id < id)
2964         max_charset_id = id;
2965     }
2966
2967   safe_charsets = make_uninit_string (max_charset_id + 1);
2968   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2969   request = AREF (attrs, coding_attr_iso_request);
2970   reg_usage = AREF (attrs, coding_attr_iso_usage);
2971   reg94 = XINT (XCAR (reg_usage));
2972   reg96 = XINT (XCDR (reg_usage));
2973
       /* Second pass: record a register for each charset in the list.
          An explicit request takes precedence over the defaults.  */
2974   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2975     {
2976       Lisp_Object id;
2977       Lisp_Object reg;
2978       struct charset *charset;
2979
2980       id = XCAR (tail);
2981       charset = CHARSET_FROM_ID (XINT (id));
2982       reg = Fcdr (Fassq (id, request));
2983       if (! NILP (reg))
2984         SSET (safe_charsets, XINT (id), XINT (reg));
2985       else if (charset->iso_chars_96)
2986         {
               /* A default of 4 or more leaves the byte at 255.  */
2987           if (reg96 < 4)
2988             SSET (safe_charsets, XINT (id), reg96);
2989         }
2990       else
2991         {
2992           if (reg94 < 4)
2993             SSET (safe_charsets, XINT (id), reg94);
2994         }
2995     }
2996   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2997 }
2998
2999
3000 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3001    Return true if a text is encoded in one of ISO-2022 based coding
3002    systems.

        More precisely: the scan starts CODING->head_ascii bytes into
        CODING->source.  As soon as every ISO category has been rejected,
        CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0 is
        returned.  Otherwise, once the input is used up, the accumulated
        masks are merged into DETECT_INFO->rejected and
        DETECT_INFO->found (minus anything rejected) and 1 is returned.
        CATEGORY_MASK_ISO is added to DETECT_INFO->checked in both
        cases.  */
3003
3004 static bool
3005 detect_coding_iso_2022 (struct coding_system *coding,
3006                         struct coding_detection_info *detect_info)
3007 {
3008   const unsigned char *src = coding->source, *src_base = src;
3009   const unsigned char *src_end = coding->source + coding->src_bytes;
3010   bool multibytep = coding->src_multibyte;
3011   bool single_shifting = 0;
3012   int id;
3013   int c, c1;
3014   ptrdiff_t consumed_chars = 0;
3015   int i;
       /* Category masks ruled out, resp. supported, by the bytes seen.  */
3016   int rejected = 0;
3017   int found = 0;
       /* -1 while no composition is open, else the number of characters
          counted since the composition started.  */
3018   int composition_count = -1;
3019
3020   detect_info->checked |= CATEGORY_MASK_ISO;
3021
       /* For every ISO category whose coding system id is not negative,
          refresh the MAX_CHARSET_ID / SAFE_CHARSETS pair that
          SAFE_CHARSET_P reads.  */
3022   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3023     {
3024       struct coding_system *this = &(coding_categories[i]);
3025       Lisp_Object attrs, val;
3026
3027       if (this->id < 0)
3028         continue;
3029       attrs = CODING_ID_ATTRS (this->id);
3030       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3031           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3032         setup_iso_safe_charsets (attrs);
3033       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3034       this->max_charset_id = SCHARS (val) - 1;
3035       this->safe_charsets = SDATA (val);
3036     }
3037
3038   /* A coding system of this category is always ASCII compatible.  */
3039   src += coding->head_ascii;
3040
3041   while (rejected != CATEGORY_MASK_ISO)
3042     {
3043       src_base = src;
3044       ONE_MORE_BYTE (c);
3045       switch (c)
3046         {
3047         case ISO_CODE_ESC:
3048           if (inhibit_iso_escape_detection)
3049             break;
3050           single_shifting = 0;
3051           ONE_MORE_BYTE (c);
3052           if (c == 'N' || c == 'O')
3053             {
3054               /* ESC <Fe> for SS2 or SS3.  */
3055               single_shifting = 1;
3056               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3057             }
               /* '1' is tested before the '0'..'4' range below, so that
                  range in effect covers '0', '2', '3' and '4'.  */
3058           else if (c == '1')
3059             {
3060               /* End of composition.  */
3061               if (composition_count < 0
3062                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3063                 /* Invalid */
3064                 break;
3065               composition_count = -1;
3066               found |= CATEGORY_MASK_ISO;
3067             }
3068           else if (c >= '0' && c <= '4')
3069             {
3070               /* ESC <Fp> for start of composition.  */
3071               composition_count = 0;
3072             }
3073           else
3074             {
3075               if (c >= '(' && c <= '/')
3076                 {
3077                   /* Designation sequence for a charset of dimension 1.  */
3078                   ONE_MORE_BYTE (c1);
3079                   if (c1 < ' ' || c1 >= 0x80
3080                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3081                     /* Invalid designation sequence.  Just ignore.  */
3082                     break;
3083                 }
3084               else if (c == '$')
3085                 {
3086                   /* Designation sequence for a charset of dimension 2.  */
3087                   ONE_MORE_BYTE (c);
3088                   if (c >= '@' && c <= 'B')
3089                     /* Designation for JISX0208.1978, GB2312, or JISX0208.
                            NOTE(review): unlike the two lookups that test
                            for a negative result, this ID is used
                            unchecked; confirm that these three table
                            entries can never be negative.  */
3090                     id = iso_charset_table[1][0][c];
3091                   else if (c >= '(' && c <= '/')
3092                     {
3093                       ONE_MORE_BYTE (c1);
3094                       if (c1 < ' ' || c1 >= 0x80
3095                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3096                         /* Invalid designation sequence.  Just ignore.  */
3097                         break;
3098                     }
3099                   else
3100                     /* Invalid designation sequence.  Just ignore it.  */
3101                     break;
3102                 }
3103               else
3104                 {
3105                   /* Invalid escape sequence.  Just ignore it.  */
3106                   break;
3107                 }
3108
3109               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories tested below is found or
                      rejected according to whether the charset is safe
                      for its coding system.  */
3110               rejected |= CATEGORY_MASK_ISO_8BIT;
3111               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3112                                   id))
3113                 found |= CATEGORY_MASK_ISO_7;
3114               else
3115                 rejected |= CATEGORY_MASK_ISO_7;
3116               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3117                                   id))
3118                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3119               else
3120                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3121               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3122                                   id))
3123                 found |= CATEGORY_MASK_ISO_7_ELSE;
3124               else
3125                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3126               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3127                                   id))
3128                 found |= CATEGORY_MASK_ISO_8_ELSE;
3129               else
3130                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3131             }
3132           break;
3133
3134         case ISO_CODE_SO:
3135         case ISO_CODE_SI:
3136           /* Locking shift out/in.  */
3137           if (inhibit_iso_escape_detection)
3138             break;
3139           single_shifting = 0;
3140           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3141           break;
3142
3143         case ISO_CODE_CSI:
3144           /* Control sequence introducer.  */
3145           single_shifting = 0;
3146           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3147           found |= CATEGORY_MASK_ISO_8_ELSE;
3148           goto check_extra_latin;
3149
3150         case ISO_CODE_SS2:
3151         case ISO_CODE_SS3:
3152           /* Single shift.   */
3153           if (inhibit_iso_escape_detection)
3154             break;
3155           single_shifting = 0;
3156           rejected |= CATEGORY_MASK_ISO_7BIT;
3157           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3158               & CODING_ISO_FLAG_SINGLE_SHIFT)
3159             {
3160               found |= CATEGORY_MASK_ISO_8_1;
3161               single_shifting = 1;
3162             }
3163           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3164               & CODING_ISO_FLAG_SINGLE_SHIFT)
3165             {
3166               found |= CATEGORY_MASK_ISO_8_2;
3167               single_shifting = 1;
3168             }
               /* If neither 8-bit category uses single shifts, treat the
                  byte like any other byte in 0x80..0x9F.  */
3169           if (single_shifting)
3170             break;
3171           goto check_extra_latin;
3172
3173         default:
3174           if (c < 0)
3175             continue;
3176           if (c < 0x80)
3177             {
3178               if (composition_count >= 0)
3179                 composition_count++;
3180               single_shifting = 0;
3181               break;
3182             }
3183           if (c >= 0xA0)
3184             {
3185               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3186               found |= CATEGORY_MASK_ISO_8_1;
3187               /* Check the length of succeeding codes of the range
3188                  0xA0..0xFF.  If the byte length is even, we include
3189                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3190                  only when we are not single shifting.  */
3191               if (! single_shifting
3192                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3193                 {
3194                   ptrdiff_t len = 1;
3195                   while (src < src_end)
3196                     {
3197                       src_base = src;
3198                       ONE_MORE_BYTE (c);
3199                       if (c < 0xA0)
3200                         {
                               /* Not part of the run: put it back.  */
3201                           src = src_base;
3202                           break;
3203                         }
3204                       len++;
3205                     }
3206
                       /* SRC < SRC_END here means the run was stopped by a
                          byte below 0xA0 rather than by the end of the
                          input; only then does an odd length rule out
                          ISO_8_2.  */
3207                   if (len & 1 && src < src_end)
3208                     {
3209                       rejected |= CATEGORY_MASK_ISO_8_2;
3210                       if (composition_count >= 0)
3211                         composition_count += len;
3212                     }
3213                   else
3214                     {
3215                       found |= CATEGORY_MASK_ISO_8_2;
3216                       if (composition_count >= 0)
3217                         composition_count += len / 2;
3218                     }
3219                 }
3220               break;
3221             }
               /* C is a byte in 0x80..0x9F, or CSI, SS2 or SS3 arriving
                  through the gotos above.  Unless Vlatin_extra_code_table
                  is a vector with a non-nil entry for C, every ISO
                  category is rejected, which ends the loop.  */
3222         check_extra_latin:
3223           if (! VECTORP (Vlatin_extra_code_table)
3224               || NILP (AREF (Vlatin_extra_code_table, c)))
3225             {
3226               rejected = CATEGORY_MASK_ISO;
3227               break;
3228             }
3229           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3230               & CODING_ISO_FLAG_LATIN_EXTRA)
3231             found |= CATEGORY_MASK_ISO_8_1;
3232           else
3233             rejected |= CATEGORY_MASK_ISO_8_1;
3234           rejected |= CATEGORY_MASK_ISO_8_2;
3235           break;
3236         }
3237     }
       /* The loop ended because every ISO category was rejected.  */
3238   detect_info->rejected |= CATEGORY_MASK_ISO;
3239   return 0;
3240
      /* No `goto' to this label is visible in this function; presumably
         ONE_MORE_BYTE jumps here when the input is exhausted -- confirm in
         the macro.  */
3241  no_more_source:
3242   detect_info->rejected |= rejected;
3243   detect_info->found |= (found & ~rejected);
3244   return 1;
3245 }
3246
3247
3248 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3249    escape sequence should be kept.

        The charset id is looked up with ISO_CHARSET_TABLE (DIM, CHARS_96,
        FINAL).  If FINAL is below '0' or at least 128, if the lookup
        yields a negative id, or if the charset fails SAFE_CHARSET_P for
        `coding', the designation of REG is set to -2, CHARS_96 is set to
        -1, and the rest of the macro is skipped (the `break' leaves the
        enclosing do/while).

        Otherwise the id is stored as the designation of REG, after
        replacing charset_jisx0201_roman by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        The variable `coding' must be in scope where the macro is
        expanded.  CHARS_96 must be an lvalue because it is assigned, and
        REG, CHARS_96 and FINAL are each evaluated more than once.  */
3250 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3251   do {                                                                  \
3252     int id, prev;                                                       \
3253                                                                         \
3254     if (final < '0' || final >= 128                                     \
3255         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3256         || !SAFE_CHARSET_P (coding, id))                                \
3257       {                                                                 \
3258         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3259         chars_96 = -1;                                                  \
3260         break;                                                          \
3261       }                                                                 \
3262     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3263     if (id == charset_jisx0201_roman)                                   \
3264       {                                                                 \
3265         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3266           id = charset_ascii;                                           \
3267       }                                                                 \
3268     else if (id == charset_jisx0208_1978)                               \
3269       {                                                                 \
3270         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3271           id = charset_jisx0208;                                        \
3272       }                                                                 \
3273     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3274     /* If there was an invalid designation to REG previously, and this  \
3275        designation is ASCII to REG, we should keep this designation     \
3276        sequence.  */                                                    \
3277     if (prev == -2 && id == charset_ascii)                              \
3278       chars_96 = -1;                                                    \
3279   } while (0)
3280
3281
3282 /* Handle these composition sequences (ALT: alternate char):
3283
3284    (1) relative composition: ESC 0 CHAR ... ESC 1
3285    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3286    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3287    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3288
3289    When the start sequence (ESC 0/2/3/4) is found, this annotation
3290    header is produced.
3291
3292         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3293
3294    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3295    produced until the end sequence (ESC 1) is found:
3296
3297    (1) CHAR ... CHAR
3298    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3299    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3300    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3301
3302    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3303    annotation header are updated as below:
3304
3305    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3306    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3307    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3308    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3309
3310    If an error is found while composing, the annotation header is
3311    changed to:
3312
3313         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3314
3315    and the sequence [ -2 DECODED-RULE ] is changed to the original
3316    byte sequence as below:
3317         o the original byte sequence is B: [ B -1 ]
3318         o the original byte sequence is B1 B2: [ B1 B2 ]
3319    and the sequence [ -1 -1 ] is changed to the original byte
3320    sequence:
3321         [ ESC '0' ]
3322 */
3323
3324 /* Decode a composition rule C1 and maybe one more byte from the
3325    source, and set RULE to the encoded composition rule.  If the rule
3326    is invalid, goto invalid_code.

        The first byte is taken from the variable `c1' of the expansion
        site; RULE must be an int lvalue and is evaluated several times.

        C1 - 32 must not be negative.  A value below 81 is the old
        one-byte format: it is split into gref = value / 9 and
        nref = value % 9, a gref or nref of 4 is replaced by 10, and the
        pair is packed with COMPOSITION_ENCODE_RULE.

        Any larger value is the new two-byte format: one more byte B is
        read with ONE_MORE_BYTE, the pair (C1 - 32 - 81, B - 32) is
        checked with COMPOSITION_ENCODE_RULE_VALID and packed with
        COMPOSITION_ENCODE_RULE, and 0x100 is added so that the result
        can be told apart from an old-format rule.  */
3327
3328 #define DECODE_COMPOSITION_RULE(rule)                                   \
3329   do {                                                                  \
3330     rule = c1 - 32;                                                     \
3331     if (rule < 0)                                                       \
3332       goto invalid_code;                                                \
3333     if (rule < 81)              /* old format (before ver.21) */        \
3334       {                                                                 \
3335         int gref = (rule) / 9;                                          \
3336         int nref = (rule) % 9;                                          \
3337         if (gref == 4) gref = 10;                                       \
3338         if (nref == 4) nref = 10;                                       \
3339         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3340       }                                                                 \
3341     else                        /* new format (after ver.21) */         \
3342       {                                                                 \
3343         int b;                                                          \
3344                                                                         \
3345         ONE_MORE_BYTE (b);                                              \
3346         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3347           goto invalid_code;                                            \
3348         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3349         rule += 0x100;   /* Distinguish it from the old format.  */     \
3350       }                                                                 \
3351   } while (0)
3352
/* Convert the encoded composition rule RULE back to the one or two
   codes it was read from, storing them in charbuf[idx] and
   charbuf[idx + 1] and adding the number of codes to new_chars.

   gref and nref are the quotient and remainder of (RULE % 0x100)
   divided by 12 (presumably the inverse of COMPOSITION_ENCODE_RULE,
   which is defined elsewhere).

   A RULE below 0x100 is in the old one-byte format: a gref or nref
   of 10 is mapped back to 4, the single code 32 + gref * 9 + nref is
   stored followed by the filler -1, and new_chars grows by 1.

   A RULE of 0x100 or more is in the new two-byte format: the codes
   32 + 81 + gref and 32 + nref are stored and new_chars grows by 2.

   The variables `charbuf', `idx' and `new_chars' must be in scope
   where the macro is expanded.  RULE is evaluated exactly once,
   before anything is stored, so it may be any expression, including
   an element of charbuf that the macro itself overwrites (such as
   charbuf[idx + 1]).  */

#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int ecr_rule = (rule);                                              \
    int gref = (ecr_rule % 0x100) / 12;                                 \
    int nref = (ecr_rule % 0x100) % 12;                                 \
                                                                        \
    if (ecr_rule < 0x100)       /* old format */                        \
      {                                                                 \
        if (gref == 10) gref = 4;                                       \
        if (nref == 10) nref = 4;                                       \
        charbuf[idx] = 32 + gref * 9 + nref;                            \
        charbuf[idx + 1] = -1;                                          \
        new_chars++;                                                    \
      }                                                                 \
    else                        /* new format */                        \
      {                                                                 \
        charbuf[idx] = 32 + 81 + gref;                                  \
        charbuf[idx + 1] = 32 + nref;                                   \
        new_chars += 2;                                                 \
      }                                                                 \
  } while (0)
3372
3373 /* Finish the current composition as invalid.  */
3374
3375 static int
3376 finish_composition (int *charbuf, struct composition_status *cmp_status)
3377 {
3378   int idx = - cmp_status->length;
3379   int new_chars;
3380
3381   /* Recover the original ESC sequence */
3382   charbuf[idx++] = ISO_CODE_ESC;
3383   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3384                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3385                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3386                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3387                     : '4');
3388   charbuf[idx++] = -2;
3389   charbuf[idx++] = 0;
3390   charbuf[idx++] = -1;
3391   new_chars = cmp_status->nchars;
3392   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3393     for (; idx < 0; idx++)
3394       {
3395         int elt = charbuf[idx];
3396
3397         if (elt == -2)
3398           {
3399             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3400             idx++;
3401           }
3402         else if (elt == -1)
3403           {
3404             charbuf[idx++] = ISO_CODE_ESC;
3405             charbuf[idx] = '0';
3406             new_chars += 2;
3407           }
3408       }
3409   cmp_status->state = COMPOSING_NO;
3410   return new_chars;
3411 }
3412
3413 /* If characters are under composition, finish the composition.

        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition on the pending data and add its return value
        to char_offset.  Uses the variables `cmp_status', `charbuf' and
        `char_offset' of the expansion site.  */
3414 #define MAYBE_FINISH_COMPOSITION()                              \
3415   do {                                                          \
3416     if (cmp_status->state != COMPOSING_NO)                      \
3417       char_offset += finish_composition (charbuf, cmp_status);  \
3418   } while (0)
3419
3420 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3421
3422    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3423    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3424    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3425    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3426
        C1 is the byte that followed ESC.

        If C1 is '0' while the components of an altchar composition are
        being read (state COMPOSING_COMPONENT_CHAR with method
        COMPOSITION_WITH_ALTCHARS, or state COMPOSING_COMPONENT_RULE with
        method COMPOSITION_WITH_RULE_ALTCHARS), it only separates the
        components from the characters: the pair [ -1 -1 ] is stored,
        cmp_status->length grows by 2 and the state becomes
        COMPOSING_CHAR.

        Otherwise a new composition starts.  Any composition still in
        progress is finished with MAYBE_FINISH_COMPOSITION, the method is
        chosen from C1 ('0' relative, '2' with rule, '3' with altchars,
        anything else with rule and altchars), the state becomes
        COMPOSING_CHAR for '0' and '2' and COMPOSING_COMPONENT_CHAR
        otherwise, nchars and ncomps are reset to 0, length is set to
        MAX_ANNOTATION_LENGTH and coding->annotated is set to 1.

3427    Produce this annotation sequence now:
3428
3429    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        NOTE(review): the five-element layout is taken from the general
        description of composition sequences earlier in this file and
        from the three value arguments passed to ADD_COMPOSITION_DATA
        below; confirm against MAX_ANNOTATION_LENGTH.

        Uses the variables `cmp_status', `charbuf', `char_offset' and
        `coding' of the expansion site; C1 is evaluated several times.
3430 */
3431
3432 #define DECODE_COMPOSITION_START(c1)                                       \
3433   do {                                                                     \
3434     if (c1 == '0'                                                          \
3435         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3436              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3437             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3438                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3439       {                                                                    \
3440         *charbuf++ = -1;                                                   \
3441         *charbuf++= -1;                                                    \
3442         cmp_status->state = COMPOSING_CHAR;                                \
3443         cmp_status->length += 2;                                           \
3444       }                                                                    \
3445     else                                                                   \
3446       {                                                                    \
3447         MAYBE_FINISH_COMPOSITION ();                                       \
3448         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3449                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3450                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3451                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3452         cmp_status->state                                                  \
3453           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3454         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3455         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3456         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3457         coding->annotated = 1;                                             \
3458       }                                                                    \
3459   } while (0)
3460
3461
3462 /* Handle composition end sequence ESC 1.

        The end is rejected when no CHAR has been stored yet
        (cmp_status->nchars == 0), or when the state does not allow an
        end here: for COMPOSITION_WITH_RULE the state must not be
        COMPOSING_CHAR, for every other method it must be COMPOSING_CHAR.
        On rejection the pending composition is finished with
        MAYBE_FINISH_COMPOSITION and control jumps to invalid_code.

        Otherwise the annotation header, which starts at
        charbuf[- cmp_status->length], is patched: its first element is
        decreased by ncomps + 2 for COMPOSITION_WITH_ALTCHARS or by
        ncomps * 3 for COMPOSITION_WITH_RULE_ALTCHARS, and its third
        element is set to nchars.  char_offset is advanced by nchars and
        the state returns to COMPOSING_NO.

        Uses the variables `cmp_status', `charbuf', `char_offset' and the
        label `invalid_code' of the expansion site.  */
3463
3464 #define DECODE_COMPOSITION_END()                                        \
3465   do {                                                                  \
3466     if (cmp_status->nchars == 0                                         \
3467         || ((cmp_status->state == COMPOSING_CHAR)                       \
3468             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3469       {                                                                 \
3470         MAYBE_FINISH_COMPOSITION ();                                    \
3471         goto invalid_code;                                              \
3472       }                                                                 \
3473     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3474       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3475     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3476       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3477     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3478     char_offset += cmp_status->nchars;                                  \
3479     cmp_status->state = COMPOSING_NO;                                   \
3480   } while (0)
3481
3482 /* Store a composition rule RULE in charbuf, and update cmp_status.

        Two codes are appended, the marker -2 followed by RULE, and
        cmp_status->length grows by 2 accordingly.  The state is then
        decremented; presumably this steps from a rule state back to the
        matching char state, which relies on the order of the state
        enumeration (not visible here -- TODO confirm).

        Uses the variables `charbuf' and `cmp_status' of the expansion
        site.  */
3483
3484 #define STORE_COMPOSITION_RULE(rule)    \
3485   do {                                  \
3486     *charbuf++ = -2;                    \
3487     *charbuf++ = rule;                  \
3488     cmp_status->length += 2;            \
3489     cmp_status->state--;                \
3490   } while (0)
3491
3492 /* Store a composed char or a component char C in charbuf, and update
3493    cmp_status.

        C is appended and cmp_status->length grows by 1.  C is counted in
        nchars when the state is COMPOSING_CHAR and in ncomps otherwise.
        For method COMPOSITION_WITH_RULE, and for method
        COMPOSITION_WITH_RULE_ALTCHARS while the state is
        COMPOSING_COMPONENT_CHAR, the state is then incremented;
        presumably this moves to the matching rule state, which relies on
        the order of the state enumeration (not visible here -- TODO
        confirm).

        Uses the variables `charbuf' and `cmp_status' of the expansion
        site.  */
3494
3495 #define STORE_COMPOSITION_CHAR(c)                                       \
3496   do {                                                                  \
3497     *charbuf++ = (c);                                                   \
3498     cmp_status->length++;                                               \
3499     if (cmp_status->state == COMPOSING_CHAR)                            \
3500       cmp_status->nchars++;                                             \
3501     else                                                                \
3502       cmp_status->ncomps++;                                             \
3503     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3504         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3505             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3506       cmp_status->state++;                                              \
3507   } while (0)
3508
3509
3510 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3511
3512 static void
3513 decode_coding_iso_2022 (struct coding_system *coding)
3514 {
3515   const unsigned char *src = coding->source + coding->consumed;
3516   const unsigned char *src_end = coding->source + coding->src_bytes;
3517   const unsigned char *src_base;
3518   int *charbuf = coding->charbuf + coding->charbuf_used;
3519   /* We may produce two annotations (charset and composition) in one
3520      loop and one more charset annotation at the end.  */
3521   int *charbuf_end
3522     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3523   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3524   bool multibytep = coding->src_multibyte;
3525   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3526   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3527   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3528   int charset_id_2, charset_id_3;
3529   struct charset *charset;
3530   int c;
3531   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3532   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3533   ptrdiff_t char_offset = coding->produced_char;
3534   ptrdiff_t last_offset = char_offset;
3535   int last_id = charset_ascii;
3536   bool eol_dos
3537     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3538   int byte_after_cr = -1;
3539   int i;
3540
3541   setup_iso_safe_charsets (attrs);
3542   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3543
3544   if (cmp_status->state != COMPOSING_NO)
3545     {
3546       if (charbuf_end - charbuf < cmp_status->length)
3547         emacs_abort ();
3548       for (i = 0; i < cmp_status->length; i++)
3549         *charbuf++ = cmp_status->carryover[i];
3550       coding->annotated = 1;
3551     }
3552
3553   while (1)
3554     {
3555       int c1, c2, c3;
3556
3557       src_base = src;
3558       consumed_chars_base = consumed_chars;
3559
3560       if (charbuf >= charbuf_end)
3561         {
3562           if (byte_after_cr >= 0)
3563             src_base--;
3564           break;
3565         }
3566
3567       if (byte_after_cr >= 0)
3568         c1 = byte_after_cr, byte_after_cr = -1;
3569       else
3570         ONE_MORE_BYTE (c1);
3571       if (c1 < 0)
3572         goto invalid_code;
3573
3574       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3575         {
3576           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3577           char_offset++;
3578           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3579           continue;
3580         }
3581
3582       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3583         {
3584           if (c1 == ISO_CODE_ESC)
3585             {
3586               if (src + 1 >= src_end)
3587                 goto no_more_source;
3588               *charbuf++ = ISO_CODE_ESC;
3589               char_offset++;
3590               if (src[0] == '%' && src[1] == '@')
3591                 {
3592                   src += 2;
3593                   consumed_chars += 2;
3594                   char_offset += 2;
3595                   /* We are sure charbuf can contain two more chars. */
3596                   *charbuf++ = '%';
3597                   *charbuf++ = '@';
3598                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3599                 }
3600             }
3601           else
3602             {
3603               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3604               char_offset++;
3605             }
3606           continue;
3607         }
3608
3609       if ((cmp_status->state == COMPOSING_RULE
3610            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3611           && c1 != ISO_CODE_ESC)
3612         {
3613           int rule;
3614
3615           DECODE_COMPOSITION_RULE (rule);
3616           STORE_COMPOSITION_RULE (rule);
3617           continue;
3618         }
3619
3620       /* We produce at most one character.  */
3621       switch (iso_code_class [c1])
3622         {
3623         case ISO_0x20_or_0x7F:
3624           if (charset_id_0 < 0
3625               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3626             /* This is SPACE or DEL.  */
3627             charset = CHARSET_FROM_ID (charset_ascii);
3628           else
3629             charset = CHARSET_FROM_ID (charset_id_0);
3630           break;
3631
3632         case ISO_graphic_plane_0:
3633           if (charset_id_0 < 0)
3634             charset = CHARSET_FROM_ID (charset_ascii);
3635           else
3636             charset = CHARSET_FROM_ID (charset_id_0);
3637           break;
3638
3639         case ISO_0xA0_or_0xFF:
3640           if (charset_id_1 < 0
3641               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3642               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3643             goto invalid_code;
3644           /* This is a graphic character, we fall down ... */
3645
3646         case ISO_graphic_plane_1:
3647           if (charset_id_1 < 0)
3648             goto invalid_code;
3649           charset = CHARSET_FROM_ID (charset_id_1);
3650           break;
3651
3652         case ISO_control_0:
3653           if (eol_dos && c1 == '\r')
3654             ONE_MORE_BYTE (byte_after_cr);
3655           MAYBE_FINISH_COMPOSITION ();
3656           charset = CHARSET_FROM_ID (charset_ascii);
3657           break;
3658
3659         case ISO_control_1:
3660           goto invalid_code;
3661
3662         case ISO_shift_out:
3663           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3664               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3665             goto invalid_code;
3666           CODING_ISO_INVOCATION (coding, 0) = 1;
3667           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3668           continue;
3669
3670         case ISO_shift_in:
3671           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3672             goto invalid_code;
3673           CODING_ISO_INVOCATION (coding, 0) = 0;
3674           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3675           continue;
3676
3677         case ISO_single_shift_2_7:
3678           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3679             goto invalid_code;
3680         case ISO_single_shift_2:
3681           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3682             goto invalid_code;
3683           /* SS2 is handled as an escape sequence of ESC 'N' */
3684           c1 = 'N';
3685           goto label_escape_sequence;
3686
3687         case ISO_single_shift_3:
3688           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3689             goto invalid_code;
3690           /* SS3 is handled as an escape sequence of ESC 'O' */
3691           c1 = 'O';
3692           goto label_escape_sequence;
3693
3694         case ISO_control_sequence_introducer:
3695           /* CSI is handled as an escape sequence of ESC '[' ...  */
3696           c1 = '[';
3697           goto label_escape_sequence;
3698
3699         case ISO_escape:
3700           ONE_MORE_BYTE (c1);
3701         label_escape_sequence:
3702           /* Escape sequences handled here are invocation,
3703              designation, direction specification, and character
3704              composition specification.  */
3705           switch (c1)
3706             {
3707             case '&':           /* revision of following character set */
3708               ONE_MORE_BYTE (c1);
3709               if (!(c1 >= '@' && c1 <= '~'))
3710                 goto invalid_code;
3711               ONE_MORE_BYTE (c1);
3712               if (c1 != ISO_CODE_ESC)
3713                 goto invalid_code;
3714               ONE_MORE_BYTE (c1);
3715               goto label_escape_sequence;
3716
3717             case '$':           /* designation of 2-byte character set */
3718               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3719                 goto invalid_code;
3720               {
3721                 int reg, chars96;
3722
3723                 ONE_MORE_BYTE (c1);
3724                 if (c1 >= '@' && c1 <= 'B')
3725                   {     /* designation of JISX0208.1978, GB2312.1980,
3726                            or JISX0208.1980 */
3727                     reg = 0, chars96 = 0;
3728                   }
3729                 else if (c1 >= 0x28 && c1 <= 0x2B)
3730                   { /* designation of DIMENSION2_CHARS94 character set */
3731                     reg = c1 - 0x28, chars96 = 0;
3732                     ONE_MORE_BYTE (c1);
3733                   }
3734                 else if (c1 >= 0x2C && c1 <= 0x2F)
3735                   { /* designation of DIMENSION2_CHARS96 character set */
3736                     reg = c1 - 0x2C, chars96 = 1;
3737                     ONE_MORE_BYTE (c1);
3738                   }
3739                 else
3740                   goto invalid_code;
3741                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3742                 /* We must update these variables now.  */
3743                 if (reg == 0)
3744                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3745                 else if (reg == 1)
3746                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3747                 if (chars96 < 0)
3748                   goto invalid_code;
3749               }
3750               continue;
3751
3752             case 'n':           /* invocation of locking-shift-2 */
3753               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3754                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3755                 goto invalid_code;
3756               CODING_ISO_INVOCATION (coding, 0) = 2;
3757               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3758               continue;
3759
3760             case 'o':           /* invocation of locking-shift-3 */
3761               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3762                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3763                 goto invalid_code;
3764               CODING_ISO_INVOCATION (coding, 0) = 3;
3765               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3766               continue;
3767
3768             case 'N':           /* invocation of single-shift-2 */
3769               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3770                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3771                 goto invalid_code;
3772               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3773               if (charset_id_2 < 0)
3774                 charset = CHARSET_FROM_ID (charset_ascii);
3775               else
3776                 charset = CHARSET_FROM_ID (charset_id_2);
3777               ONE_MORE_BYTE (c1);
3778               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3779                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3780                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3781                           ? c1 >= 0x80 : c1 < 0x80)))
3782                 goto invalid_code;
3783               break;
3784
3785             case 'O':           /* invocation of single-shift-3 */
3786               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3787                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3788                 goto invalid_code;
3789               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3790               if (charset_id_3 < 0)
3791                 charset = CHARSET_FROM_ID (charset_ascii);
3792               else
3793                 charset = CHARSET_FROM_ID (charset_id_3);
3794               ONE_MORE_BYTE (c1);
3795               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3796                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3797                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3798                           ? c1 >= 0x80 : c1 < 0x80)))
3799                 goto invalid_code;
3800               break;
3801
3802             case '0': case '2': case '3': case '4': /* start composition */
3803               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3804                 goto invalid_code;
3805               if (last_id != charset_ascii)
3806                 {
3807                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3808                   last_id = charset_ascii;
3809                   last_offset = char_offset;
3810                 }
3811               DECODE_COMPOSITION_START (c1);
3812               continue;
3813
3814             case '1':           /* end composition */
3815               if (cmp_status->state == COMPOSING_NO)
3816                 goto invalid_code;
3817               DECODE_COMPOSITION_END ();
3818               continue;
3819
3820             case '[':           /* specification of direction */
3821               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3822                 goto invalid_code;
3823               /* For the moment, nested direction is not supported.
3824                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3825                  left-to-right, and nonzero means right-to-left.  */
3826               ONE_MORE_BYTE (c1);
3827               switch (c1)
3828                 {
3829                 case ']':       /* end of the current direction */
3830                   coding->mode &= ~CODING_MODE_DIRECTION;
3831
3832                 case '0':       /* end of the current direction */
3833                 case '1':       /* start of left-to-right direction */
3834                   ONE_MORE_BYTE (c1);
3835                   if (c1 == ']')
3836                     coding->mode &= ~CODING_MODE_DIRECTION;
3837                   else
3838                     goto invalid_code;
3839                   break;
3840
3841                 case '2':       /* start of right-to-left direction */
3842                   ONE_MORE_BYTE (c1);
3843                   if (c1 == ']')
3844                     coding->mode |= CODING_MODE_DIRECTION;
3845                   else
3846                     goto invalid_code;
3847                   break;
3848
3849                 default:
3850                   goto invalid_code;
3851                 }
3852               continue;
3853
3854             case '%':
3855               ONE_MORE_BYTE (c1);
3856               if (c1 == '/')
3857                 {
3858                   /* CTEXT extended segment:
3859                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3860                      We keep these bytes as is for the moment.
3861                      They may be decoded by post-read-conversion.  */
3862                   int dim, M, L;
3863                   int size;
3864
3865                   ONE_MORE_BYTE (dim);
3866                   if (dim < '0' || dim > '4')
3867                     goto invalid_code;
3868                   ONE_MORE_BYTE (M);
3869                   if (M < 128)
3870                     goto invalid_code;
3871                   ONE_MORE_BYTE (L);
3872                   if (L < 128)
3873                     goto invalid_code;
3874                   size = ((M - 128) * 128) + (L - 128);
3875                   if (charbuf + 6 > charbuf_end)
3876                     goto break_loop;
3877                   *charbuf++ = ISO_CODE_ESC;
3878                   *charbuf++ = '%';
3879                   *charbuf++ = '/';
3880                   *charbuf++ = dim;
3881                   *charbuf++ = BYTE8_TO_CHAR (M);
3882                   *charbuf++ = BYTE8_TO_CHAR (L);
3883                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3884                 }
3885               else if (c1 == 'G')
3886                 {
3887                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3888                      ESC % G --UTF-8-BYTES-- ESC % @
3889                      We keep these bytes as is for the moment.
3890                      They may be decoded by post-read-conversion.  */
3891                   if (charbuf + 3 > charbuf_end)
3892                     goto break_loop;
3893                   *charbuf++ = ISO_CODE_ESC;
3894                   *charbuf++ = '%';
3895                   *charbuf++ = 'G';
3896                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3897                 }
3898               else
3899                 goto invalid_code;
3900               continue;
3901               break;
3902
3903             default:
3904               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3905                 goto invalid_code;
3906               {
3907                 int reg, chars96;
3908
3909                 if (c1 >= 0x28 && c1 <= 0x2B)
3910                   { /* designation of DIMENSION1_CHARS94 character set */
3911                     reg = c1 - 0x28, chars96 = 0;
3912                     ONE_MORE_BYTE (c1);
3913                   }
3914                 else if (c1 >= 0x2C && c1 <= 0x2F)
3915                   { /* designation of DIMENSION1_CHARS96 character set */
3916                     reg = c1 - 0x2C, chars96 = 1;
3917                     ONE_MORE_BYTE (c1);
3918                   }
3919                 else
3920                   goto invalid_code;
3921                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3922                 /* We must update these variables now.  */
3923                 if (reg == 0)
3924                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3925                 else if (reg == 1)
3926                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3927                 if (chars96 < 0)
3928                   goto invalid_code;
3929               }
3930               continue;
3931             }
3932           break;
3933
3934         default:
3935           emacs_abort ();
3936         }
3937
3938       if (cmp_status->state == COMPOSING_NO
3939           && charset->id != charset_ascii
3940           && last_id != charset->id)
3941         {
3942           if (last_id != charset_ascii)
3943             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3944           last_id = charset->id;
3945           last_offset = char_offset;
3946         }
3947
3948       /* Now we know CHARSET and 1st position code C1 of a character.
3949          Produce a decoded character while getting 2nd and 3rd
3950          position codes C2, C3 if necessary.  */
3951       if (CHARSET_DIMENSION (charset) > 1)
3952         {
3953           ONE_MORE_BYTE (c2);
3954           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3955               || ((c1 & 0x80) != (c2 & 0x80)))
3956             /* C2 is not in a valid range.  */
3957             goto invalid_code;
3958           if (CHARSET_DIMENSION (charset) == 2)
3959             c1 = (c1 << 8) | c2;
3960           else
3961             {
3962               ONE_MORE_BYTE (c3);
3963               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3964                   || ((c1 & 0x80) != (c3 & 0x80)))
3965                 /* C3 is not in a valid range.  */
3966                 goto invalid_code;
3967               c1 = (c1 << 16) | (c2 << 8) | c2;
3968             }
3969         }
3970       c1 &= 0x7F7F7F;
3971       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3972       if (c < 0)
3973         {
3974           MAYBE_FINISH_COMPOSITION ();
3975           for (; src_base < src; src_base++, char_offset++)
3976             {
3977               if (ASCII_CHAR_P (*src_base))
3978                 *charbuf++ = *src_base;
3979               else
3980                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3981             }
3982         }
3983       else if (cmp_status->state == COMPOSING_NO)
3984         {
3985           *charbuf++ = c;
3986           char_offset++;
3987         }
3988       else if ((cmp_status->state == COMPOSING_CHAR
3989                 ? cmp_status->nchars
3990                 : cmp_status->ncomps)
3991                >= MAX_COMPOSITION_COMPONENTS)
3992         {
3993           /* Too long composition.  */
3994           MAYBE_FINISH_COMPOSITION ();
3995           *charbuf++ = c;
3996           char_offset++;
3997         }
3998       else
3999         STORE_COMPOSITION_CHAR (c);
4000       continue;
4001
4002     invalid_code:
4003       MAYBE_FINISH_COMPOSITION ();
4004       src = src_base;
4005       consumed_chars = consumed_chars_base;
4006       ONE_MORE_BYTE (c);
4007       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
4008       char_offset++;
4009       coding->errors++;
4010       /* Reset the invocation and designation status to the safest
4011          one; i.e. designate ASCII to the graphic register 0, and
4012          invoke that register to the graphic plane 0.  This typically
4013          helps the case that an designation sequence for ASCII "ESC (
4014          B" is somehow broken (e.g. broken by a newline).  */
4015       CODING_ISO_INVOCATION (coding, 0) = 0;
4016       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4017       charset_id_0 = charset_ascii;
4018       continue;
4019
4020     break_loop:
4021       break;
4022     }
4023
4024  no_more_source:
4025   if (cmp_status->state != COMPOSING_NO)
4026     {
4027       if (coding->mode & CODING_MODE_LAST_BLOCK)
4028         MAYBE_FINISH_COMPOSITION ();
4029       else
4030         {
4031           charbuf -= cmp_status->length;
4032           for (i = 0; i < cmp_status->length; i++)
4033             cmp_status->carryover[i] = charbuf[i];
4034         }
4035     }
4036   else if (last_id != charset_ascii)
4037     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4038   coding->consumed_char += consumed_chars_base;
4039   coding->consumed = src_base - coding->source;
4040   coding->charbuf_used = charbuf - coding->charbuf;
4041 }
4042
4043
4044 /* ISO2022 encoding stuff.  */
4045
4046 /*
4047    It is not enough to say just "ISO2022" on encoding, we have to
4048    specify more details.  In Emacs, each coding system of ISO2022
4049    variant has the following specifications:
4050         1. Initial designation to G0 thru G3.
4051         2. Allows short-form designation?
4052         3. ASCII should be designated to G0 before control characters?
4053         4. ASCII should be designated to G0 at end of line?
4054         5. 7-bit environment or 8-bit environment?
4055         6. Use locking-shift?
4056         7. Use Single-shift?
4057    And the following two are only for Japanese:
4058         8. Use ASCII in place of JISX0201-1976-Roman?
4059         9. Use JISX0208-1983 in place of JISX0208-1978?
4060    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4061    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4062    details.
4063 */
4064
/* Emit the escape sequence that designates CHARSET to graphic
   register REG, and record the new designation in CODING.  The
   EMIT_* macros used here take no destination argument, so they
   presumably write at DST of the expansion site and advance it.

   The sequence is produced in this order:
     - "ESC & <'@' + revision>", only when CODING has
       CODING_ISO_FLAG_REVISION set and CHARSET_ISO_REVISION (CHARSET)
       is not negative;
     - ESC;
     - for a charset of dimension 1, the intermediate character for
       REG (one of "()*+" for a 94-charset, one of ",-./" for a
       96-charset);
     - for any other charset, '$' and then the intermediate character
       for REG.  The intermediate character is left out (short-form)
       only for a 94-charset designated to register 0 whose
       <final-char> is '@', 'A', or 'B', and only when CODING does not
       have CODING_ISO_FLAG_LONG_FORM set;
     - the <final-char> of CHARSET.

   CHARSET, REG and CODING are evaluated more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *intermediate_char_94 = "()*+";                          \
    const char *intermediate_char_96 = ",-./";                          \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    int revision                                                        \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset)                               \
         : -1);                                                         \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
        else if ((CODING_ISO_FLAGS (coding)                             \
                  & CODING_ISO_FLAG_LONG_FORM)                          \
                 || reg != 0                                            \
                 || ! (final_char >= '@' && final_char <= 'B'))         \
          /* The short-form is not usable here.  */                     \
          EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int b = (CHARSET_ISO_CHARS_96 (charset)                         \
                 ? intermediate_char_96[reg]                            \
                 : intermediate_char_94[reg]);                          \
        EMIT_ONE_ASCII_BYTE (b);                                        \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4112
4113
/* The next two macros produce codes (control character or escape
   sequence) for the ISO2022 single-shift functions (single-shift-2
   and single-shift-3).

   ENCODE_SINGLE_SHIFT_2 emits single-shift-2: the escape sequence
   "ESC N" when CODING has CODING_ISO_FLAG_SEVEN_BITS set, the single
   byte ISO_CODE_SS2 otherwise.  It then marks CODING as being in the
   middle of a single shift.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4126
4127
/* Emit the ISO2022 single-shift-3 function: the escape sequence
   "ESC O" when CODING has CODING_ISO_FLAG_SEVEN_BITS set, the single
   byte ISO_CODE_SS3 otherwise.  Then mark CODING as being in the
   middle of a single shift.  */

#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4136
4137
/* The next four macros produce codes (control character or escape
   sequence) for the ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each of them
   also records in CODING which graphic register is now invoked to
   graphic plane 0.

   ENCODE_SHIFT_IN emits the control character ISO_CODE_SI and
   records graphic register 0.  */

#define ENCODE_SHIFT_IN                                                 \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                                \
      CODING_ISO_INVOCATION (coding, 0) = 0;                            \
    }                                                                   \
  while (0)
4147
4148
/* Emit the shift-out control character ISO_CODE_SO and record in
   CODING that graphic register 1 is now invoked to graphic plane
   0.  */

#define ENCODE_SHIFT_OUT                                                \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                                \
      CODING_ISO_INVOCATION (coding, 0) = 1;                            \
    }                                                                   \
  while (0)
4154
4155
/* Emit the locking-shift-2 escape sequence "ESC n" and record in
   CODING that graphic register 2 is now invoked to graphic plane
   0.  */

#define ENCODE_LOCKING_SHIFT_2                                          \
  do                                                                    \
    {                                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                         \
      CODING_ISO_INVOCATION (coding, 0) = 2;                            \
    }                                                                   \
  while (0)
4161
4162
/* Emit the locking-shift-3 escape sequence "ESC o" and record in
   CODING that graphic register 3 is now invoked to graphic plane 0.

   The final byte must be 'o': "ESC n" is locking-shift-2, so
   emitting it here would make a reader invoke graphic register 2
   while CODING records register 3.  */

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4168
4169
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the charset whose id is charset_jisx0201_roman.

   The byte emitted is C1 with the high bit cleared when a single
   shift is pending in a 7-bit coding or when the charset is invoked
   to graphic plane 0, and C1 with the high bit set when a single
   shift is pending otherwise or when the charset is invoked to
   graphic plane 1.  A pending single shift is cleared.

   The macro refers to CODING, DST and PRODUCED_CHARS of the
   expansion site.  C1 is parenthesized in the expansion so that an
   expression argument is masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4212
4213
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and the id of CHARSET is
   charset_jisx0208, CHARSET is replaced by the charset whose id is
   charset_jisx0208_1978.

   The macro refers to CODING, DST and PRODUCED_CHARS of the
   expansion site, and loops until the two bytes have been
   emitted.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                 \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))          \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          {                                                             \
            /* CHARSET is not yet invoked to any graphic plane.         \
               Invoke it (after designating it to some graphic          \
               register if that is needed too), then go round the       \
               loop again to actually produce the character.  */        \
            dst = encode_invocation_designation (charset, coding, dst,  \
                                                 &produced_chars);      \
            continue;                                                   \
          }                                                             \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* A single shift is pending; it applies to this character and      \
       is over afterwards.  */                                          \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    else                                                                \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    CODING_ISO_SINGLE_SHIFTING (coding) = 0;                            \
    break;                                                              \
  } while (1)
4256
4257
/* Produce codes for character C of CHARSET.  CODING_ENCODE_CHAR is
   given CHARSET and C and is expected to store the code of C in
   CODE.  A charset of dimension 1 is then handled by
   ENCODE_ISO_CHARACTER_DIMENSION1 with CODE as the position-code;
   any other charset is handled by ENCODE_ISO_CHARACTER_DIMENSION2
   with CODE >> 8 and the low 8 bits of CODE as the two
   position-codes.

   CHARSET must be a modifiable lvalue because the DIMENSION macros
   may assign to it.  The macro refers to CODING, DST and DST_END of
   the expansion site.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4268
4269
4270 /* Produce designation and invocation codes at a place pointed by DST
4271    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4272    Return new DST.  */
4273
4274 static unsigned char *
4275 encode_invocation_designation (struct charset *charset,
4276                                struct coding_system *coding,
4277                                unsigned char *dst, ptrdiff_t *p_nchars)
4278 {
4279   bool multibytep = coding->dst_multibyte;
4280   ptrdiff_t produced_chars = *p_nchars;
4281   int reg;                      /* graphic register number */
4282   int id = CHARSET_ID (charset);
4283
4284   /* At first, check designations.  */
4285   for (reg = 0; reg < 4; reg++)
4286     if (id == CODING_ISO_DESIGNATION (coding, reg))
4287       break;
4288
4289   if (reg >= 4)
4290     {
4291       /* CHARSET is not yet designated to any graphic registers.  */
4292       /* At first check the requested designation.  */
4293       reg = CODING_ISO_REQUEST (coding, id);
4294       if (reg < 0)
4295         /* Since CHARSET requests no special designation, designate it
4296            to graphic register 0.  */
4297         reg = 0;
4298
4299       ENCODE_DESIGNATION (charset, reg, coding);
4300     }
4301
4302   if (CODING_ISO_INVOCATION (coding, 0) != reg
4303       && CODING_ISO_INVOCATION (coding, 1) != reg)
4304     {
4305       /* Since the graphic register REG is not invoked to any graphic
4306          planes, invoke it to graphic plane 0.  */
4307       switch (reg)
4308         {
4309         case 0:                 /* graphic register 0 */
4310           ENCODE_SHIFT_IN;
4311           break;
4312
4313         case 1:                 /* graphic register 1 */
4314           ENCODE_SHIFT_OUT;
4315           break;
4316
4317         case 2:                 /* graphic register 2 */
4318           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4319             ENCODE_SINGLE_SHIFT_2;
4320           else
4321             ENCODE_LOCKING_SHIFT_2;
4322           break;
4323
4324         case 3:                 /* graphic register 3 */
4325           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4326             ENCODE_SINGLE_SHIFT_3;
4327           else
4328             ENCODE_LOCKING_SHIFT_3;
4329           break;
4330         }
4331     }
4332
4333   *p_nchars = produced_chars;
4334   return dst;
4335 }
4336
4337
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: shift-in if graphic plane 0
   does not have graphic register 0 invoked, then a designation for
   each graphic register that has an initial charset (a non-negative
   CODING_ISO_INITIAL) different from its current designation.

   The macro refers to CODING of the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    struct charset *charset;                                            \
    int reg;                                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
                                                                        \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        /* Nothing to restore for this register.  */                    \
        if (CODING_ISO_INITIAL (coding, reg) < 0)                       \
          continue;                                                     \
        /* Already in the initial state.  */                            \
        if (CODING_ISO_DESIGNATION (coding, reg)                        \
            == CODING_ISO_INITIAL (coding, reg))                        \
          continue;                                                     \
                                                                        \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4356
4357
4358 /* Produce designation sequences of charsets in the line started from
4359    CHARBUF to a place pointed by DST, and return the number of
4360    produced bytes.  DST should not directly point a buffer text area
4361    which may be relocated by char_charset call.
4362
4363    If the current block ends before any end-of-line, we may fail to
4364    find all the necessary designations.  */
4365
4366 static ptrdiff_t
4367 encode_designation_at_bol (struct coding_system *coding,
4368                            int *charbuf, int *charbuf_end,
4369                            unsigned char *dst)
4370 {
4371   unsigned char *orig = dst;
4372   struct charset *charset;
4373   /* Table of charsets to be designated to each graphic register.  */
4374   int r[4];
4375   int c, found = 0, reg;
4376   ptrdiff_t produced_chars = 0;
4377   bool multibytep = coding->dst_multibyte;
4378   Lisp_Object attrs;
4379   Lisp_Object charset_list;
4380
4381   attrs = CODING_ID_ATTRS (coding->id);
4382   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4383   if (EQ (charset_list, Qiso_2022))
4384     charset_list = Viso_2022_charset_list;
4385
4386   for (reg = 0; reg < 4; reg++)
4387     r[reg] = -1;
4388
4389   while (charbuf < charbuf_end && found < 4)
4390     {
4391       int id;
4392
4393       c = *charbuf++;
4394       if (c == '\n')
4395         break;
4396       charset = char_charset (c, charset_list, NULL);
4397       id = CHARSET_ID (charset);
4398       reg = CODING_ISO_REQUEST (coding, id);
4399       if (reg >= 0 && r[reg] < 0)
4400         {
4401           found++;
4402           r[reg] = id;
4403         }
4404     }
4405
4406   if (found)
4407     {
4408       for (reg = 0; reg < 4; reg++)
4409         if (r[reg] >= 0
4410             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4411           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4412     }
4413
4414   return dst - orig;
4415 }
4416
4417 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4418
4419 static bool
4420 encode_coding_iso_2022 (struct coding_system *coding)
4421 {
4422   bool multibytep = coding->dst_multibyte;
4423   int *charbuf = coding->charbuf;
4424   int *charbuf_end = charbuf + coding->charbuf_used;
4425   unsigned char *dst = coding->destination + coding->produced;
4426   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4427   int safe_room = 16;
4428   bool bol_designation
4429     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4430        && CODING_ISO_BOL (coding));
4431   ptrdiff_t produced_chars = 0;
4432   Lisp_Object attrs, eol_type, charset_list;
4433   bool ascii_compatible;
4434   int c;
4435   int preferred_charset_id = -1;
4436
4437   CODING_GET_INFO (coding, attrs, charset_list);
4438   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4439   if (VECTORP (eol_type))
4440     eol_type = Qunix;
4441
4442   setup_iso_safe_charsets (attrs);
4443   /* Charset list may have been changed.  */
4444   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4445   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4446
4447   ascii_compatible
4448     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4449        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4450                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4451
4452   while (charbuf < charbuf_end)
4453     {
4454       ASSURE_DESTINATION (safe_room);
4455
4456       if (bol_designation)
4457         {
4458           /* We have to produce designation sequences if any now.  */
4459           unsigned char desig_buf[16];
4460           ptrdiff_t nbytes;
4461           ptrdiff_t offset;
4462
4463           charset_map_loaded = 0;
4464           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4465                                               desig_buf);
4466           if (charset_map_loaded
4467               && (offset = coding_change_destination (coding)))
4468             {
4469               dst += offset;
4470               dst_end += offset;
4471             }
4472           memcpy (dst, desig_buf, nbytes);
4473           dst += nbytes;
4474           /* We are sure that designation sequences are all ASCII bytes.  */
4475           produced_chars += nbytes;
4476           bol_designation = 0;
4477           ASSURE_DESTINATION (safe_room);
4478         }
4479
4480       c = *charbuf++;
4481
4482       if (c < 0)
4483         {
4484           /* Handle an annotation.  */
4485           switch (*charbuf)
4486             {
4487             case CODING_ANNOTATE_COMPOSITION_MASK:
4488               /* Not yet implemented.  */
4489               break;
4490             case CODING_ANNOTATE_CHARSET_MASK:
4491               preferred_charset_id = charbuf[2];
4492               if (preferred_charset_id >= 0
4493                   && NILP (Fmemq (make_number (preferred_charset_id),
4494                                   charset_list)))
4495                 preferred_charset_id = -1;
4496               break;
4497             default:
4498               emacs_abort ();
4499             }
4500           charbuf += -c - 1;
4501           continue;
4502         }
4503
4504       /* Now encode the character C.  */
4505       if (c < 0x20 || c == 0x7F)
4506         {
4507           if (c == '\n'
4508               || (c == '\r' && EQ (eol_type, Qmac)))
4509             {
4510               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4511                 ENCODE_RESET_PLANE_AND_REGISTER ();
4512               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4513                 {
4514                   int i;
4515
4516                   for (i = 0; i < 4; i++)
4517                     CODING_ISO_DESIGNATION (coding, i)
4518                       = CODING_ISO_INITIAL (coding, i);
4519                 }
4520               bol_designation = ((CODING_ISO_FLAGS (coding)
4521                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4522                                  != 0);
4523             }
4524           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4525             ENCODE_RESET_PLANE_AND_REGISTER ();
4526           EMIT_ONE_ASCII_BYTE (c);
4527         }
4528       else if (ASCII_CHAR_P (c))
4529         {
4530           if (ascii_compatible)
4531             EMIT_ONE_ASCII_BYTE (c);
4532           else
4533             {
4534               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4535               ENCODE_ISO_CHARACTER (charset, c);
4536             }
4537         }
4538       else if (CHAR_BYTE8_P (c))
4539         {
4540           c = CHAR_TO_BYTE8 (c);
4541           EMIT_ONE_BYTE (c);
4542         }
4543       else
4544         {
4545           struct charset *charset;
4546
4547           if (preferred_charset_id >= 0)
4548             {
4549               bool result;
4550
4551               charset = CHARSET_FROM_ID (preferred_charset_id);
4552               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4553               if (! result)
4554                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4555                                      NULL, charset);
4556             }
4557           else
4558             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4559                                  NULL, charset);
4560           if (!charset)
4561             {
4562               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4563                 {
4564                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4565                   charset = CHARSET_FROM_ID (charset_ascii);
4566                 }
4567               else
4568                 {
4569                   c = coding->default_char;
4570                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4571                                        charset_list, NULL, charset);
4572                 }
4573             }
4574           ENCODE_ISO_CHARACTER (charset, c);
4575         }
4576     }
4577
4578   if (coding->mode & CODING_MODE_LAST_BLOCK
4579       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4580     {
4581       ASSURE_DESTINATION (safe_room);
4582       ENCODE_RESET_PLANE_AND_REGISTER ();
4583     }
4584   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4585   CODING_ISO_BOL (coding) = bol_designation;
4586   coding->produced_char += produced_chars;
4587   coding->produced = dst - coding->destination;
4588   return 0;
4589 }
4590
4591 \f
4592 /*** 8,9. SJIS and BIG5 handlers ***/
4593
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  But, in the future, they may be supported only by CCL.  */
4597
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but its two position-codes are divided and
   shifted so that they fit in the ranges below.
4604
4605    --- CODE RANGE of SJIS ---
4606    (character set)      (range)
4607    ASCII                0x00 .. 0x7F
4608    KATAKANA-JISX0201    0xA0 .. 0xDF
4609    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4610             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4611    -------------------------------
4612
4613 */
4614
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set, and each of its characters is encoded in two bytes.
4618
4619    --- CODE RANGE of BIG5 ---
4620    (character set)      (range)
4621    ASCII                0x00 .. 0x7F
4622    Big5 (1st byte)      0xA1 .. 0xFE
4623         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4624    --------------------------
4625
4626   */
4627
4628 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4629    Return true if a text is encoded in SJIS.  */
4630
4631 static bool
4632 detect_coding_sjis (struct coding_system *coding,
4633                     struct coding_detection_info *detect_info)
4634 {
4635   const unsigned char *src = coding->source, *src_base;
4636   const unsigned char *src_end = coding->source + coding->src_bytes;
4637   bool multibytep = coding->src_multibyte;
4638   ptrdiff_t consumed_chars = 0;
4639   int found = 0;
4640   int c;
4641   Lisp_Object attrs, charset_list;
4642   int max_first_byte_of_2_byte_code;
4643
4644   CODING_GET_INFO (coding, attrs, charset_list);
4645   max_first_byte_of_2_byte_code
4646     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4647
4648   detect_info->checked |= CATEGORY_MASK_SJIS;
4649   /* A coding system of this category is always ASCII compatible.  */
4650   src += coding->head_ascii;
4651
4652   while (1)
4653     {
4654       src_base = src;
4655       ONE_MORE_BYTE (c);
4656       if (c < 0x80)
4657         continue;
4658       if ((c >= 0x81 && c <= 0x9F)
4659           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4660         {
4661           ONE_MORE_BYTE (c);
4662           if (c < 0x40 || c == 0x7F || c > 0xFC)
4663             break;
4664           found = CATEGORY_MASK_SJIS;
4665         }
4666       else if (c >= 0xA0 && c < 0xE0)
4667         found = CATEGORY_MASK_SJIS;
4668       else
4669         break;
4670     }
4671   detect_info->rejected |= CATEGORY_MASK_SJIS;
4672   return 0;
4673
4674  no_more_source:
4675   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4676     {
4677       detect_info->rejected |= CATEGORY_MASK_SJIS;
4678       return 0;
4679     }
4680   detect_info->found |= found;
4681   return 1;
4682 }
4683
4684 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4685    Return true if a text is encoded in BIG5.  */
4686
4687 static bool
4688 detect_coding_big5 (struct coding_system *coding,
4689                     struct coding_detection_info *detect_info)
4690 {
4691   const unsigned char *src = coding->source, *src_base;
4692   const unsigned char *src_end = coding->source + coding->src_bytes;
4693   bool multibytep = coding->src_multibyte;
4694   ptrdiff_t consumed_chars = 0;
4695   int found = 0;
4696   int c;
4697
4698   detect_info->checked |= CATEGORY_MASK_BIG5;
4699   /* A coding system of this category is always ASCII compatible.  */
4700   src += coding->head_ascii;
4701
4702   while (1)
4703     {
4704       src_base = src;
4705       ONE_MORE_BYTE (c);
4706       if (c < 0x80)
4707         continue;
4708       if (c >= 0xA1)
4709         {
4710           ONE_MORE_BYTE (c);
4711           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4712             return 0;
4713           found = CATEGORY_MASK_BIG5;
4714         }
4715       else
4716         break;
4717     }
4718   detect_info->rejected |= CATEGORY_MASK_BIG5;
4719   return 0;
4720
4721  no_more_source:
4722   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4723     {
4724       detect_info->rejected |= CATEGORY_MASK_BIG5;
4725       return 0;
4726     }
4727   detect_info->found |= found;
4728   return 1;
4729 }
4730
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode SJIS bytes of CODING->source, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf.
   The charsets used for decoding are taken from the coding system's
   charset list in this order: roman, kana, kanji, and an optional
   second kanji charset.  A byte sequence that is not valid here is
   stored as a single raw-byte character and counted in
   CODING->errors.  On return, CODING->consumed, CODING->consumed_char
   and CODING->charbuf_used are updated.  */

static void
decode_coding_sjis (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the code currently being decoded; SRC is reset to it
     when the code turns out to be invalid.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts produced characters; LAST_OFFSET and LAST_ID
     remember where the current run of a non-ASCII charset started and
     which charset it is.  */
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_DOS is set, or -1 if
     there is none pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* The first three elements of the charset list are required; the
     fourth one is optional.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left.  Give back the byte that was read ahead
             after a CR so that it is not reported as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* NOTE(review): when C is taken from BYTE_AFTER_CR, SRC_BASE
         already points past that byte, so the rewind at `invalid_code'
         below re-reads the byte following it -- confirm whether this
         is intended.  */
      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* With DOS-style EOL, read one byte ahead after a CR; it is
             used as the next byte in the following iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          /* Here C is in 0x81..0x9F or 0xE0..0xEF.  */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          /* Here C is in 0xF0..0xFC; these first bytes are accepted
             only when the optional fourth charset exists.  */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* On a switch to a different non-ASCII charset, close the
         previous non-ASCII run (if any) with a charset annotation and
         start a new run here.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the current code, read just one byte
         and store it as a raw-byte character (or, if the value read
         is negative, its negation).  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached when the loop above is left by `break'; presumably also
     the jump target of ONE_MORE_BYTE when the source is exhausted
     (the macro is defined outside this part of the file).  Only what
     precedes SRC_BASE is reported as consumed.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4850
/* Decode BIG5 bytes of CODING->source, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf.
   The first element of the coding system's charset list is used for
   bytes below 0x80 and the second one for two-byte codes.  A byte
   sequence that is not valid here is stored as a single raw-byte
   character and counted in CODING->errors.  On return,
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  */

static void
decode_coding_big5 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the code currently being decoded; SRC is reset to it
     when the code turns out to be invalid.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts produced characters; LAST_OFFSET and LAST_ID
     remember where the current run of a non-ASCII charset started and
     which charset it is.  */
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_DOS is set, or -1 if
     there is none pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left.  Give back the byte that was read ahead
             after a CR so that it is not reported as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* NOTE(review): when C is taken from BYTE_AFTER_CR, SRC_BASE
         already points past that byte, so the rewind at `invalid_code'
         below re-reads the byte following it -- confirm whether this
         is intended.  */
      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* With DOS-style EOL, read one byte ahead after a CR; it is
             used as the next byte in the following iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          /* The first byte must be in 0xA1..0xFE, the second one in
             0x40..0x7E or 0xA1..0xFE.  */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* On a switch to a different non-ASCII charset, close the
         previous non-ASCII run (if any) with a charset annotation and
         start a new run here.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the current code, read just one byte
         and store it as a raw-byte character (or, if the value read
         is negative, its negation).  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached when the loop above is left by `break'; presumably also
     the jump target of ONE_MORE_BYTE when the source is exhausted
     (the macro is defined outside this part of the file).  Only what
     precedes SRC_BASE is reported as consumed.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4946
4947 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4948    This function can encode charsets `ascii', `katakana-jisx0201',
4949    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4950    are sure that all these charsets are registered as official charset
4951    (i.e. do not have extended leading-codes).  Characters of other
4952    charsets are produced without any encoding.  */
4953
4954 static bool
4955 encode_coding_sjis (struct coding_system *coding)
4956 {
4957   bool multibytep = coding->dst_multibyte;
4958   int *charbuf = coding->charbuf;
4959   int *charbuf_end = charbuf + coding->charbuf_used;
4960   unsigned char *dst = coding->destination + coding->produced;
4961   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4962   int safe_room = 4;
4963   ptrdiff_t produced_chars = 0;
4964   Lisp_Object attrs, charset_list, val;
4965   bool ascii_compatible;
4966   struct charset *charset_kanji, *charset_kana;
4967   struct charset *charset_kanji2;
4968   int c;
4969
4970   CODING_GET_INFO (coding, attrs, charset_list);
4971   val = XCDR (charset_list);
4972   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4973   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4974   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4975
4976   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4977
4978   while (charbuf < charbuf_end)
4979     {
4980       ASSURE_DESTINATION (safe_room);
4981       c = *charbuf++;
4982       /* Now encode the character C.  */
4983       if (ASCII_CHAR_P (c) && ascii_compatible)
4984         EMIT_ONE_ASCII_BYTE (c);
4985       else if (CHAR_BYTE8_P (c))
4986         {
4987           c = CHAR_TO_BYTE8 (c);
4988           EMIT_ONE_BYTE (c);
4989         }
4990       else
4991         {
4992           unsigned code;
4993           struct charset *charset;
4994           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4995                                &code, charset);
4996
4997           if (!charset)
4998             {
4999               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5000                 {
5001                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5002                   charset = CHARSET_FROM_ID (charset_ascii);
5003                 }
5004               else
5005                 {
5006                   c = coding->default_char;
5007                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5008                                        charset_list, &code, charset);
5009                 }
5010             }
5011           if (code == CHARSET_INVALID_CODE (charset))
5012             emacs_abort ();
5013           if (charset == charset_kanji)
5014             {
5015               int c1, c2;
5016               JIS_TO_SJIS (code);
5017               c1 = code >> 8, c2 = code & 0xFF;
5018               EMIT_TWO_BYTES (c1, c2);
5019             }
5020           else if (charset == charset_kana)
5021             EMIT_ONE_BYTE (code | 0x80);
5022           else if (charset_kanji2 && charset == charset_kanji2)
5023             {
5024               int c1, c2;
5025
5026               c1 = code >> 8;
5027               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5028                   || c1 == 0x28
5029                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5030                 {
5031                   JIS_TO_SJIS2 (code);
5032                   c1 = code >> 8, c2 = code & 0xFF;
5033                   EMIT_TWO_BYTES (c1, c2);
5034                 }
5035               else
5036                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5037             }
5038           else
5039             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5040         }
5041     }
5042   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5043   coding->produced_char += produced_chars;
5044   coding->produced = dst - coding->destination;
5045   return 0;
5046 }
5047
5048 static bool
5049 encode_coding_big5 (struct coding_system *coding)
5050 {
5051   bool multibytep = coding->dst_multibyte;
5052   int *charbuf = coding->charbuf;
5053   int *charbuf_end = charbuf + coding->charbuf_used;
5054   unsigned char *dst = coding->destination + coding->produced;
5055   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5056   int safe_room = 4;
5057   ptrdiff_t produced_chars = 0;
5058   Lisp_Object attrs, charset_list, val;
5059   bool ascii_compatible;
5060   struct charset *charset_big5;
5061   int c;
5062
5063   CODING_GET_INFO (coding, attrs, charset_list);
5064   val = XCDR (charset_list);
5065   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5066   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5067
5068   while (charbuf < charbuf_end)
5069     {
5070       ASSURE_DESTINATION (safe_room);
5071       c = *charbuf++;
5072       /* Now encode the character C.  */
5073       if (ASCII_CHAR_P (c) && ascii_compatible)
5074         EMIT_ONE_ASCII_BYTE (c);
5075       else if (CHAR_BYTE8_P (c))
5076         {
5077           c = CHAR_TO_BYTE8 (c);
5078           EMIT_ONE_BYTE (c);
5079         }
5080       else
5081         {
5082           unsigned code;
5083           struct charset *charset;
5084           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5085                                &code, charset);
5086
5087           if (! charset)
5088             {
5089               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5090                 {
5091                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5092                   charset = CHARSET_FROM_ID (charset_ascii);
5093                 }
5094               else
5095                 {
5096                   c = coding->default_char;
5097                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5098                                        charset_list, &code, charset);
5099                 }
5100             }
5101           if (code == CHARSET_INVALID_CODE (charset))
5102             emacs_abort ();
5103           if (charset == charset_big5)
5104             {
5105               int c1, c2;
5106
5107               c1 = code >> 8, c2 = code & 0xFF;
5108               EMIT_TWO_BYTES (c1, c2);
5109             }
5110           else
5111             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5112         }
5113     }
5114   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5115   coding->produced_char += produced_chars;
5116   coding->produced = dst - coding->destination;
5117   return 0;
5118 }
5119
5120 \f
5121 /*** 10. CCL handlers ***/
5122
5123 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5124    Return true if a text is encoded in a coding system of which
5125    encoder/decoder are written in CCL program.  */
5126
5127 static bool
5128 detect_coding_ccl (struct coding_system *coding,
5129                    struct coding_detection_info *detect_info)
5130 {
5131   const unsigned char *src = coding->source, *src_base;
5132   const unsigned char *src_end = coding->source + coding->src_bytes;
5133   bool multibytep = coding->src_multibyte;
5134   ptrdiff_t consumed_chars = 0;
5135   int found = 0;
5136   unsigned char *valids;
5137   ptrdiff_t head_ascii = coding->head_ascii;
5138   Lisp_Object attrs;
5139
5140   detect_info->checked |= CATEGORY_MASK_CCL;
5141
5142   coding = &coding_categories[coding_category_ccl];
5143   valids = CODING_CCL_VALIDS (coding);
5144   attrs = CODING_ID_ATTRS (coding->id);
5145   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5146     src += head_ascii;
5147
5148   while (1)
5149     {
5150       int c;
5151
5152       src_base = src;
5153       ONE_MORE_BYTE (c);
5154       if (c < 0 || ! valids[c])
5155         break;
5156       if ((valids[c] > 1))
5157         found = CATEGORY_MASK_CCL;
5158     }
5159   detect_info->rejected |= CATEGORY_MASK_CCL;
5160   return 0;
5161
5162  no_more_source:
5163   detect_info->found |= found;
5164   return 1;
5165 }
5166
/* Decode CODING->source, starting at offset CODING->consumed, by
   running the CCL program of CODING, and append the result to
   CODING->charbuf.  The source is handed to the CCL driver in chunks
   of at most 1024 elements.  The final status of the CCL program is
   translated into a conversion result, and CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.  */

static void
decode_coding_ccl (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0;
  bool multibytep = coding->src_multibyte;
  struct ccl_program *ccl = &coding->spec.ccl->ccl;
  /* One chunk of input for the CCL driver.  */
  int source_charbuf[1024];
  /* For multibyte source: byte offset (from SRC) of each element of
     SOURCE_CHARBUF, plus one more entry for the end of the chunk.  */
  int source_byteidx[1025];
  Lisp_Object attrs, charset_list;

  CODING_GET_INFO (coding, attrs, charset_list);

  while (1)
    {
      const unsigned char *p = src;
      ptrdiff_t offset;
      int i = 0;

      /* Fill SOURCE_CHARBUF with characters (multibyte source) or
         with bytes (unibyte source).  */
      if (multibytep)
        {
          while (i < 1024 && p < src_end)
            {
              source_byteidx[i] = p - src;
              source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
            }
          source_byteidx[i] = p - src;
        }
      else
        while (i < 1024 && p < src_end)
          source_charbuf[i++] = *p++;

      /* This chunk reaches the end of the source, and no more source
         will follow.  */
      if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
        ccl->last_block = true;
      /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
      charset_map_loaded = 0;
      ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
                  charset_list);
      /* If the source was moved, shift every pointer into it by the
         same amount.  */
      if (charset_map_loaded
          && (offset = coding_change_source (coding)))
        {
          p += offset;
          src += offset;
          src_end += offset;
        }
      charbuf += ccl->produced;
      /* Advance SRC by what the CCL program consumed; for multibyte
         source, the element count is converted to a byte offset.  */
      if (multibytep)
        src += source_byteidx[ccl->consumed];
      else
        src += ccl->consumed;
      consumed_chars += ccl->consumed;
      /* Go on only if there is more source and the CCL program
         stopped because it wants more of it.  */
      if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
        break;
    }

  /* Translate the status of the CCL program into a conversion
     result.  */
  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      record_conversion_result (coding, CODING_RESULT_INTERRUPT);
      break;
    default:
      record_conversion_result (coding, CODING_RESULT_SUCCESS);
      break;
    }
  coding->consumed_char += consumed_chars;
  coding->consumed = src - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5245
5246 static bool
5247 encode_coding_ccl (struct coding_system *coding)
5248 {
5249   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5250   bool multibytep = coding->dst_multibyte;
5251   int *charbuf = coding->charbuf;
5252   int *charbuf_end = charbuf + coding->charbuf_used;
5253   unsigned char *dst = coding->destination + coding->produced;
5254   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5255   int destination_charbuf[1024];
5256   ptrdiff_t produced_chars = 0;
5257   int i;
5258   Lisp_Object attrs, charset_list;
5259
5260   CODING_GET_INFO (coding, attrs, charset_list);
5261   if (coding->consumed_char == coding->src_chars
5262       && coding->mode & CODING_MODE_LAST_BLOCK)
5263     ccl->last_block = true;
5264
5265   do
5266     {
5267       ptrdiff_t offset;
5268
5269       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5270       charset_map_loaded = 0;
5271       ccl_driver (ccl, charbuf, destination_charbuf,
5272                   charbuf_end - charbuf, 1024, charset_list);
5273       if (charset_map_loaded
5274           && (offset = coding_change_destination (coding)))
5275         dst += offset;
5276       if (multibytep)
5277         {
5278           ASSURE_DESTINATION (ccl->produced * 2);
5279           for (i = 0; i < ccl->produced; i++)
5280             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5281         }
5282       else
5283         {
5284           ASSURE_DESTINATION (ccl->produced);
5285           for (i = 0; i < ccl->produced; i++)
5286             *dst++ = destination_charbuf[i] & 0xFF;
5287           produced_chars += ccl->produced;
5288         }
5289       charbuf += ccl->consumed;
5290       if (ccl->status == CCL_STAT_QUIT
5291           || ccl->status == CCL_STAT_INVALID_CMD)
5292         break;
5293     }
5294   while (charbuf < charbuf_end);
5295
5296   switch (ccl->status)
5297     {
5298     case CCL_STAT_SUSPEND_BY_SRC:
5299       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5300       break;
5301     case CCL_STAT_SUSPEND_BY_DST:
5302       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5303       break;
5304     case CCL_STAT_QUIT:
5305     case CCL_STAT_INVALID_CMD:
5306       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5307       break;
5308     default:
5309       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5310       break;
5311     }
5312
5313   coding->produced_char += produced_chars;
5314   coding->produced = dst - coding->destination;
5315   return 0;
5316 }
5317
5318 \f
5319 /*** 10, 11. no-conversion handlers ***/
5320
5321 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5322
5323 static void
5324 decode_coding_raw_text (struct coding_system *coding)
5325 {
5326   bool eol_dos
5327     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5328
5329   coding->chars_at_source = 1;
5330   coding->consumed_char = coding->src_chars;
5331   coding->consumed = coding->src_bytes;
5332   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5333     {
5334       coding->consumed_char--;
5335       coding->consumed--;
5336       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5337     }
5338   else
5339     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5340 }
5341
5342 static bool
5343 encode_coding_raw_text (struct coding_system *coding)
5344 {
5345   bool multibytep = coding->dst_multibyte;
5346   int *charbuf = coding->charbuf;
5347   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5348   unsigned char *dst = coding->destination + coding->produced;
5349   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5350   ptrdiff_t produced_chars = 0;
5351   int c;
5352
5353   if (multibytep)
5354     {
5355       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5356
5357       if (coding->src_multibyte)
5358         while (charbuf < charbuf_end)
5359           {
5360             ASSURE_DESTINATION (safe_room);
5361             c = *charbuf++;
5362             if (ASCII_CHAR_P (c))
5363               EMIT_ONE_ASCII_BYTE (c);
5364             else if (CHAR_BYTE8_P (c))
5365               {
5366                 c = CHAR_TO_BYTE8 (c);
5367                 EMIT_ONE_BYTE (c);
5368               }
5369             else
5370               {
5371                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5372
5373                 CHAR_STRING_ADVANCE (c, p1);
5374                 do
5375                   {
5376                     EMIT_ONE_BYTE (*p0);
5377                     p0++;
5378                   }
5379                 while (p0 < p1);
5380               }
5381           }
5382       else
5383         while (charbuf < charbuf_end)
5384           {
5385             ASSURE_DESTINATION (safe_room);
5386             c = *charbuf++;
5387             EMIT_ONE_BYTE (c);
5388           }
5389     }
5390   else
5391     {
5392       if (coding->src_multibyte)
5393         {
5394           int safe_room = MAX_MULTIBYTE_LENGTH;
5395
5396           while (charbuf < charbuf_end)
5397             {
5398               ASSURE_DESTINATION (safe_room);
5399               c = *charbuf++;
5400               if (ASCII_CHAR_P (c))
5401                 *dst++ = c;
5402               else if (CHAR_BYTE8_P (c))
5403                 *dst++ = CHAR_TO_BYTE8 (c);
5404               else
5405                 CHAR_STRING_ADVANCE (c, dst);
5406             }
5407         }
5408       else
5409         {
5410           ASSURE_DESTINATION (charbuf_end - charbuf);
5411           while (charbuf < charbuf_end && dst < dst_end)
5412             *dst++ = *charbuf++;
5413         }
5414       produced_chars = dst - (coding->destination + coding->produced);
5415     }
5416   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5417   coding->produced_char += produced_chars;
5418   coding->produced = dst - coding->destination;
5419   return 0;
5420 }
5421
5422 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5423    Return true if a text is encoded in a charset-based coding system.  */
5424
5425 static bool
5426 detect_coding_charset (struct coding_system *coding,
5427                        struct coding_detection_info *detect_info)
5428 {
5429   const unsigned char *src = coding->source, *src_base;
5430   const unsigned char *src_end = coding->source + coding->src_bytes;
5431   bool multibytep = coding->src_multibyte;
5432   ptrdiff_t consumed_chars = 0;
5433   Lisp_Object attrs, valids, name;
5434   int found = 0;
5435   ptrdiff_t head_ascii = coding->head_ascii;
5436   bool check_latin_extra = 0;
5437
5438   detect_info->checked |= CATEGORY_MASK_CHARSET;
5439
5440   coding = &coding_categories[coding_category_charset];
5441   attrs = CODING_ID_ATTRS (coding->id);
5442   valids = AREF (attrs, coding_attr_charset_valids);
5443   name = CODING_ID_NAME (coding->id);
5444   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5445                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5446       || strncmp (SSDATA (SYMBOL_NAME (name)),
5447                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5448     check_latin_extra = 1;
5449
5450   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5451     src += head_ascii;
5452
5453   while (1)
5454     {
5455       int c;
5456       Lisp_Object val;
5457       struct charset *charset;
5458       int dim, idx;
5459
5460       src_base = src;
5461       ONE_MORE_BYTE (c);
5462       if (c < 0)
5463         continue;
5464       val = AREF (valids, c);
5465       if (NILP (val))
5466         break;
5467       if (c >= 0x80)
5468         {
5469           if (c < 0xA0
5470               && check_latin_extra
5471               && (!VECTORP (Vlatin_extra_code_table)
5472                   || NILP (AREF (Vlatin_extra_code_table, c))))
5473             break;
5474           found = CATEGORY_MASK_CHARSET;
5475         }
5476       if (INTEGERP (val))
5477         {
5478           charset = CHARSET_FROM_ID (XFASTINT (val));
5479           dim = CHARSET_DIMENSION (charset);
5480           for (idx = 1; idx < dim; idx++)
5481             {
5482               if (src == src_end)
5483                 goto too_short;
5484               ONE_MORE_BYTE (c);
5485               if (c < charset->code_space[(dim - 1 - idx) * 4]
5486                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5487                 break;
5488             }
5489           if (idx < dim)
5490             break;
5491         }
5492       else
5493         {
5494           idx = 1;
5495           for (; CONSP (val); val = XCDR (val))
5496             {
5497               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5498               dim = CHARSET_DIMENSION (charset);
5499               while (idx < dim)
5500                 {
5501                   if (src == src_end)
5502                     goto too_short;
5503                   ONE_MORE_BYTE (c);
5504                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5505                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5506                     break;
5507                   idx++;
5508                 }
5509               if (idx == dim)
5510                 {
5511                   val = Qnil;
5512                   break;
5513                 }
5514             }
5515           if (CONSP (val))
5516             break;
5517         }
5518     }
5519  too_short:
5520   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5521   return 0;
5522
5523  no_more_source:
5524   detect_info->found |= found;
5525   return 1;
5526 }
5527
/* Decoder for charset-based coding systems.

   Read source bytes of CODING starting at coding->source +
   coding->consumed and append the decoded characters to
   coding->charbuf.  A byte that cannot be decoded is stored as a
   single element (see the `invalid_code' label) and counted in
   coding->errors.  On exit, coding->consumed_char, coding->consumed
   and coding->charbuf_used are updated.

   Several locals (src, src_base, src_end, multibytep, consumed_chars)
   are not passed to ONE_MORE_BYTE, which is defined elsewhere;
   presumably that macro refers to them by name and is what jumps to
   the `no_more_source' label.  */
5528 static void
5529 decode_coding_charset (struct coding_system *coding)
5530 {
5531   const unsigned char *src = coding->source + coding->consumed;
5532   const unsigned char *src_end = coding->source + coding->src_bytes;
5533   const unsigned char *src_base;
5534   int *charbuf = coding->charbuf + coding->charbuf_used;
5535   /* We may produce one charset annotation in one loop and one more at
5536      the end.  */
5537   int *charbuf_end
5538     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5539   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5540   bool multibytep = coding->src_multibyte;
5541   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5542   Lisp_Object valids;
  /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
     LAST_ID describe the current run of characters that share one
     non-ASCII charset.  */
5543   ptrdiff_t char_offset = coding->produced_char;
5544   ptrdiff_t last_offset = char_offset;
5545   int last_id = charset_ascii;
5546   bool eol_dos
5547     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte already fetched after a CR (DOS EOL only), or -1 if none.  */
5548   int byte_after_cr = -1;
5549
5550   valids = AREF (attrs, coding_attr_charset_valids);
5551
5552   while (1)
5553     {
5554       int c;
5555       Lisp_Object val;
5556       struct charset *charset;
5557       int dim;
5558       int len = 1;
5559       unsigned code;
5560
      /* Remember where this character starts so that it can be
         re-read (invalid_code) or reported as unconsumed (exit).  */
5561       src_base = src;
5562       consumed_chars_base = consumed_chars;
5563
5564       if (charbuf >= charbuf_end)
5565         {
          /* A pending look-ahead byte has not been decoded yet; step
             SRC_BASE back over it so that coding->consumed, computed
             from SRC_BASE below, does not include it.  */
5566           if (byte_after_cr >= 0)
5567             src_base--;
5568           break;
5569         }
5570
5571       if (byte_after_cr >= 0)
5572         {
          /* Use the byte fetched together with the preceding CR.  */
5573           c = byte_after_cr;
5574           byte_after_cr = -1;
5575         }
5576       else
5577         {
5578           ONE_MORE_BYTE (c);
          /* With DOS EOL, the byte following a CR is fetched right
             away and handed to the next iteration.  */
5579           if (eol_dos && c == '\r')
5580             ONE_MORE_BYTE (byte_after_cr);
5581         }
5582       if (c < 0)
5583         goto invalid_code;
5584       code = c;
5585
      /* VALIDS maps a lead byte to a charset ID, to a list of
         charset IDs, or to something else when the byte is invalid.  */
5586       val = AREF (valids, c);
5587       if (! INTEGERP (val) && ! CONSP (val))
5588         goto invalid_code;
5589       if (INTEGERP (val))
5590         {
          /* Single candidate: gather the remaining bytes of the code
             point, most significant byte first.  */
5591           charset = CHARSET_FROM_ID (XFASTINT (val));
5592           dim = CHARSET_DIMENSION (charset);
5593           while (len < dim)
5594             {
5595               ONE_MORE_BYTE (c);
5596               code = (code << 8) | c;
5597               len++;
5598             }
5599           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5600                               charset, code, c);
5601         }
5602       else
5603         {
5604           /* VAL is a list of charset IDs.  It is assured that the
5605              list is sorted by charset dimensions (smaller one
5606              comes first).  */
          /* Try the candidates in order, reading more bytes only when
             a candidate needs a longer code, and stop at the first
             one for which decoding yields a non-negative C.  */
5607           while (CONSP (val))
5608             {
5609               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5610               dim = CHARSET_DIMENSION (charset);
5611               while (len < dim)
5612                 {
5613                   ONE_MORE_BYTE (c);
5614                   code = (code << 8) | c;
5615                   len++;
5616                 }
5617               CODING_DECODE_CHAR (coding, src, src_base,
5618                                   src_end, charset, code, c);
5619               if (c >= 0)
5620                 break;
5621               val = XCDR (val);
5622             }
5623         }
5624       if (c < 0)
5625         goto invalid_code;
      /* A different non-ASCII charset starts here: flush the
         annotation for the previous non-ASCII run, if any, and begin
         a new run.  ASCII characters do not end a run.  */
5626       if (charset->id != charset_ascii
5627           && last_id != charset->id)
5628         {
5629           if (last_id != charset_ascii)
5630             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5631           last_id = charset->id;
5632           last_offset = char_offset;
5633         }
5634
5635       *charbuf++ = c;
5636       char_offset++;
5637       continue;
5638
5639     invalid_code:
      /* Rewind to the start of the failed sequence and emit just one
         source element: the negated value if ONE_MORE_BYTE returned a
         negative C, the byte itself if ASCII, otherwise the raw-byte
         character for it.
         NOTE(review): BYTE_AFTER_CR is neither cleared nor taken into
         account here.  If the failing byte is a CR with a look-ahead
         byte pending, or is itself a look-ahead byte (in which case
         SRC_BASE already points past it), it looks as if a source
         byte could be decoded twice or skipped -- confirm.  */
5640       src = src_base;
5641       consumed_chars = consumed_chars_base;
5642       ONE_MORE_BYTE (c);
5643       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5644       char_offset++;
5645       coding->errors++;
5646     }
5647
  /* Reached by falling out of the loop above when CHARBUF is full;
     presumably also the target of ONE_MORE_BYTE when the source is
     exhausted.  */
5648  no_more_source:
5649   if (last_id != charset_ascii)
5650     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Report progress up to the start of the last incomplete item.  */
5651   coding->consumed_char += consumed_chars_base;
5652   coding->consumed = src_base - coding->source;
5653   coding->charbuf_used = charbuf - coding->charbuf;
5654 }
5655
5656 static bool
5657 encode_coding_charset (struct coding_system *coding)
5658 {
5659   bool multibytep = coding->dst_multibyte;
5660   int *charbuf = coding->charbuf;
5661   int *charbuf_end = charbuf + coding->charbuf_used;
5662   unsigned char *dst = coding->destination + coding->produced;
5663   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5664   int safe_room = MAX_MULTIBYTE_LENGTH;
5665   ptrdiff_t produced_chars = 0;
5666   Lisp_Object attrs, charset_list;
5667   bool ascii_compatible;
5668   int c;
5669
5670   CODING_GET_INFO (coding, attrs, charset_list);
5671   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5672
5673   while (charbuf < charbuf_end)
5674     {
5675       struct charset *charset;
5676       unsigned code;
5677
5678       ASSURE_DESTINATION (safe_room);
5679       c = *charbuf++;
5680       if (ascii_compatible && ASCII_CHAR_P (c))
5681         EMIT_ONE_ASCII_BYTE (c);
5682       else if (CHAR_BYTE8_P (c))
5683         {
5684           c = CHAR_TO_BYTE8 (c);
5685           EMIT_ONE_BYTE (c);
5686         }
5687       else
5688         {
5689           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5690                                &code, charset);
5691
5692           if (charset)
5693             {
5694               if (CHARSET_DIMENSION (charset) == 1)
5695                 EMIT_ONE_BYTE (code);
5696               else if (CHARSET_DIMENSION (charset) == 2)
5697                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5698               else if (CHARSET_DIMENSION (charset) == 3)
5699                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5700               else
5701                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5702                                  (code >> 8) & 0xFF, code & 0xFF);
5703             }
5704           else
5705             {
5706               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5707                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5708               else
5709                 c = coding->default_char;
5710               EMIT_ONE_BYTE (c);
5711             }
5712         }
5713     }
5714
5715   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5716   coding->produced_char += produced_chars;
5717   coding->produced = dst - coding->destination;
5718   return 0;
5719 }
5720
5721 \f
5722 /*** 10. C library functions ***/
5723
5724 /* Setup coding context CODING from information about CODING_SYSTEM.
5725    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5726    CODING_SYSTEM is invalid, signal an error.  */
5727
5728 void
5729 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5730 {
5731   Lisp_Object attrs;
5732   Lisp_Object eol_type;
5733   Lisp_Object coding_type;
5734   Lisp_Object val;
5735
5736   if (NILP (coding_system))
5737     coding_system = Qundecided;
5738
5739   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5740
5741   attrs = CODING_ID_ATTRS (coding->id);
5742   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5743
5744   coding->mode = 0;
5745   if (VECTORP (eol_type))
5746     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5747                             | CODING_REQUIRE_DETECTION_MASK);
5748   else if (! EQ (eol_type, Qunix))
5749     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5750                             | CODING_REQUIRE_ENCODING_MASK);
5751   else
5752     coding->common_flags = 0;
5753   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5754     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5755   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5756     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5757   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5758     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5759
5760   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5761   coding->max_charset_id = SCHARS (val) - 1;
5762   coding->safe_charsets = SDATA (val);
5763   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5764   coding->carryover_bytes = 0;
5765   coding->raw_destination = 0;
5766
5767   coding_type = CODING_ATTR_TYPE (attrs);
5768   if (EQ (coding_type, Qundecided))
5769     {
5770       coding->detector = NULL;
5771       coding->decoder = decode_coding_raw_text;
5772       coding->encoder = encode_coding_raw_text;
5773       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5774       coding->spec.undecided.inhibit_nbd
5775         = (encode_inhibit_flag
5776            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5777       coding->spec.undecided.inhibit_ied
5778         = (encode_inhibit_flag
5779            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5780       coding->spec.undecided.prefer_utf_8
5781         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5782     }
5783   else if (EQ (coding_type, Qiso_2022))
5784     {
5785       int i;
5786       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5787
5788       /* Invoke graphic register 0 to plane 0.  */
5789       CODING_ISO_INVOCATION (coding, 0) = 0;
5790       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5791       CODING_ISO_INVOCATION (coding, 1)
5792         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5793       /* Setup the initial status of designation.  */
5794       for (i = 0; i < 4; i++)
5795         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5796       /* Not single shifting initially.  */
5797       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5798       /* Beginning of buffer should also be regarded as bol. */
5799       CODING_ISO_BOL (coding) = 1;
5800       coding->detector = detect_coding_iso_2022;
5801       coding->decoder = decode_coding_iso_2022;
5802       coding->encoder = encode_coding_iso_2022;
5803       if (flags & CODING_ISO_FLAG_SAFE)
5804         coding->mode |= CODING_MODE_SAFE_ENCODING;
5805       coding->common_flags
5806         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5807             | CODING_REQUIRE_FLUSHING_MASK);
5808       if (flags & CODING_ISO_FLAG_COMPOSITION)
5809         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5810       if (flags & CODING_ISO_FLAG_DESIGNATION)
5811         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5812       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5813         {
5814           setup_iso_safe_charsets (attrs);
5815           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5816           coding->max_charset_id = SCHARS (val) - 1;
5817           coding->safe_charsets = SDATA (val);
5818         }
5819       CODING_ISO_FLAGS (coding) = flags;
5820       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5821       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5822       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5823       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5824     }
5825   else if (EQ (coding_type, Qcharset))
5826     {
5827       coding->detector = detect_coding_charset;
5828       coding->decoder = decode_coding_charset;
5829       coding->encoder = encode_coding_charset;
5830       coding->common_flags
5831         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5832     }
5833   else if (EQ (coding_type, Qutf_8))
5834     {
5835       val = AREF (attrs, coding_attr_utf_bom);
5836       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5837                                    : EQ (val, Qt) ? utf_with_bom
5838                                    : utf_without_bom);
5839       coding->detector = detect_coding_utf_8;
5840       coding->decoder = decode_coding_utf_8;
5841       coding->encoder = encode_coding_utf_8;
5842       coding->common_flags
5843         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5844       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5845         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5846     }
5847   else if (EQ (coding_type, Qutf_16))
5848     {
5849       val = AREF (attrs, coding_attr_utf_bom);
5850       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5851                                     : EQ (val, Qt) ? utf_with_bom
5852                                     : utf_without_bom);
5853       val = AREF (attrs, coding_attr_utf_16_endian);
5854       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5855                                        : utf_16_little_endian);
5856       CODING_UTF_16_SURROGATE (coding) = 0;
5857       coding->detector = detect_coding_utf_16;
5858       coding->decoder = decode_coding_utf_16;
5859       coding->encoder = encode_coding_utf_16;
5860       coding->common_flags
5861         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5862       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5863         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5864     }
5865   else if (EQ (coding_type, Qccl))
5866     {
5867       coding->detector = detect_coding_ccl;
5868       coding->decoder = decode_coding_ccl;
5869       coding->encoder = encode_coding_ccl;
5870       coding->common_flags
5871         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5872             | CODING_REQUIRE_FLUSHING_MASK);
5873     }
5874   else if (EQ (coding_type, Qemacs_mule))
5875     {
5876       coding->detector = detect_coding_emacs_mule;
5877       coding->decoder = decode_coding_emacs_mule;
5878       coding->encoder = encode_coding_emacs_mule;
5879       coding->common_flags
5880         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5881       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5882           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5883         {
5884           Lisp_Object tail, safe_charsets;
5885           int max_charset_id = 0;
5886
5887           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5888                tail = XCDR (tail))
5889             if (max_charset_id < XFASTINT (XCAR (tail)))
5890               max_charset_id = XFASTINT (XCAR (tail));
5891           safe_charsets = make_uninit_string (max_charset_id + 1);
5892           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5893           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5894                tail = XCDR (tail))
5895             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5896           coding->max_charset_id = max_charset_id;
5897           coding->safe_charsets = SDATA (safe_charsets);
5898         }
5899       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5900       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5901     }
5902   else if (EQ (coding_type, Qshift_jis))
5903     {
5904       coding->detector = detect_coding_sjis;
5905       coding->decoder = decode_coding_sjis;
5906       coding->encoder = encode_coding_sjis;
5907       coding->common_flags
5908         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5909     }
5910   else if (EQ (coding_type, Qbig5))
5911     {
5912       coding->detector = detect_coding_big5;
5913       coding->decoder = decode_coding_big5;
5914       coding->encoder = encode_coding_big5;
5915       coding->common_flags
5916         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5917     }
5918   else                          /* EQ (coding_type, Qraw_text) */
5919     {
5920       coding->detector = NULL;
5921       coding->decoder = decode_coding_raw_text;
5922       coding->encoder = encode_coding_raw_text;
5923       if (! EQ (eol_type, Qunix))
5924         {
5925           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5926           if (! VECTORP (eol_type))
5927             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5928         }
5929
5930     }
5931
5932   return;
5933 }
5934
5935 /* Return a list of charsets supported by CODING.  */
5936
5937 Lisp_Object
5938 coding_charset_list (struct coding_system *coding)
5939 {
5940   Lisp_Object attrs, charset_list;
5941
5942   CODING_GET_INFO (coding, attrs, charset_list);
5943   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5944     {
5945       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5946
5947       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5948         charset_list = Viso_2022_charset_list;
5949     }
5950   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5951     {
5952       charset_list = Vemacs_mule_charset_list;
5953     }
5954   return charset_list;
5955 }
5956
5957
5958 /* Return a list of charsets supported by CODING-SYSTEM.  */
5959
5960 Lisp_Object
5961 coding_system_charset_list (Lisp_Object coding_system)
5962 {
5963   ptrdiff_t id;
5964   Lisp_Object attrs, charset_list;
5965
5966   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5967   attrs = CODING_ID_ATTRS (id);
5968
5969   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5970     {
5971       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5972
5973       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5974         charset_list = Viso_2022_charset_list;
5975       else
5976         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5977     }
5978   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5979     {
5980       charset_list = Vemacs_mule_charset_list;
5981     }
5982   else
5983     {
5984       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5985     }
5986   return charset_list;
5987 }
5988
5989
5990 /* Return raw-text or one of its subsidiaries that has the same
5991    eol_type as CODING-SYSTEM.  */
5992
5993 Lisp_Object
5994 raw_text_coding_system (Lisp_Object coding_system)
5995 {
5996   Lisp_Object spec, attrs;
5997   Lisp_Object eol_type, raw_text_eol_type;
5998
5999   if (NILP (coding_system))
6000     return Qraw_text;
6001   spec = CODING_SYSTEM_SPEC (coding_system);
6002   attrs = AREF (spec, 0);
6003
6004   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6005     return coding_system;
6006
6007   eol_type = AREF (spec, 2);
6008   if (VECTORP (eol_type))
6009     return Qraw_text;
6010   spec = CODING_SYSTEM_SPEC (Qraw_text);
6011   raw_text_eol_type = AREF (spec, 2);
6012   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6013           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6014           : AREF (raw_text_eol_type, 2));
6015 }
6016
6017
6018 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6019    the subsidiary that has the same eol-spec as PARENT (if it is not
6020    nil and specifies end-of-line format) or the system's setting
6021    (system_eol_type).  */
6022
6023 Lisp_Object
6024 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6025 {
6026   Lisp_Object spec, eol_type;
6027
6028   if (NILP (coding_system))
6029     coding_system = Qraw_text;
6030   spec = CODING_SYSTEM_SPEC (coding_system);
6031   eol_type = AREF (spec, 2);
6032   if (VECTORP (eol_type))
6033     {
6034       Lisp_Object parent_eol_type;
6035
6036       if (! NILP (parent))
6037         {
6038           Lisp_Object parent_spec;
6039
6040           parent_spec = CODING_SYSTEM_SPEC (parent);
6041           parent_eol_type = AREF (parent_spec, 2);
6042           if (VECTORP (parent_eol_type))
6043             parent_eol_type = system_eol_type;
6044         }
6045       else
6046         parent_eol_type = system_eol_type;
6047       if (EQ (parent_eol_type, Qunix))
6048         coding_system = AREF (eol_type, 0);
6049       else if (EQ (parent_eol_type, Qdos))
6050         coding_system = AREF (eol_type, 1);
6051       else if (EQ (parent_eol_type, Qmac))
6052         coding_system = AREF (eol_type, 2);
6053     }
6054   return coding_system;
6055 }
6056
6057
6058 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6059    decided for writing to a process.  If not, complement them, and
6060    return a new coding system.  */
6061
6062 Lisp_Object
6063 complement_process_encoding_system (Lisp_Object coding_system)
6064 {
6065   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6066   Lisp_Object spec, attrs;
6067   int i;
6068
6069   for (i = 0; i < 3; i++)
6070     {
6071       if (i == 1)
6072         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6073       else if (i == 2)
6074         coding_system = preferred_coding_system ();
6075       spec = CODING_SYSTEM_SPEC (coding_system);
6076       if (NILP (spec))
6077         continue;
6078       attrs = AREF (spec, 0);
6079       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6080         coding_base = CODING_ATTR_BASE_NAME (attrs);
6081       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6082         eol_base = coding_system;
6083       if (! NILP (coding_base) && ! NILP (eol_base))
6084         break;
6085     }
6086
6087   if (i > 0)
6088     /* The original CODING_SYSTEM didn't specify text-conversion or
6089        eol-conversion.  Be sure that we return a fully complemented
6090        coding system.  */
6091     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6092   return coding_system;
6093 }
6094
6095
6096 /* Emacs has a mechanism to automatically detect a coding system if it
6097    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6098    it's impossible to distinguish some coding systems accurately
6099    because they use the same range of codes.  So, at first, coding
6100    systems are categorized into the following categories:
6101
6102    o coding-category-emacs-mule
6103
6104         The category for a coding system which has the same code range
6105         as Emacs' internal format.  Assigned the coding-system (Lisp
6106         symbol) `emacs-mule' by default.
6107
6108    o coding-category-sjis
6109
6110         The category for a coding system which has the same code range
6111         as SJIS.  Assigned the coding-system (Lisp
6112         symbol) `japanese-shift-jis' by default.
6113
6114    o coding-category-iso-7
6115
6116         The category for a coding system which has the same code range
6117         as ISO2022 of 7-bit environment.  This doesn't use any locking
6118         shift and single shift functions.  This can encode/decode all
6119         charsets.  Assigned the coding-system (Lisp symbol)
6120         `iso-2022-7bit' by default.
6121
6122    o coding-category-iso-7-tight
6123
6124         Same as coding-category-iso-7 except that this can
6125         encode/decode only the specified charsets.
6126
6127    o coding-category-iso-8-1
6128
6129         The category for a coding system which has the same code range
6130         as ISO2022 of 8-bit environment and graphic plane 1 used only
6131         for DIMENSION1 charset.  This doesn't use any locking shift
6132         and single shift functions.  Assigned the coding-system (Lisp
6133         symbol) `iso-latin-1' by default.
6134
6135    o coding-category-iso-8-2
6136
6137         The category for a coding system which has the same code range
6138         as ISO2022 of 8-bit environment and graphic plane 1 used only
6139         for DIMENSION2 charset.  This doesn't use any locking shift
6140         and single shift functions.  Assigned the coding-system (Lisp
6141         symbol) `japanese-iso-8bit' by default.
6142
6143    o coding-category-iso-7-else
6144
6145         The category for a coding system which has the same code range
6146         as ISO2022 of 7-bit environment but uses locking shift or
6147         single shift functions.  Assigned the coding-system (Lisp
6148         symbol) `iso-2022-7bit-lock' by default.
6149
6150    o coding-category-iso-8-else
6151
6152         The category for a coding system which has the same code range
6153         as ISO2022 of 8-bit environment but uses locking shift or
6154         single shift functions.  Assigned the coding-system (Lisp
6155         symbol) `iso-2022-8bit-ss2' by default.
6156
6157    o coding-category-big5
6158
6159         The category for a coding system which has the same code range
6160         as BIG5.  Assigned the coding-system (Lisp symbol)
6161         `cn-big5' by default.
6162
6163    o coding-category-utf-8
6164
6165         The category for a coding system which has the same code range
6166         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6167         symbol) `utf-8' by default.
6168
6169    o coding-category-utf-16-be
6170
6171         The category for a coding system in which a text has a
6172         Unicode signature (cf. Unicode Standard) in the order of BIG
6173         endian at the head.  Assigned the coding-system (Lisp symbol)
6174         `utf-16-be' by default.
6175
6176    o coding-category-utf-16-le
6177
6178         The category for a coding system in which a text has a
6179         Unicode signature (cf. Unicode Standard) in the order of
6180         LITTLE endian at the head.  Assigned the coding-system (Lisp
6181         symbol) `utf-16-le' by default.
6182
6183    o coding-category-ccl
6184
6185         The category for a coding system of which encoder/decoder is
6186         written in CCL programs.  The default value is nil, i.e., no
6187         coding system is assigned.
6188
6189    o coding-category-binary
6190
6191         The category for a coding system not categorized in any of the
6192         above.  Assigned the coding-system (Lisp symbol)
6193         `no-conversion' by default.
6194
6195    Each of them is a Lisp symbol and the value is an actual
6196    `coding-system's (this is also a Lisp symbol) assigned by a user.
6197    What Emacs does actually is to detect a category of coding system.
6198    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6199    decide only one possible category, it selects a category of the
6200    highest priority.  Priorities of categories are also specified by a
6201    user in a Lisp variable `coding-category-list'.
6202
6203 */
6204
6205 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6206                                            int eol_seen);
6207
6208
6209 /* Return the number of ASCII characters at the head of the source.
6210    By side effects, set coding->head_ascii and update
6211    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6212    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6213    reliable only when all the source bytes are ASCII.  */
6214
6215 static ptrdiff_t
6216 check_ascii (struct coding_system *coding)
6217 {
6218   const unsigned char *src, *end;
6219   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6220   int eol_seen = coding->eol_seen;
6221
6222   coding_set_source (coding);
6223   src = coding->source;
6224   end = src + coding->src_bytes;
6225
6226   if (inhibit_eol_conversion
6227       || SYMBOLP (eol_type))
6228     {
6229       /* We don't have to check EOL format.  */
6230       while (src < end && !( *src & 0x80))
6231         {
6232           if (*src++ == '\n')
6233             eol_seen |= EOL_SEEN_LF;
6234         }
6235     }
6236   else
6237     {
6238       end--;                /* We look ahead one byte for "CR LF".  */
6239       while (src < end)
6240         {
6241           int c = *src;
6242
6243           if (c & 0x80)
6244             break;
6245           src++;
6246           if (c == '\r')
6247             {
6248               if (*src == '\n')
6249                 {
6250                   eol_seen |= EOL_SEEN_CRLF;
6251                   src++;
6252                 }
6253               else
6254                 eol_seen |= EOL_SEEN_CR;
6255             }
6256           else if (c == '\n')
6257             eol_seen |= EOL_SEEN_LF;
6258         }
6259       if (src == end)
6260         {
6261           int c = *src;
6262
6263           /* All bytes but the last one C are ASCII.  */
6264           if (! (c & 0x80))
6265             {
6266               if (c == '\r')
6267                 eol_seen |= EOL_SEEN_CR;
6268               else if (c  == '\n')
6269                 eol_seen |= EOL_SEEN_LF;
6270               src++;
6271             }
6272         }
6273     }
6274   coding->head_ascii = src - coding->source;
6275   coding->eol_seen = eol_seen;
6276   return (coding->head_ascii);
6277 }
6278
6279
6280 /* Return the number of characters at the source if all the bytes are
6281    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6282    effects, update coding->eol_seen.  The value of coding->eol_seen is
6283    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6284    the value is reliable only when all the source bytes are valid
6285    UTF-8.  */
6286
6287 static ptrdiff_t
6288 check_utf_8 (struct coding_system *coding)
6289 {
6290   const unsigned char *src, *end;
6291   int eol_seen;
6292   ptrdiff_t nchars = coding->head_ascii;
6293
6294   if (coding->head_ascii < 0)
6295     check_ascii (coding);
6296   else
6297     coding_set_source (coding);
6298   src = coding->source + coding->head_ascii;
6299   /* We look ahead one byte for CR LF.  */
6300   end = coding->source + coding->src_bytes - 1;
6301   eol_seen = coding->eol_seen;
6302   while (src < end)
6303     {
6304       int c = *src;
6305
6306       if (UTF_8_1_OCTET_P (*src))
6307         {
6308           src++;
6309           if (c < 0x20)
6310             {
6311               if (c == '\r')
6312                 {
6313                   if (*src == '\n')
6314                     {
6315                       eol_seen |= EOL_SEEN_CRLF;
6316                       src++;
6317                       nchars++;
6318                     }
6319                   else
6320                     eol_seen |= EOL_SEEN_CR;
6321                 }
6322               else if (c == '\n')
6323                 eol_seen |= EOL_SEEN_LF;
6324             }
6325         }
6326       else if (UTF_8_2_OCTET_LEADING_P (c))
6327         {
6328           if (c < 0xC2          /* overlong sequence */
6329               || src + 1 >= end
6330               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6331             return -1;
6332           src += 2;
6333         }
6334       else if (UTF_8_3_OCTET_LEADING_P (c))
6335         {
6336           if (src + 2 >= end
6337               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6338                     && UTF_8_EXTRA_OCTET_P (src[2])))
6339             return -1;
6340           c = (((c & 0xF) << 12)
6341                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6342           if (c < 0x800                       /* overlong sequence */
6343               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6344             return -1;
6345           src += 3;
6346         }
6347       else if (UTF_8_4_OCTET_LEADING_P (c))
6348         {
6349           if (src + 3 >= end
6350               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6351                     && UTF_8_EXTRA_OCTET_P (src[2])
6352                     && UTF_8_EXTRA_OCTET_P (src[3])))
6353             return -1;
6354           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6355                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6356           if (c < 0x10000       /* overlong sequence */
6357               || c >= 0x110000) /* non-Unicode character  */
6358             return -1;
6359           src += 4;
6360         }
6361       else
6362         return -1;
6363       nchars++;
6364     }
6365
6366   if (src == end)
6367     {
6368       if (! UTF_8_1_OCTET_P (*src))
6369         return -1;
6370       nchars++;
6371       if (*src == '\r')
6372         eol_seen |= EOL_SEEN_CR;
6373       else if (*src  == '\n')
6374         eol_seen |= EOL_SEEN_LF;
6375     }
6376   coding->eol_seen = eol_seen;
6377   return nchars;
6378 }
6379
6380
6381 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6382    SOURCE is encoded.  If CATEGORY is one of
6383    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6384    two-byte, else they are encoded by one-byte.
6385
6386    Return one of EOL_SEEN_XXX.  */
6387
6388 #define MAX_EOL_CHECK_COUNT 3
6389
6390 static int
6391 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6392             enum coding_category category)
6393 {
6394   const unsigned char *src = source, *src_end = src + src_bytes;
6395   unsigned char c;
6396   int total  = 0;
6397   int eol_seen = EOL_SEEN_NONE;
6398
6399   if ((1 << category) & CATEGORY_MASK_UTF_16)
6400     {
6401       bool msb = category == (coding_category_utf_16_le
6402                               | coding_category_utf_16_le_nosig);
6403       bool lsb = !msb;
6404
6405       while (src + 1 < src_end)
6406         {
6407           c = src[lsb];
6408           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6409             {
6410               int this_eol;
6411
6412               if (c == '\n')
6413                 this_eol = EOL_SEEN_LF;
6414               else if (src + 3 >= src_end
6415                        || src[msb + 2] != 0
6416                        || src[lsb + 2] != '\n')
6417                 this_eol = EOL_SEEN_CR;
6418               else
6419                 {
6420                   this_eol = EOL_SEEN_CRLF;
6421                   src += 2;
6422                 }
6423
6424               if (eol_seen == EOL_SEEN_NONE)
6425                 /* This is the first end-of-line.  */
6426                 eol_seen = this_eol;
6427               else if (eol_seen != this_eol)
6428                 {
6429                   /* The found type is different from what found before.
6430                      Allow for stray ^M characters in DOS EOL files.  */
6431                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6432                       || (eol_seen == EOL_SEEN_CRLF
6433                           && this_eol == EOL_SEEN_CR))
6434                     eol_seen = EOL_SEEN_CRLF;
6435                   else
6436                     {
6437                       eol_seen = EOL_SEEN_LF;
6438                       break;
6439                     }
6440                 }
6441               if (++total == MAX_EOL_CHECK_COUNT)
6442                 break;
6443             }
6444           src += 2;
6445         }
6446     }
6447   else
6448     while (src < src_end)
6449       {
6450         c = *src++;
6451         if (c == '\n' || c == '\r')
6452           {
6453             int this_eol;
6454
6455             if (c == '\n')
6456               this_eol = EOL_SEEN_LF;
6457             else if (src >= src_end || *src != '\n')
6458               this_eol = EOL_SEEN_CR;
6459             else
6460               this_eol = EOL_SEEN_CRLF, src++;
6461
6462             if (eol_seen == EOL_SEEN_NONE)
6463               /* This is the first end-of-line.  */
6464               eol_seen = this_eol;
6465             else if (eol_seen != this_eol)
6466               {
6467                 /* The found type is different from what found before.
6468                    Allow for stray ^M characters in DOS EOL files.  */
6469                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6470                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6471                   eol_seen = EOL_SEEN_CRLF;
6472                 else
6473                   {
6474                     eol_seen = EOL_SEEN_LF;
6475                     break;
6476                   }
6477               }
6478             if (++total == MAX_EOL_CHECK_COUNT)
6479               break;
6480           }
6481       }
6482   return eol_seen;
6483 }
6484
6485
6486 static Lisp_Object
6487 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6488 {
6489   Lisp_Object eol_type;
6490
6491   eol_type = CODING_ID_EOL_TYPE (coding->id);
6492   if (! VECTORP (eol_type))
6493     /* Already adjusted.  */
6494     return eol_type;
6495   if (eol_seen & EOL_SEEN_LF)
6496     {
6497       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6498       eol_type = Qunix;
6499     }
6500   else if (eol_seen & EOL_SEEN_CRLF)
6501     {
6502       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6503       eol_type = Qdos;
6504     }
6505   else if (eol_seen & EOL_SEEN_CR)
6506     {
6507       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6508       eol_type = Qmac;
6509     }
6510   return eol_type;
6511 }
6512
6513 /* Detect how a text specified in CODING is encoded.  If a coding
6514    system is detected, update fields of CODING by the detected coding
6515    system.  */
6516
6517 static void
6518 detect_coding (struct coding_system *coding)
6519 {
6520   const unsigned char *src, *src_end;
6521   unsigned int saved_mode = coding->mode;
6522   Lisp_Object found = Qnil;
6523   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6524
6525   coding->consumed = coding->consumed_char = 0;
6526   coding->produced = coding->produced_char = 0;
6527   coding_set_source (coding);
6528
6529   src_end = coding->source + coding->src_bytes;
6530
6531   coding->eol_seen = EOL_SEEN_NONE;
6532   /* If we have not yet decided the text encoding type, detect it
6533      now.  */
6534   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6535     {
6536       int c, i;
6537       struct coding_detection_info detect_info;
6538       bool null_byte_found = 0, eight_bit_found = 0;
6539       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6540                                        inhibit_null_byte_detection);
6541       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6542                                        inhibit_iso_escape_detection);
6543       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6544
6545       coding->head_ascii = 0;
6546       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6547       for (src = coding->source; src < src_end; src++)
6548         {
6549           c = *src;
6550           if (c & 0x80)
6551             {
6552               eight_bit_found = 1;
6553               if (null_byte_found)
6554                 break;
6555             }
6556           else if (c < 0x20)
6557             {
6558               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6559                   && ! inhibit_ied
6560                   && ! detect_info.checked)
6561                 {
6562                   if (detect_coding_iso_2022 (coding, &detect_info))
6563                     {
6564                       /* We have scanned the whole data.  */
6565                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6566                         {
6567                           /* We didn't find an 8-bit code.  We may
6568                              have found a null-byte, but it's very
6569                              rare that a binary file conforms to
6570                              ISO-2022.  */
6571                           src = src_end;
6572                           coding->head_ascii = src - coding->source;
6573                         }
6574                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6575                       break;
6576                     }
6577                 }
6578               else if (! c && !inhibit_nbd)
6579                 {
6580                   null_byte_found = 1;
6581                   if (eight_bit_found)
6582                     break;
6583                 }
6584               else if (! disable_ascii_optimization
6585                        && ! inhibit_eol_conversion)
6586                 {
6587                   if (c == '\r')
6588                     {
6589                       if (src < src_end && src[1] == '\n')
6590                         {
6591                           coding->eol_seen |= EOL_SEEN_CRLF;
6592                           src++;
6593                           if (! eight_bit_found)
6594                             coding->head_ascii++;
6595                         }
6596                       else
6597                         coding->eol_seen |= EOL_SEEN_CR;
6598                     }
6599                   else if (c == '\n')
6600                     {
6601                       coding->eol_seen |= EOL_SEEN_LF;
6602                     }
6603                 }
6604
6605               if (! eight_bit_found)
6606                 coding->head_ascii++;
6607             }
6608           else if (! eight_bit_found)
6609             coding->head_ascii++;
6610         }
6611
6612       if (null_byte_found || eight_bit_found
6613           || coding->head_ascii < coding->src_bytes
6614           || detect_info.found)
6615         {
6616           enum coding_category category;
6617           struct coding_system *this;
6618
6619           if (coding->head_ascii == coding->src_bytes)
6620             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6621             for (i = 0; i < coding_category_raw_text; i++)
6622               {
6623                 category = coding_priorities[i];
6624                 this = coding_categories + category;
6625                 if (detect_info.found & (1 << category))
6626                   break;
6627               }
6628           else
6629             {
6630               if (null_byte_found)
6631                 {
6632                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6633                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6634                 }
6635               else if (prefer_utf_8
6636                        && detect_coding_utf_8 (coding, &detect_info))
6637                 {
6638                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6639                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6640                 }
6641               for (i = 0; i < coding_category_raw_text; i++)
6642                 {
6643                   category = coding_priorities[i];
6644                   this = coding_categories + category;
6645                   /* Some of this->detector (e.g. detect_coding_sjis)
6646                      require this information.  */
6647                   coding->id = this->id;
6648                   if (this->id < 0)
6649                     {
6650                       /* No coding system of this category is defined.  */
6651                       detect_info.rejected |= (1 << category);
6652                     }
6653                   else if (category >= coding_category_raw_text)
6654                     continue;
6655                   else if (detect_info.checked & (1 << category))
6656                     {
6657                       if (detect_info.found & (1 << category))
6658                         break;
6659                     }
6660                   else if ((*(this->detector)) (coding, &detect_info)
6661                            && detect_info.found & (1 << category))
6662                     break;
6663                 }
6664             }
6665
6666           if (i < coding_category_raw_text)
6667             {
6668               if (category == coding_category_utf_8_auto)
6669                 {
6670                   Lisp_Object coding_systems;
6671
6672                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6673                                          coding_attr_utf_bom);
6674                   if (CONSP (coding_systems))
6675                     {
6676                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6677                         found = XCAR (coding_systems);
6678                       else
6679                         found = XCDR (coding_systems);
6680                     }
6681                   else
6682                     found = CODING_ID_NAME (this->id);
6683                 }
6684               else if (category == coding_category_utf_16_auto)
6685                 {
6686                   Lisp_Object coding_systems;
6687
6688                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6689                                          coding_attr_utf_bom);
6690                   if (CONSP (coding_systems))
6691                     {
6692                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6693                         found = XCAR (coding_systems);
6694                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6695                         found = XCDR (coding_systems);
6696                     }
6697                   else
6698                     found = CODING_ID_NAME (this->id);
6699                 }
6700               else
6701                 found = CODING_ID_NAME (this->id);
6702             }
6703           else if (null_byte_found)
6704             found = Qno_conversion;
6705           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6706                    == CATEGORY_MASK_ANY)
6707             found = Qraw_text;
6708           else if (detect_info.rejected)
6709             for (i = 0; i < coding_category_raw_text; i++)
6710               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6711                 {
6712                   this = coding_categories + coding_priorities[i];
6713                   found = CODING_ID_NAME (this->id);
6714                   break;
6715                 }
6716         }
6717     }
6718   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6719            == coding_category_utf_8_auto)
6720     {
6721       Lisp_Object coding_systems;
6722       struct coding_detection_info detect_info;
6723
6724       coding_systems
6725         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6726       detect_info.found = detect_info.rejected = 0;
6727       if (check_ascii (coding) == coding->src_bytes)
6728         {
6729           if (CONSP (coding_systems))
6730             found = XCDR (coding_systems);
6731         }
6732       else
6733         {
6734           if (CONSP (coding_systems)
6735               && detect_coding_utf_8 (coding, &detect_info))
6736             {
6737               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6738                 found = XCAR (coding_systems);
6739               else
6740                 found = XCDR (coding_systems);
6741             }
6742         }
6743     }
6744   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6745            == coding_category_utf_16_auto)
6746     {
6747       Lisp_Object coding_systems;
6748       struct coding_detection_info detect_info;
6749
6750       coding_systems
6751         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6752       detect_info.found = detect_info.rejected = 0;
6753       coding->head_ascii = 0;
6754       if (CONSP (coding_systems)
6755           && detect_coding_utf_16 (coding, &detect_info))
6756         {
6757           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6758             found = XCAR (coding_systems);
6759           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6760             found = XCDR (coding_systems);
6761         }
6762     }
6763
6764   if (! NILP (found))
6765     {
6766       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6767                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6768                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6769                            : EOL_SEEN_LF);
6770
6771       setup_coding_system (found, coding);
6772       if (specified_eol != EOL_SEEN_NONE)
6773         adjust_coding_eol_type (coding, specified_eol);
6774     }
6775
6776   coding->mode = saved_mode;
6777 }
6778
6779
6780 static void
6781 decode_eol (struct coding_system *coding)
6782 {
6783   Lisp_Object eol_type;
6784   unsigned char *p, *pbeg, *pend;
6785
6786   eol_type = CODING_ID_EOL_TYPE (coding->id);
6787   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6788     return;
6789
6790   if (NILP (coding->dst_object))
6791     pbeg = coding->destination;
6792   else
6793     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6794   pend = pbeg + coding->produced;
6795
6796   if (VECTORP (eol_type))
6797     {
6798       int eol_seen = EOL_SEEN_NONE;
6799
6800       for (p = pbeg; p < pend; p++)
6801         {
6802           if (*p == '\n')
6803             eol_seen |= EOL_SEEN_LF;
6804           else if (*p == '\r')
6805             {
6806               if (p + 1 < pend && *(p + 1) == '\n')
6807                 {
6808                   eol_seen |= EOL_SEEN_CRLF;
6809                   p++;
6810                 }
6811               else
6812                 eol_seen |= EOL_SEEN_CR;
6813             }
6814         }
6815       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6816       if ((eol_seen & EOL_SEEN_CRLF) != 0
6817           && (eol_seen & EOL_SEEN_CR) != 0
6818           && (eol_seen & EOL_SEEN_LF) == 0)
6819         eol_seen = EOL_SEEN_CRLF;
6820       else if (eol_seen != EOL_SEEN_NONE
6821           && eol_seen != EOL_SEEN_LF
6822           && eol_seen != EOL_SEEN_CRLF
6823           && eol_seen != EOL_SEEN_CR)
6824         eol_seen = EOL_SEEN_LF;
6825       if (eol_seen != EOL_SEEN_NONE)
6826         eol_type = adjust_coding_eol_type (coding, eol_seen);
6827     }
6828
6829   if (EQ (eol_type, Qmac))
6830     {
6831       for (p = pbeg; p < pend; p++)
6832         if (*p == '\r')
6833           *p = '\n';
6834     }
6835   else if (EQ (eol_type, Qdos))
6836     {
6837       ptrdiff_t n = 0;
6838
6839       if (NILP (coding->dst_object))
6840         {
6841           /* Start deleting '\r' from the tail to minimize the memory
6842              movement.  */
6843           for (p = pend - 2; p >= pbeg; p--)
6844             if (*p == '\r')
6845               {
6846                 memmove (p, p + 1, pend-- - p - 1);
6847                 n++;
6848               }
6849         }
6850       else
6851         {
6852           ptrdiff_t pos_byte = coding->dst_pos_byte;
6853           ptrdiff_t pos = coding->dst_pos;
6854           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6855
6856           while (pos < pos_end)
6857             {
6858               p = BYTE_POS_ADDR (pos_byte);
6859               if (*p == '\r' && p[1] == '\n')
6860                 {
6861                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6862                   n++;
6863                   pos_end--;
6864                 }
6865               pos++;
6866               if (coding->dst_multibyte)
6867                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6868               else
6869                 pos_byte++;
6870             }
6871         }
6872       coding->produced -= n;
6873       coding->produced_char -= n;
6874     }
6875 }
6876
6877
6878 /* Return a translation table (or list of them) from coding system
6879    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6880    not ENCODEP). */
6881
6882 static Lisp_Object
6883 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6884 {
6885   Lisp_Object standard, translation_table;
6886   Lisp_Object val;
6887
6888   if (NILP (Venable_character_translation))
6889     {
6890       if (max_lookup)
6891         *max_lookup = 0;
6892       return Qnil;
6893     }
6894   if (encodep)
6895     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6896       standard = Vstandard_translation_table_for_encode;
6897   else
6898     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6899       standard = Vstandard_translation_table_for_decode;
6900   if (NILP (translation_table))
6901     translation_table = standard;
6902   else
6903     {
6904       if (SYMBOLP (translation_table))
6905         translation_table = Fget (translation_table, Qtranslation_table);
6906       else if (CONSP (translation_table))
6907         {
6908           translation_table = Fcopy_sequence (translation_table);
6909           for (val = translation_table; CONSP (val); val = XCDR (val))
6910             if (SYMBOLP (XCAR (val)))
6911               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6912         }
6913       if (CHAR_TABLE_P (standard))
6914         {
6915           if (CONSP (translation_table))
6916             translation_table = nconc2 (translation_table, list1 (standard));
6917           else
6918             translation_table = list2 (translation_table, standard);
6919         }
6920     }
6921
6922   if (max_lookup)
6923     {
6924       *max_lookup = 1;
6925       if (CHAR_TABLE_P (translation_table)
6926           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6927         {
6928           val = XCHAR_TABLE (translation_table)->extras[1];
6929           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6930             *max_lookup = XFASTINT (val);
6931         }
6932       else if (CONSP (translation_table))
6933         {
6934           Lisp_Object tail;
6935
6936           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6937             if (CHAR_TABLE_P (XCAR (tail))
6938                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6939               {
6940                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6941                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6942                   *max_lookup = XFASTINT (tailval);
6943               }
6944         }
6945     }
6946   return translation_table;
6947 }
6948
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and set TRANS to the result.

   Whenever a table maps C to a character, C is replaced by that
   character and TRANS is reset to Qnil; with a list, the lookup then
   continues in the following tables using the new C.  A non-nil,
   non-character entry is left in TRANS, and with a list it stops the
   lookup.  C and TRANS must be lvalues; TABLE and C are evaluated
   more than once.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        tail = table;                                           \
        while (CONSP (tail))                                    \
          {                                                     \
            if (CHAR_TABLE_P (XCAR (tail)))                     \
              {                                                 \
                trans = CHAR_TABLE_REF (XCAR (tail), c);        \
                if (CHARACTERP (trans))                         \
                  {                                             \
                    c = XFASTINT (trans);                       \
                    trans = Qnil;                               \
                  }                                             \
                else if (! NILP (trans))                        \
                  break;                                        \
              }                                                 \
            tail = XCDR (tail);                                 \
          }                                                     \
      }                                                         \
  } while (0)
6973
6974
6975 /* Return a translation of character(s) at BUF according to TRANS.
6976    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6977    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6978    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6979    translation is found, and Qnil if not found..
6980    If BUF is too short to lookup characters in FROM, return Qt.  */
6981
6982 static Lisp_Object
6983 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6984 {
6985
6986   if (INTEGERP (trans))
6987     return trans;
6988   for (; CONSP (trans); trans = XCDR (trans))
6989     {
6990       Lisp_Object val = XCAR (trans);
6991       Lisp_Object from = XCAR (val);
6992       ptrdiff_t len = ASIZE (from);
6993       ptrdiff_t i;
6994
6995       for (i = 0; i < len; i++)
6996         {
6997           if (buf + i == buf_end)
6998             return Qt;
6999           if (XINT (AREF (from, i)) != buf[i])
7000             break;
7001         }
7002       if (i == len)
7003         return val;
7004     }
7005   return Qnil;
7006 }
7007
7008
7009 static int
7010 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7011                bool last_block)
7012 {
7013   unsigned char *dst = coding->destination + coding->produced;
7014   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7015   ptrdiff_t produced;
7016   ptrdiff_t produced_chars = 0;
7017   int carryover = 0;
7018
7019   if (! coding->chars_at_source)
7020     {
7021       /* Source characters are in coding->charbuf.  */
7022       int *buf = coding->charbuf;
7023       int *buf_end = buf + coding->charbuf_used;
7024
7025       if (EQ (coding->src_object, coding->dst_object))
7026         {
7027           coding_set_source (coding);
7028           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7029         }
7030
7031       while (buf < buf_end)
7032         {
7033           int c = *buf;
7034           ptrdiff_t i;
7035
7036           if (c >= 0)
7037             {
7038               ptrdiff_t from_nchars = 1, to_nchars = 1;
7039               Lisp_Object trans = Qnil;
7040
7041               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7042               if (! NILP (trans))
7043                 {
7044                   trans = get_translation (trans, buf, buf_end);
7045                   if (INTEGERP (trans))
7046                     c = XINT (trans);
7047                   else if (CONSP (trans))
7048                     {
7049                       from_nchars = ASIZE (XCAR (trans));
7050                       trans = XCDR (trans);
7051                       if (INTEGERP (trans))
7052                         c = XINT (trans);
7053                       else
7054                         {
7055                           to_nchars = ASIZE (trans);
7056                           c = XINT (AREF (trans, 0));
7057                         }
7058                     }
7059                   else if (EQ (trans, Qt) && ! last_block)
7060                     break;
7061                 }
7062
7063               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7064                 {
7065                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7066                        / MAX_MULTIBYTE_LENGTH)
7067                       < to_nchars)
7068                     memory_full (SIZE_MAX);
7069                   dst = alloc_destination (coding,
7070                                            buf_end - buf
7071                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7072                                            dst);
7073                   if (EQ (coding->src_object, coding->dst_object))
7074                     {
7075                       coding_set_source (coding);
7076                       dst_end = (((unsigned char *) coding->source)
7077                                  + coding->consumed);
7078                     }
7079                   else
7080                     dst_end = coding->destination + coding->dst_bytes;
7081                 }
7082
7083               for (i = 0; i < to_nchars; i++)
7084                 {
7085                   if (i > 0)
7086                     c = XINT (AREF (trans, i));
7087                   if (coding->dst_multibyte
7088                       || ! CHAR_BYTE8_P (c))
7089                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7090                   else
7091                     *dst++ = CHAR_TO_BYTE8 (c);
7092                 }
7093               produced_chars += to_nchars;
7094               buf += from_nchars;
7095             }
7096           else
7097             /* This is an annotation datum.  (-C) is the length.  */
7098             buf += -c;
7099         }
7100       carryover = buf_end - buf;
7101     }
7102   else
7103     {
7104       /* Source characters are at coding->source.  */
7105       const unsigned char *src = coding->source;
7106       const unsigned char *src_end = src + coding->consumed;
7107
7108       if (EQ (coding->dst_object, coding->src_object))
7109         dst_end = (unsigned char *) src;
7110       if (coding->src_multibyte != coding->dst_multibyte)
7111         {
7112           if (coding->src_multibyte)
7113             {
7114               bool multibytep = 1;
7115               ptrdiff_t consumed_chars = 0;
7116
7117               while (1)
7118                 {
7119                   const unsigned char *src_base = src;
7120                   int c;
7121
7122                   ONE_MORE_BYTE (c);
7123                   if (dst == dst_end)
7124                     {
7125                       if (EQ (coding->src_object, coding->dst_object))
7126                         dst_end = (unsigned char *) src;
7127                       if (dst == dst_end)
7128                         {
7129                           ptrdiff_t offset = src - coding->source;
7130
7131                           dst = alloc_destination (coding, src_end - src + 1,
7132                                                    dst);
7133                           dst_end = coding->destination + coding->dst_bytes;
7134                           coding_set_source (coding);
7135                           src = coding->source + offset;
7136                           src_end = coding->source + coding->consumed;
7137                           if (EQ (coding->src_object, coding->dst_object))
7138                             dst_end = (unsigned char *) src;
7139                         }
7140                     }
7141                   *dst++ = c;
7142                   produced_chars++;
7143                 }
7144             no_more_source:
7145               ;
7146             }
7147           else
7148             while (src < src_end)
7149               {
7150                 bool multibytep = 1;
7151                 int c = *src++;
7152
7153                 if (dst >= dst_end - 1)
7154                   {
7155                     if (EQ (coding->src_object, coding->dst_object))
7156                       dst_end = (unsigned char *) src;
7157                     if (dst >= dst_end - 1)
7158                       {
7159                         ptrdiff_t offset = src - coding->source;
7160                         ptrdiff_t more_bytes;
7161
7162                         if (EQ (coding->src_object, coding->dst_object))
7163                           more_bytes = ((src_end - src) / 2) + 2;
7164                         else
7165                           more_bytes = src_end - src + 2;
7166                         dst = alloc_destination (coding, more_bytes, dst);
7167                         dst_end = coding->destination + coding->dst_bytes;
7168                         coding_set_source (coding);
7169                         src = coding->source + offset;
7170                         src_end = coding->source + coding->consumed;
7171                         if (EQ (coding->src_object, coding->dst_object))
7172                           dst_end = (unsigned char *) src;
7173                       }
7174                   }
7175                 EMIT_ONE_BYTE (c);
7176               }
7177         }
7178       else
7179         {
7180           if (!EQ (coding->src_object, coding->dst_object))
7181             {
7182               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7183
7184               if (require > 0)
7185                 {
7186                   ptrdiff_t offset = src - coding->source;
7187
7188                   dst = alloc_destination (coding, require, dst);
7189                   coding_set_source (coding);
7190                   src = coding->source + offset;
7191                   src_end = coding->source + coding->consumed;
7192                 }
7193             }
7194           produced_chars = coding->consumed_char;
7195           while (src < src_end)
7196             *dst++ = *src++;
7197         }
7198     }
7199
7200   produced = dst - (coding->destination + coding->produced);
7201   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7202     insert_from_gap (produced_chars, produced, 0);
7203   coding->produced += produced;
7204   coding->produced_char += produced_chars;
7205   return carryover;
7206 }
7207
7208 /* Compose text in CODING->object according to the annotation data at
7209    CHARBUF.  CHARBUF is an array:
7210      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7211  */
7212
7213 static void
7214 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7215 {
7216   int len;
7217   ptrdiff_t to;
7218   enum composition_method method;
7219   Lisp_Object components;
7220
7221   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7222   to = pos + charbuf[2];
7223   method = (enum composition_method) (charbuf[4]);
7224
7225   if (method == COMPOSITION_RELATIVE)
7226     components = Qnil;
7227   else
7228     {
7229       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7230       int i, j;
7231
7232       if (method == COMPOSITION_WITH_RULE)
7233         len = charbuf[2] * 3 - 2;
7234       charbuf += MAX_ANNOTATION_LENGTH;
7235       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7236       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7237         {
7238           if (charbuf[i] >= 0)
7239             args[j] = make_number (charbuf[i]);
7240           else
7241             {
7242               i++;
7243               args[j] = make_number (charbuf[i] % 0x100);
7244             }
7245         }
7246       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7247     }
7248   compose_text (pos, to, components, Qnil, coding->dst_object);
7249 }
7250
7251
7252 /* Put `charset' property on text in CODING->object according to
7253    the annotation data at CHARBUF.  CHARBUF is an array:
7254      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7255  */
7256
7257 static void
7258 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7259 {
7260   ptrdiff_t from = pos - charbuf[2];
7261   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7262
7263   Fput_text_property (make_number (from), make_number (pos),
7264                       Qcharset, CHARSET_NAME (charset),
7265                       coding->dst_object);
7266 }
7267
/* Upper and lower bounds, counted in `int' elements, for the size of
   the conversion work area.  */
#define MAX_CHARBUF_SIZE 0x4000
#define MIN_CHARBUF_SIZE 0x10

/* Allocate the conversion work area of CODING with SAFE_ALLOCA and
   record its size.  SIZE is the requested number of `int' elements;
   it is clamped to the range [MIN_CHARBUF_SIZE, MAX_CHARBUF_SIZE].
   The result is stored in CODING->charbuf and CODING->charbuf_size.

   Both arguments are fully parenthesized in the expansion, so any
   expression may be passed.  SIZE may be evaluated more than once and
   therefore must not have side effects.  The users of this macro in
   this file declare USE_SAFE_ALLOCA beforehand and call SAFE_FREE
   when done.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    int units = ((size) > MAX_CHARBUF_SIZE ? MAX_CHARBUF_SIZE   \
                 : (size) < MIN_CHARBUF_SIZE ? MIN_CHARBUF_SIZE \
                 : (size));                                     \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7279
7280
7281 static void
7282 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7283 {
7284   int *charbuf = coding->charbuf;
7285   int *charbuf_end = charbuf + coding->charbuf_used;
7286
7287   if (NILP (coding->dst_object))
7288     return;
7289
7290   while (charbuf < charbuf_end)
7291     {
7292       if (*charbuf >= 0)
7293         pos++, charbuf++;
7294       else
7295         {
7296           int len = -*charbuf;
7297
7298           if (len > 2)
7299             switch (charbuf[1])
7300               {
7301               case CODING_ANNOTATE_COMPOSITION_MASK:
7302                 produce_composition (coding, charbuf, pos);
7303                 break;
7304               case CODING_ANNOTATE_CHARSET_MASK:
7305                 produce_charset (coding, charbuf, pos);
7306                 break;
7307               }
7308           charbuf += len;
7309         }
7310     }
7311 }
7312
7313 /* Decode the data at CODING->src_object into CODING->dst_object.
7314    CODING->src_object is a buffer, a string, or nil.
7315    CODING->dst_object is a buffer.
7316
7317    If CODING->src_object is a buffer, it must be the current buffer.
7318    In this case, if CODING->src_pos is positive, it is a position of
7319    the source text in the buffer, otherwise, the source text is in the
7320    gap area of the buffer, and CODING->src_pos specifies the offset of
7321    the text from GPT (which must be the same as PT).  If this is the
7322    same buffer as CODING->dst_object, CODING->src_pos must be
7323    negative.
7324
7325    If CODING->src_object is a string, CODING->src_pos is an index to
7326    that string.
7327
7328    If CODING->src_object is nil, CODING->source must already point to
7329    the non-relocatable memory area.  In this case, CODING->src_pos is
7330    an offset from CODING->source.
7331
7332    The decoded data is inserted at the current point of the buffer
7333    CODING->dst_object.

        Source bytes that remain unconsumed after the main loop are either
        emitted as raw bytes (when CODING_MODE_LAST_BLOCK is set in
        CODING->mode) or saved in CODING->carryover; in both cases
        CODING->consumed is then set to CODING->src_bytes.
7334 */
7335
7336 static void
7337 decode_coding (struct coding_system *coding)
7338 {
7339   Lisp_Object attrs;
7340   Lisp_Object undo_list;
7341   Lisp_Object translation_table;
7342   struct ccl_spec cclspec;
7343   int carryover;
7344   int i;
7345
7346   USE_SAFE_ALLOCA;
7347
       /* If the source text of a buffer starts before GPT and extends
          beyond it, move the gap to the start of the source text.  */
7348   if (BUFFERP (coding->src_object)
7349       && coding->src_pos > 0
7350       && coding->src_pos < GPT
7351       && coding->src_pos + coding->src_chars > GPT)
7352     move_gap_both (coding->src_pos, coding->src_pos_byte);
7353
       /* Qt is only a placeholder; the saved value is used at the end
          solely when CODING->dst_object is a buffer.  */
7354   undo_list = Qt;
7355   if (BUFFERP (coding->dst_object))
7356     {
7357       set_buffer_internal (XBUFFER (coding->dst_object));
7358       if (GPT != PT)
7359         move_gap_both (PT, PT_BYTE);
7360
7361       /* We must disable undo_list in order to record the whole insert
7362          transaction via record_insert at the end.  But doing so also
7363          disables the recording of the first change to the undo_list.
7364          Therefore we check for first change here and record it via
7365          record_first_change if needed.  */
7366       if (MODIFF <= SAVE_MODIFF)
7367         record_first_change ();
7368
7369       undo_list = BVAR (current_buffer, undo_list);
7370       bset_undo_list (current_buffer, Qt);
7371     }
7372
       /* Reset the progress counters and the result state.  */
7373   coding->consumed = coding->consumed_char = 0;
7374   coding->produced = coding->produced_char = 0;
7375   coding->chars_at_source = 0;
7376   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7377   coding->errors = 0;
7378
       /* The work area is sized from the source byte count, clamped by
          the macro.  */
7379   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7380
7381   attrs = CODING_ID_ATTRS (coding->id);
7382   translation_table = get_translation_table (attrs, 0, NULL);
7383
7384   carryover = 0;
       /* The CCL decoder is given a ccl_spec that lives in this frame.  */
7385   if (coding->decoder == decode_coding_ccl)
7386     {
7387       coding->spec.ccl = &cclspec;
7388       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7389     }
       /* Each round runs the decoder and then lets produce_chars write
          out the contents of CODING->charbuf.  produce_chars returns the
          number of trailing charbuf elements it left unprocessed; those
          are moved to the front of charbuf, and the next round starts
          with charbuf_used set to that count.  */
7390   do
7391     {
           /* Position at which this round's output starts; used for the
              annotations below.  */
7392       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7393
7394       coding_set_source (coding);
7395       coding->annotated = 0;
7396       coding->charbuf_used = carryover;
7397       (*(coding->decoder)) (coding);
7398       coding_set_destination (coding);
7399       carryover = produce_chars (coding, translation_table, 0);
7400       if (coding->annotated)
7401         produce_annotation (coding, pos);
7402       for (i = 0; i < carryover; i++)
7403         coding->charbuf[i]
7404           = coding->charbuf[coding->charbuf_used - carryover + i];
7405     }
       /* Repeat while the destination was insufficient, or while source
          bytes remain and the last round ended with success or with an
          invalid-source result.  */
7406   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7407          || (coding->consumed < coding->src_bytes
7408              && (coding->result == CODING_RESULT_SUCCESS
7409                  || coding->result == CODING_RESULT_INVALID_SRC)));
7410
       /* Elements still carried over are handed to produce_chars once
          more, this time with 1 as its last argument.  */
7411   if (carryover > 0)
7412     {
7413       coding_set_destination (coding);
7414       coding->charbuf_used = carryover;
7415       produce_chars (coding, translation_table, 1);
7416     }
7417
7418   coding->carryover_bytes = 0;
7419   if (coding->consumed < coding->src_bytes)
7420     {
7421       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7422       const unsigned char *src;
7423
7424       coding_set_source (coding);
7425       coding_set_destination (coding);
7426       src = coding->source + coding->consumed;
7427
7428       if (coding->mode & CODING_MODE_LAST_BLOCK)
7429         {
7430           /* Flush out unprocessed data as binary chars.  We are sure
7431              that the number of data is less than the size of
7432              coding->charbuf.  */
7433           coding->charbuf_used = 0;
7434           coding->chars_at_source = 0;
7435
7436           while (nbytes-- > 0)
7437             {
7438               int c = *src++;
7439
                   /* Bytes with the high bit set are stored as the
                      corresponding BYTE8_TO_CHAR character.  */
7440               if (c & 0x80)
7441                 c = BYTE8_TO_CHAR (c);
7442               coding->charbuf[coding->charbuf_used++] = c;
7443             }
7444           produce_chars (coding, Qnil, 1);
7445         }
7446       else
7447         {
7448           /* Record unprocessed bytes in coding->carryover.  We are
7449              sure that the number of data is less than the size of
7450              coding->carryover.  */
7451           unsigned char *p = coding->carryover;
7452
               /* The count is nevertheless clamped to the size of
                  coding->carryover before copying.  */
7453           if (nbytes > sizeof coding->carryover)
7454             nbytes = sizeof coding->carryover;
7455           coding->carryover_bytes = nbytes;
7456           while (nbytes-- > 0)
7457             *p++ = *src++;
7458         }
7459       coding->consumed = coding->src_bytes;
7460     }
7461
       /* EOL decoding is a separate pass, skipped when the EOL type is
          Qunix or when inhibit_eol_conversion is set.  */
7462   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7463       && !inhibit_eol_conversion)
7464     decode_eol (coding);
       /* Restore the undo list saved above and record the insertion.  */
7465   if (BUFFERP (coding->dst_object))
7466     {
7467       bset_undo_list (current_buffer, undo_list);
7468       record_insert (coding->dst_pos, coding->produced_char);
7469     }
7470
7471   SAFE_FREE ();
7472 }
7473
7474
7475 /* Extract an annotation datum from a composition starting at POS and
7476    ending before LIMIT of CODING->src_object (buffer or string), store
7477    the data in BUF, set *STOP to a starting position of the next
7478    composition (if any) or to LIMIT, and return the address of the
7479    next element of BUF.
7480
7481    If such an annotation is not found, set *STOP to a starting
7482    position of a composition after POS (if any) or to LIMIT, and
7483    return BUF.  */
7484
7485 static int *
7486 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7487                                struct coding_system *coding, int *buf,
7488                                ptrdiff_t *stop)
7489 {
7490   ptrdiff_t start, end;
7491   Lisp_Object prop;
7492
7493   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7494       || end > limit)
7495     *stop = limit;
7496   else if (start > pos)
7497     *stop = start;
7498   else
7499     {
7500       if (start == pos)
7501         {
7502           /* We found a composition.  Store the corresponding
7503              annotation data in BUF.  */
7504           int *head = buf;
7505           enum composition_method method = composition_method (prop);
7506           int nchars = COMPOSITION_LENGTH (prop);
7507
7508           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7509           if (method != COMPOSITION_RELATIVE)
7510             {
7511               Lisp_Object components;
7512               ptrdiff_t i, len, i_byte;
7513
7514               components = COMPOSITION_COMPONENTS (prop);
7515               if (VECTORP (components))
7516                 {
7517                   len = ASIZE (components);
7518                   for (i = 0; i < len; i++)
7519                     *buf++ = XINT (AREF (components, i));
7520                 }
7521               else if (STRINGP (components))
7522                 {
7523                   len = SCHARS (components);
7524                   i = i_byte = 0;
7525                   while (i < len)
7526                     {
7527                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7528                       buf++;
7529                     }
7530                 }
7531               else if (INTEGERP (components))
7532                 {
7533                   len = 1;
7534                   *buf++ = XINT (components);
7535                 }
7536               else if (CONSP (components))
7537                 {
7538                   for (len = 0; CONSP (components);
7539                        len++, components = XCDR (components))
7540                     *buf++ = XINT (XCAR (components));
7541                 }
7542               else
7543                 emacs_abort ();
7544               *head -= len;
7545             }
7546         }
7547
7548       if (find_composition (end, limit, &start, &end, &prop,
7549                             coding->src_object)
7550           && end <= limit)
7551         *stop = start;
7552       else
7553         *stop = limit;
7554     }
7555   return buf;
7556 }
7557
7558
7559 /* Extract an annotation datum from a text property `charset' at POS of
7560    CODING->src_object (buffer of string), store the data in BUF, set
7561    *STOP to the position where the value of `charset' property changes
7562    (limiting by LIMIT), and return the address of the next element of
7563    BUF.
7564
7565    If the property value is nil, set *STOP to the position where the
7566    property value is non-nil (limiting by LIMIT), and return BUF.  */
7567
7568 static int *
7569 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7570                            struct coding_system *coding, int *buf,
7571                            ptrdiff_t *stop)
7572 {
7573   Lisp_Object val, next;
7574   int id;
7575
7576   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7577   if (! NILP (val) && CHARSETP (val))
7578     id = XINT (CHARSET_SYMBOL_ID (val));
7579   else
7580     id = -1;
7581   ADD_CHARSET_DATA (buf, 0, id);
7582   next = Fnext_single_property_change (make_number (pos), Qcharset,
7583                                        coding->src_object,
7584                                        make_number (limit));
7585   *stop = XINT (next);
7586   return buf;
7587 }
7588
7589
     /* Fill CODING->charbuf with characters read from CODING->source,
        starting at byte offset CODING->consumed, applying EOL conversion
        and, if TRANSLATION_TABLE is non-nil, character translation on the
        way.  MAX_LOOKUP is the number of characters that may be collected
        for one translation lookup; it is only used when TRANSLATION_TABLE
        is non-nil.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe how far the source was read and how
        many charbuf elements were stored; CODING->chars_at_source is
        reset to 0.  */

7590 static void
7591 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7592                int max_lookup)
7593 {
7594   int *buf = coding->charbuf;
7595   int *buf_end = coding->charbuf + coding->charbuf_size;
7596   const unsigned char *src = coding->source + coding->consumed;
7597   const unsigned char *src_end = coding->source + coding->src_bytes;
7598   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7599   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7600   bool multibytep = coding->src_multibyte;
7601   Lisp_Object eol_type;
7602   int c;
7603   ptrdiff_t stop, stop_composition, stop_charset;
7604   int *lookup_buf = NULL;
7605
       /* The lookup buffer is needed only when translating.  */
7606   if (! NILP (translation_table))
7607     lookup_buf = alloca (sizeof (int) * max_lookup);
7608
       /* A vector-valued EOL type is treated like Qunix here, as is any
          EOL type when inhibit_eol_conversion is set.  */
7609   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7610   if (VECTORP (eol_type))
7611     eol_type = Qunix;
7612
7613   /* Note: composition handling is not yet implemented.  */
       /* Because the bit is cleared here, the composition test below is
          always false in this function.  */
7614   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7615
       /* STOP is the next position at which annotations must be looked
          up; without a source object there is nothing to look up.  */
7616   if (NILP (coding->src_object))
7617     stop = stop_composition = stop_charset = end_pos;
7618   else
7619     {
7620       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7621         stop = stop_composition = pos;
7622       else
7623         stop = stop_composition = end_pos;
7624       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7625         stop = stop_charset = pos;
7626       else
7627         stop_charset = end_pos;
7628     }
7629
7630   /* Compensate for CRLF and conversion.  */
7631   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7632   while (buf < buf_end)
7633     {
7634       Lisp_Object trans;
7635
7636       if (pos == stop)
7637         {
7638           if (pos == end_pos)
7639             break;
7640           if (pos == stop_composition)
7641             buf = handle_composition_annotation (pos, end_pos, coding,
7642                                                  buf, &stop_composition);
7643           if (pos == stop_charset)
7644             buf = handle_charset_annotation (pos, end_pos, coding,
7645                                              buf, &stop_charset);
               /* Continue up to whichever annotation boundary is nearer.  */
7646           stop = (stop_composition < stop_charset
7647                   ? stop_composition : stop_charset);
7648         }
7649
           /* Fetch the next character C, advancing SRC and POS.  */
7650       if (! multibytep)
7651         {
7652           int bytes;
7653
               /* The raw-text and CCL encoders take every source byte
                  as is.  Otherwise a valid multibyte sequence is read
                  as one character and POS advances by its byte length;
                  any other byte is converted with BYTE8_TO_CHAR.  */
7654           if (coding->encoder == encode_coding_raw_text
7655               || coding->encoder == encode_coding_ccl)
7656             c = *src++, pos++;
7657           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7658             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7659           else
7660             c = BYTE8_TO_CHAR (*src), src++, pos++;
7661         }
7662       else
7663         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7664       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7665         c = '\n';
7666       if (! EQ (eol_type, Qunix))
7667         {
7668           if (c == '\n')
7669             {
                   /* For Qdos a '\r' is stored in front of the '\n';
                      for any other EOL type the '\n' becomes '\r'.  */
7670               if (EQ (eol_type, Qdos))
7671                 *buf++ = '\r';
7672               else
7673                 c = '\r';
7674             }
7675         }
7676
7677       trans = Qnil;
7678       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7679       if (NILP (trans))
7680         *buf++ = c;
7681       else
7682         {
7683           ptrdiff_t from_nchars = 1, to_nchars = 1;
7684           int *lookup_buf_end;
7685           const unsigned char *p = src;
7686           int i;
7687
               /* Collect C and up to MAX_LOOKUP - 1 following characters
                  (read through P, without advancing SRC) as the lookup
                  key.  */
7688           lookup_buf[0] = c;
7689           for (i = 1; i < max_lookup && p < src_end; i++)
7690             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7691           lookup_buf_end = lookup_buf + i;
7692           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7693           if (INTEGERP (trans))
7694             c = XINT (trans);
7695           else if (CONSP (trans))
7696             {
                   /* The car gives the number of source characters
                      matched; the cdr is either a single character or a
                      vector of replacement characters.  */
7697               from_nchars = ASIZE (XCAR (trans));
7698               trans = XCDR (trans);
7699               if (INTEGERP (trans))
7700                 c = XINT (trans);
7701               else
7702                 {
7703                   to_nchars = ASIZE (trans);
                       /* NOTE(review): at this break, and at the one
                          below, SRC and POS have already been advanced
                          past C but C has not been stored -- confirm
                          that no character can be dropped here.  */
7704                   if (buf_end - buf < to_nchars)
7705                     break;
7706                   c = XINT (AREF (trans, 0));
7707                 }
7708             }
7709           else
7710             break;
7711           *buf++ = c;
7712           for (i = 1; i < to_nchars; i++)
7713             *buf++ = XINT (AREF (trans, i));
               /* Skip the remaining source characters that the
                  translation consumed.  */
7714           for (i = 1; i < from_nchars; i++, pos++)
7715             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7716         }
7717     }
7718
       /* Publish how far the source was read and how much of the work
          area was filled.  */
7719   coding->consumed = src - coding->source;
7720   coding->consumed_char = pos - coding->src_pos;
7721   coding->charbuf_used = buf - coding->charbuf;
7722   coding->chars_at_source = 0;
7723 }
7724
7725
7726 /* Encode the text at CODING->src_object into CODING->dst_object.
7727    CODING->src_object is a buffer or a string.
7728    CODING->dst_object is a buffer or nil.
7729
7730    If CODING->src_object is a buffer, it must be the current buffer.
7731    In this case, if CODING->src_pos is positive, it is a position of
7732    the source text in the buffer, otherwise. the source text is in the
7733    gap area of the buffer, and coding->src_pos specifies the offset of
7734    the text from GPT (which must be the same as PT).  If this is the
7735    same buffer as CODING->dst_object, CODING->src_pos must be
7736    negative and CODING should not have `pre-write-conversion'.
7737
7738    If CODING->src_object is a string, CODING should not have
7739    `pre-write-conversion'.
7740
7741    If CODING->dst_object is a buffer, the encoded data is inserted at
7742    the current point of that buffer.
7743
7744    If CODING->dst_object is nil, the encoded data is placed at the
7745    memory area specified by CODING->destination.  */
7746
7747 static void
7748 encode_coding (struct coding_system *coding)
7749 {
7750   Lisp_Object attrs;
7751   Lisp_Object translation_table;
7752   int max_lookup;
7753   struct ccl_spec cclspec;
7754
7755   USE_SAFE_ALLOCA;
7756
7757   attrs = CODING_ID_ATTRS (coding->id);
7758   if (coding->encoder == encode_coding_raw_text)
7759     translation_table = Qnil, max_lookup = 0;
7760   else
7761     translation_table = get_translation_table (attrs, 1, &max_lookup);
7762
7763   if (BUFFERP (coding->dst_object))
7764     {
7765       set_buffer_internal (XBUFFER (coding->dst_object));
7766       coding->dst_multibyte
7767         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7768     }
7769
7770   coding->consumed = coding->consumed_char = 0;
7771   coding->produced = coding->produced_char = 0;
7772   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7773   coding->errors = 0;
7774
7775   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7776
7777   if (coding->encoder == encode_coding_ccl)
7778     {
7779       coding->spec.ccl = &cclspec;
7780       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7781     }
7782   do {
7783     coding_set_source (coding);
7784     consume_chars (coding, translation_table, max_lookup);
7785     coding_set_destination (coding);
7786     (*(coding->encoder)) (coding);
7787   } while (coding->consumed_char < coding->src_chars);
7788
7789   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7790     insert_from_gap (coding->produced_char, coding->produced, 0);
7791
7792   SAFE_FREE ();
7793 }
7794
7795
7796 /* Name (or base name) of work buffer for code conversion.  */
7797 static Lisp_Object Vcode_conversion_workbuf_name;
7798
7799 /* A working buffer used by the top level conversion.  Once it is
7800    created, it is never destroyed.  It has the name
7801    Vcode_conversion_workbuf_name.  The other working buffers are
7802    destroyed after the use is finished, and their names are modified
7803    versions of Vcode_conversion_workbuf_name.  */
7804 static Lisp_Object Vcode_conversion_reused_workbuf;
7805
7806 /* True if and only if Vcode_conversion_reused_workbuf is already in
        use.  Set by make_conversion_work_buffer when it hands out the
        reused buffer, and cleared by code_conversion_restore.  */
7807 static bool reused_workbuf_in_use;
7808
7809
7810 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7811    multibyteness of returning buffer.  */
7812
7813 static Lisp_Object
7814 make_conversion_work_buffer (bool multibyte)
7815 {
7816   Lisp_Object name, workbuf;
7817   struct buffer *current;
7818
7819   if (reused_workbuf_in_use)
7820     {
7821       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7822       workbuf = Fget_buffer_create (name);
7823     }
7824   else
7825     {
7826       reused_workbuf_in_use = 1;
7827       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7828         Vcode_conversion_reused_workbuf
7829           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7830       workbuf = Vcode_conversion_reused_workbuf;
7831     }
7832   current = current_buffer;
7833   set_buffer_internal (XBUFFER (workbuf));
7834   /* We can't allow modification hooks to run in the work buffer.  For
7835      instance, directory_files_internal assumes that file decoding
7836      doesn't compile new regexps.  */
7837   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7838   Ferase_buffer ();
7839   bset_undo_list (current_buffer, Qt);
7840   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7841   set_buffer_internal (current);
7842   return workbuf;
7843 }
7844
7845
7846 static void
7847 code_conversion_restore (Lisp_Object arg)
7848 {
7849   Lisp_Object current, workbuf;
7850   struct gcpro gcpro1;
7851
7852   GCPRO1 (arg);
7853   current = XCAR (arg);
7854   workbuf = XCDR (arg);
7855   if (! NILP (workbuf))
7856     {
7857       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7858         reused_workbuf_in_use = 0;
7859       else
7860         Fkill_buffer (workbuf);
7861     }
7862   set_buffer_internal (XBUFFER (current));
7863   UNGCPRO;
7864 }
7865
7866 Lisp_Object
7867 code_conversion_save (bool with_work_buf, bool multibyte)
7868 {
7869   Lisp_Object workbuf = Qnil;
7870
7871   if (with_work_buf)
7872     workbuf = make_conversion_work_buffer (multibyte);
7873   record_unwind_protect (code_conversion_restore,
7874                          Fcons (Fcurrent_buffer (), workbuf));
7875   return workbuf;
7876 }
7877
     /* Decode CHARS characters (BYTES bytes) of text held in the gap of
        the current buffer, using CODING, and insert the result at point.
        The current buffer is both the source and the destination.

        When the text qualifies for the shortcut below (ASCII-compatible
        coding, unibyte source, no post-read function, no translation
        table, and text that is pure ASCII or already-validated UTF-8),
        only EOL conversion is done, in place, and the text is inserted
        straight from the gap.  Otherwise decode_coding is run with
        CODING_MODE_LAST_BLOCK set, followed by the coding system's
        post-read function, if any.

        CODING->produced and CODING->produced_char receive the resulting
        byte and character counts.  */

7878 void
7879 decode_coding_gap (struct coding_system *coding,
7880                    ptrdiff_t chars, ptrdiff_t bytes)
7881 {
7882   ptrdiff_t count = SPECPDL_INDEX ();
7883   Lisp_Object attrs;
7884
7885   coding->src_object = Fcurrent_buffer ();
7886   coding->src_chars = chars;
7887   coding->src_bytes = bytes;
       /* Negative source positions: decode_coding documents a
          non-positive src_pos as meaning that the text is in the gap
          area.  */
7888   coding->src_pos = -chars;
7889   coding->src_pos_byte = -bytes;
       /* The source counts as multibyte when it has fewer characters
          than bytes.  */
7890   coding->src_multibyte = chars < bytes;
7891   coding->dst_object = coding->src_object;
7892   coding->dst_pos = PT;
7893   coding->dst_pos_byte = PT_BYTE;
7894   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7895
       /* Reset the detection results; -1 marks a value as not yet
          computed (see the tests against 0 below).  */
7896   coding->head_ascii = -1;
7897   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7898   coding->eol_seen = EOL_SEEN_NONE;
7899   if (CODING_REQUIRE_DETECTION (coding))
7900     detect_coding (coding);
7901   attrs = CODING_ID_ATTRS (coding->id);
       /* Shortcut: see whether the text can be inserted without running
          the decoder.  */
7902   if (! disable_ascii_optimization
7903       && ! coding->src_multibyte
7904       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7905       && NILP (CODING_ATTR_POST_READ (attrs))
7906       && NILP (get_translation_table (attrs, 0, NULL)))
7907     {
           /* Reuse the value left in head_ascii by detection, or compute
              it now.  */
7908       chars = coding->head_ascii;
7909       if (chars < 0)
7910         chars = check_ascii (coding);
7911       if (chars != bytes)
7912         {
7913           /* There exists a non-ASCII byte.  */
7914           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7915               && coding->detected_utf8_bytes == coding->src_bytes)
7916             {
7917               if (coding->detected_utf8_chars >= 0)
7918                 chars = coding->detected_utf8_chars;
7919               else
7920                 chars = check_utf_8 (coding);
                   /* Drop a leading UTF-8 BOM (one character, three
                      bytes) unless the coding system is without BOM.  */
7921               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7922                   && coding->head_ascii == 0
7923                   && coding->source[0] == UTF_8_BOM_1
7924                   && coding->source[1] == UTF_8_BOM_2
7925                   && coding->source[2] == UTF_8_BOM_3)
7926                 {
7927                   chars--;
7928                   bytes -= 3;
7929                   coding->src_bytes -= 3;
7930                 }
7931             }
7932           else
                 /* Not usable for the shortcut; fall through to the
                    full decoder.  */
7933             chars = -1;
7934         }
7935       if (chars >= 0)
7936         {
7937           Lisp_Object eol_type;
7938
7939           eol_type = CODING_ID_EOL_TYPE (coding->id);
7940           if (VECTORP (eol_type))
7941             {
7942               if (coding->eol_seen != EOL_SEEN_NONE)
7943                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7944             }
7945           if (EQ (eol_type, Qmac))
7946             {
                   /* Replace every '\r' with '\n' in place; the text
                      occupies the last src_bytes bytes of the gap.  */
7947               unsigned char *src_end = GAP_END_ADDR;
7948               unsigned char *src = src_end - coding->src_bytes;
7949
7950               while (src < src_end)
7951                 {
7952                   if (*src++ == '\r')
7953                     src[-1] = '\n';
7954                 }
7955             }
7956           else if (EQ (eol_type, Qdos))
7957             {
                   /* Copy the text backward from the end of the gap,
                      dropping the '\r' of each "\r\n" pair.  The
                      converted text ends at GAP_END_ADDR, and DIFF is
                      the number of bytes dropped.  */
7958               unsigned char *src = GAP_END_ADDR;
7959               unsigned char *src_beg = src - coding->src_bytes;
7960               unsigned char *dst = src;
7961               ptrdiff_t diff;
7962
7963               while (src_beg < src)
7964                 {
7965                   *--dst = *--src;
7966                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7967                     src--;
7968                 }
7969               diff = dst - src;
7970               bytes -= diff;
7971               chars -= diff;
7972             }
7973           coding->produced = bytes;
7974           coding->produced_char = chars;
7975           insert_from_gap (chars, bytes, 1);
7976           return;
7977         }
7978     }
       /* Full decoding.  No work buffer is requested; this only arranges
          for the current buffer to be restored on unwind.  */
7979   code_conversion_save (0, 0);
7980
7981   coding->mode |= CODING_MODE_LAST_BLOCK;
       /* Shrinking of the buffer text is inhibited for the duration of
          the decoding.  */
7982   current_buffer->text->inhibit_shrinking = 1;
7983   decode_coding (coding);
7984   current_buffer->text->inhibit_shrinking = 0;
7985
7986   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7987     {
7988       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7989       Lisp_Object val;
7990
           /* Call the post-read function with point at the start of the
              decoded text.  Its value is only checked to be a natural
              number; the produced counts are adjusted by the change in
              Z and Z_BYTE.  */
7991       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7992       val = call1 (CODING_ATTR_POST_READ (attrs),
7993                    make_number (coding->produced_char));
7994       CHECK_NATNUM (val);
7995       coding->produced_char += Z - prev_Z;
7996       coding->produced += Z_BYTE - prev_Z_BYTE;
7997     }
7998
7999   unbind_to (count, Qnil);
8000 }
8001
8002
8003 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8004    SRC_OBJECT into DST_OBJECT by coding context CODING.
8005
8006    SRC_OBJECT is a buffer, a string, or Qnil.
8007
8008    If it is a buffer, the text is at point of the buffer.  FROM and TO
8009    are positions in the buffer.
8010
8011    If it is a string, the text is at the beginning of the string.
8012    FROM and TO are indices to the string.
8013
8014    If it is nil, the text is at coding->source.  FROM and TO are
8015    indices to coding->source.
8016
8017    DST_OBJECT is a buffer, Qt, or Qnil.
8018
8019    If it is a buffer, the decoded text is inserted at point of the
8020    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8021    is deleted.
8022
8023    If it is Qt, a string is made from the decoded text, and
8024    set in CODING->dst_object.
8025
8026    If it is Qnil, the decoded text is stored at CODING->destination.
8027    The caller must allocate CODING->dst_bytes bytes at
8028    CODING->destination by xmalloc.  If the decoded text is longer than
8029    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8030  */
8031
8032 void
8033 decode_coding_object (struct coding_system *coding,
8034                       Lisp_Object src_object,
8035                       ptrdiff_t from, ptrdiff_t from_byte,
8036                       ptrdiff_t to, ptrdiff_t to_byte,
8037                       Lisp_Object dst_object)
8038 {
8039   ptrdiff_t count = SPECPDL_INDEX ();
8040   unsigned char *destination IF_LINT (= NULL);
8041   ptrdiff_t dst_bytes IF_LINT (= 0);
8042   ptrdiff_t chars = to - from;
8043   ptrdiff_t bytes = to_byte - from_byte;
8044   Lisp_Object attrs;
       /* SAVED_PT stays negative unless the source text is replaced in
          place (source and destination are the same buffer); in that
          case it records point as it was before the replacement.  */
8045   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8046   bool need_marker_adjustment = 0;
8047   Lisp_Object old_deactivate_mark;
8048
       /* Remembered here and restored just before returning.  */
8049   old_deactivate_mark = Vdeactivate_mark;
8050
       /* With a nil DST_OBJECT, remember the caller's destination block
          and its size; the block is enlarged below if the result does
          not fit.  */
8051   if (NILP (dst_object))
8052     {
8053       destination = coding->destination;
8054       dst_bytes = coding->dst_bytes;
8055     }
8056
8057   coding->src_object = src_object;
8058   coding->src_chars = chars;
8059   coding->src_bytes = bytes;
       /* The source is treated as multibyte iff it has more bytes than
          characters.  */
8060   coding->src_multibyte = chars < bytes;
8061
8062   if (STRINGP (src_object))
8063     {
8064       coding->src_pos = from;
8065       coding->src_pos_byte = from_byte;
8066     }
8067   else if (BUFFERP (src_object))
8068     {
           /* Make the source buffer current and put the gap at FROM.  */
8069       set_buffer_internal (XBUFFER (src_object));
8070       if (from != GPT)
8071         move_gap_both (from, from_byte);
8072       if (EQ (src_object, dst_object))
8073         {
8074           struct Lisp_Marker *tail;
8075
               /* In-place conversion.  Flag every marker sitting exactly
                  on a boundary of the region (FROM for insertion-type
                  markers, TO for the others) so that it can be placed
                  explicitly once the size of the result is known.  */
8076           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8077             {
8078               tail->need_adjustment
8079                 = tail->charpos == (tail->insertion_type ? from : to);
8080               need_marker_adjustment |= tail->need_adjustment;
8081             }
8082           saved_pt = PT, saved_pt_byte = PT_BYTE;
8083           TEMP_SET_PT_BOTH (from, from_byte);
               /* Cleared again in the SAVED_PT >= 0 block near the end.  */
8084           current_buffer->text->inhibit_shrinking = 1;
8085           del_range_both (from, from_byte, to, to_byte, 1);
               /* NOTE(review): the negative source positions presumably
                  tell the decoder that the deleted text is to be read
                  from the gap -- confirm against the code that consumes
                  src_pos.  */
8086           coding->src_pos = -chars;
8087           coding->src_pos_byte = -bytes;
8088         }
8089       else
8090         {
8091           coding->src_pos = from;
8092           coding->src_pos_byte = from_byte;
8093         }
8094     }
8095
       /* ATTRS must be fetched after detection, because it is looked up
          through coding->id.  NOTE(review): detect_coding presumably
          updates coding->id -- confirm.  */
8096   if (CODING_REQUIRE_DETECTION (coding))
8097     detect_coding (coding);
8098   attrs = CODING_ID_ATTRS (coding->id);
8099
       /* Choose where the decoded text goes.  A string result, and a
          nil destination combined with a post-read function, both
          decode into the buffer returned by code_conversion_save.  */
8100   if (EQ (dst_object, Qt)
8101       || (! NILP (CODING_ATTR_POST_READ (attrs))
8102           && NILP (dst_object)))
8103     {
8104       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8105       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8106       coding->dst_pos = BEG;
8107       coding->dst_pos_byte = BEG_BYTE;
8108     }
8109   else if (BUFFERP (dst_object))
8110     {
8111       code_conversion_save (0, 0);
8112       coding->dst_object = dst_object;
8113       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8114       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8115       coding->dst_multibyte
8116         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8117     }
8118   else
8119     {
8120       code_conversion_save (0, 0);
8121       coding->dst_object = Qnil;
8122       /* Most callers presume this will return a multibyte result, and they
8123          won't use `binary' or `raw-text' anyway, so let's not worry about
8124          CODING_FOR_UNIBYTE.  */
8125       coding->dst_multibyte = 1;
8126     }
8127
8128   decode_coding (coding);
8129
8130   if (BUFFERP (coding->dst_object))
8131     set_buffer_internal (XBUFFER (coding->dst_object));
8132
       /* Run the post-read function, if any, with point at the start of
          the decoded text and the number of decoded characters as its
          argument.  Whatever it does to the size of the buffer is
          folded into the produced counts.  */
8133   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8134     {
8135       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8136       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8137       Lisp_Object val;
8138
8139       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8140       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8141               old_deactivate_mark);
8142       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8143                         make_number (coding->produced_char));
8144       UNGCPRO;
8145       CHECK_NATNUM (val);
8146       coding->produced_char += Z - prev_Z;
8147       coding->produced += Z_BYTE - prev_Z_BYTE;
8148     }
8149
       /* Hand the result over in the form the caller asked for.  */
8150   if (EQ (dst_object, Qt))
8151     {
8152       coding->dst_object = Fbuffer_string ();
8153     }
8154   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8155     {
           /* The text was decoded into a buffer although the caller
              wants it at CODING->destination.  NOTE(review): the copy
              below happens only when the caller's block is too small;
              when it is large enough nothing is copied here -- confirm
              that this is intended.  */
8156       set_buffer_internal (XBUFFER (coding->dst_object));
8157       if (dst_bytes < coding->produced)
8158         {
8159           eassert (coding->produced > 0);
8160           destination = xrealloc (destination, coding->produced);
               /* Move the gap out of the produced text so that the
                  memcpy below reads one contiguous run of bytes.  */
8161           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8162             move_gap_both (BEGV, BEGV_BYTE);
8163           memcpy (destination, BEGV_ADDR, coding->produced);
8164           coding->destination = destination;
8165         }
8166     }
8167
8168   if (saved_pt >= 0)
8169     {
8170       /* This is the case of:
8171          (BUFFERP (src_object) && EQ (src_object, dst_object))
8172          As we have moved PT while replacing the original buffer
8173          contents, we must recover it now.  */
8174       set_buffer_internal (XBUFFER (src_object));
8175       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the region stays where it was; point inside
              the region goes to FROM; point after the region is shifted
              by the change in size.  In a unibyte buffer the character
              shift equals the byte shift.  */
8176       if (saved_pt < from)
8177         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8178       else if (saved_pt < from + chars)
8179         TEMP_SET_PT_BOTH (from, from_byte);
8180       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8181         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8182                           saved_pt_byte + (coding->produced - bytes));
8183       else
8184         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8185                           saved_pt_byte + (coding->produced - bytes));
8186
8187       if (need_marker_adjustment)
8188         {
8189           struct Lisp_Marker *tail;
8190
               /* Place the markers flagged earlier: insertion-type
                  markers at the start of the new text, the others at
                  its end.  */
8191           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8192             if (tail->need_adjustment)
8193               {
8194                 tail->need_adjustment = 0;
8195                 if (tail->insertion_type)
8196                   {
8197                     tail->bytepos = from_byte;
8198                     tail->charpos = from;
8199                   }
8200                 else
8201                   {
8202                     tail->bytepos = from_byte + coding->produced;
8203                     tail->charpos
8204                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8205                          ? tail->bytepos : from + coding->produced_char);
8206                   }
8207               }
8208         }
8209     }
8210
8211   Vdeactivate_mark = old_deactivate_mark;
8212   unbind_to (count, coding->dst_object);
8213 }
8214
8215
/* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or something else.  If it is
   neither a buffer nor a string, the pre-write path below reads the
   text from CODING->source.

   If DST_OBJECT is a buffer, the destination position is point of
   that buffer; but if it is the same buffer as SRC_OBJECT, the source
   text is deleted and the destination position is FROM.

   If DST_OBJECT is Qt, a unibyte string is made from the encoded
   bytes and stored in CODING->dst_object, unless
   CODING->raw_destination is set, in which case CODING->dst_object is
   set to Qnil and the bytes are left at CODING->destination.

   Otherwise CODING->dst_object is set to Qnil.

   If the coding system has a pre-write function, the text is first
   copied into a working buffer, the function is called there with the
   bounds of that buffer, and the buffer that is current afterward is
   encoded in place of the original text.  */
8216 void
8217 encode_coding_object (struct coding_system *coding,
8218                       Lisp_Object src_object,
8219                       ptrdiff_t from, ptrdiff_t from_byte,
8220                       ptrdiff_t to, ptrdiff_t to_byte,
8221                       Lisp_Object dst_object)
8222 {
8223   ptrdiff_t count = SPECPDL_INDEX ();
8224   ptrdiff_t chars = to - from;
8225   ptrdiff_t bytes = to_byte - from_byte;
8226   Lisp_Object attrs;
       /* SAVED_PT stays negative unless the source text is replaced in
          place; it then records point as it was before the deletion.  */
8227   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8228   bool need_marker_adjustment = 0;
8229   bool kill_src_buffer = 0;
8230   Lisp_Object old_deactivate_mark;
8231
       /* Remembered here and restored just before returning.  */
8232   old_deactivate_mark = Vdeactivate_mark;
8233
8234   coding->src_object = src_object;
8235   coding->src_chars = chars;
8236   coding->src_bytes = bytes;
       /* The source is treated as multibyte iff it has more bytes than
          characters.  */
8237   coding->src_multibyte = chars < bytes;
8238
8239   attrs = CODING_ID_ATTRS (coding->id);
8240
       /* In-place conversion happens only when source and destination
          are one and the same buffer.  EQ alone is not enough: it also
          holds when both objects are nil, and then there is no buffer
          whose markers should be flagged.  The markers are taken from
          the source buffer itself, which is the buffer whose markers
          are re-adjusted at the end of this function.  */
8241   if (BUFFERP (src_object) && EQ (src_object, dst_object))
8242     {
8243       struct Lisp_Marker *tail;
8244
           /* Flag every marker sitting exactly on a boundary of the
              region (FROM for insertion-type markers, TO for the
              others) so that it can be placed explicitly once the size
              of the result is known.  */
8245       for (tail = BUF_MARKERS (XBUFFER (src_object)); tail; tail = tail->next)
8246         {
8247           tail->need_adjustment
8248             = tail->charpos == (tail->insertion_type ? from : to);
8249           need_marker_adjustment |= tail->need_adjustment;
8250         }
8251     }
8252
8253   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8254     {
           /* Copy the source text into the buffer returned by
              code_conversion_save and let the pre-write function edit
              it there.  */
8255       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8256       set_buffer_internal (XBUFFER (coding->src_object));
8257       if (STRINGP (src_object))
8258         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8259       else if (BUFFERP (src_object))
8260         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8261       else
8262         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8263
           /* The BUFFERP test keeps XBUFFER away from a non-buffer when
              both objects are nil.  */
8264       if (BUFFERP (src_object) && EQ (src_object, dst_object))
8265         {
8266           set_buffer_internal (XBUFFER (src_object));
8267           saved_pt = PT, saved_pt_byte = PT_BYTE;
8268           del_range_both (from, from_byte, to, to_byte, 1);
8269           set_buffer_internal (XBUFFER (coding->src_object));
8270         }
8271
8272       {
8273         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8274
8275         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8276                 old_deactivate_mark);
8277         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8278                     make_number (BEG), make_number (Z));
8279         UNGCPRO;
8280       }
           /* The pre-write function may have left a different buffer
              current.  That buffer becomes the source, and it is killed
              at the end of this function.  */
8281       if (XBUFFER (coding->src_object) != current_buffer)
8282         kill_src_buffer = 1;
8283       coding->src_object = Fcurrent_buffer ();
8284       if (BEG != GPT)
8285         move_gap_both (BEG, BEG_BYTE);
8286       coding->src_chars = Z - BEG;
8287       coding->src_bytes = Z_BYTE - BEG_BYTE;
8288       coding->src_pos = BEG;
8289       coding->src_pos_byte = BEG_BYTE;
8290       coding->src_multibyte = Z < Z_BYTE;
8291     }
8292   else if (STRINGP (src_object))
8293     {
8294       code_conversion_save (0, 0);
8295       coding->src_pos = from;
8296       coding->src_pos_byte = from_byte;
8297     }
8298   else if (BUFFERP (src_object))
8299     {
8300       code_conversion_save (0, 0);
8301       set_buffer_internal (XBUFFER (src_object));
8302       if (EQ (src_object, dst_object))
8303         {
               /* In place: the value returned by del_range_1 for the
                  deleted region becomes the source, read from its
                  start.  */
8304           saved_pt = PT, saved_pt_byte = PT_BYTE;
8305           coding->src_object = del_range_1 (from, to, 1, 1);
8306           coding->src_pos = 0;
8307           coding->src_pos_byte = 0;
8308         }
8309       else
8310         {
               /* Move the gap only when it lies inside the region.  */
8311           if (from < GPT && to >= GPT)
8312             move_gap_both (from, from_byte);
8313           coding->src_pos = from;
8314           coding->src_pos_byte = from_byte;
8315         }
8316     }
8317   else
8318     code_conversion_save (0, 0);
8319
       /* Set up the destination.  */
8320   if (BUFFERP (dst_object))
8321     {
8322       coding->dst_object = dst_object;
8323       if (EQ (src_object, dst_object))
8324         {
8325           coding->dst_pos = from;
8326           coding->dst_pos_byte = from_byte;
8327         }
8328       else
8329         {
8330           struct buffer *current = current_buffer;
8331
               /* Visit the destination buffer only long enough to read
                  its point and to put its gap there.  */
8332           set_buffer_temp (XBUFFER (dst_object));
8333           coding->dst_pos = PT;
8334           coding->dst_pos_byte = PT_BYTE;
8335           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8336           set_buffer_temp (current);
8337         }
8338       coding->dst_multibyte
8339         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8340     }
8341   else if (EQ (dst_object, Qt))
8342     {
           /* Start with a block of one byte per source character, and
              of at least one byte.  */
8343       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8344       coding->dst_object = Qnil;
8345       coding->destination = xmalloc_atomic (dst_bytes);
8346       coding->dst_bytes = dst_bytes;
8347       coding->dst_multibyte = 0;
8348     }
8349   else
8350     {
8351       coding->dst_object = Qnil;
8352       coding->dst_multibyte = 0;
8353     }
8354
8355   encode_coding (coding);
8356
8357   if (EQ (dst_object, Qt))
8358     {
8359       if (BUFFERP (coding->dst_object))
8360         coding->dst_object = Fbuffer_string ();
8361       else if (coding->raw_destination)
8362         /* This is used to avoid creating huge Lisp string.
8363            NOTE: caller who sets `raw_destination' is also
8364            responsible for freeing `destination' buffer.  */
8365         coding->dst_object = Qnil;
8366       else
8367         {
8368           coding->dst_object
8369             = make_unibyte_string ((char *) coding->destination,
8370                                    coding->produced);
8371           xfree (coding->destination);
8372         }
8373     }
8374
8375   if (saved_pt >= 0)
8376     {
8377       /* This is the case of:
8378          (BUFFERP (src_object) && EQ (src_object, dst_object))
8379          As we have moved PT while replacing the original buffer
8380          contents, we must recover it now.  */
8381       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region stays where it was; point inside
              the region goes to FROM; point after the region is shifted
              by the change in size.  In a unibyte buffer the character
              shift equals the byte shift.  */
8382       if (saved_pt < from)
8383         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8384       else if (saved_pt < from + chars)
8385         TEMP_SET_PT_BOTH (from, from_byte);
8386       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8387         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8388                           saved_pt_byte + (coding->produced - bytes));
8389       else
8390         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8391                           saved_pt_byte + (coding->produced - bytes));
8392
8393       if (need_marker_adjustment)
8394         {
8395           struct Lisp_Marker *tail;
8396
               /* Place the markers flagged at the top of this function:
                  insertion-type markers at the start of the new text,
                  the others at its end.  */
8397           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8398             if (tail->need_adjustment)
8399               {
8400                 tail->need_adjustment = 0;
8401                 if (tail->insertion_type)
8402                   {
8403                     tail->bytepos = from_byte;
8404                     tail->charpos = from;
8405                   }
8406                 else
8407                   {
8408                     tail->bytepos = from_byte + coding->produced;
8409                     tail->charpos
8410                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8411                          ? tail->bytepos : from + coding->produced_char);
8412                   }
8413               }
8414         }
8415     }
8416
8417   if (kill_src_buffer)
8418     Fkill_buffer (coding->src_object);
8419
8420   Vdeactivate_mark = old_deactivate_mark;
8421   unbind_to (count, Qnil);
8422 }
8423
8424
8425 Lisp_Object
8426 preferred_coding_system (void)
8427 {
       /* Return the name of the coding system recorded for the category
          stored in the first element of coding_priorities.  */
8428   int id = coding_categories[coding_priorities[0]].id;
8429
8430   return CODING_ID_NAME (id);
8431 }
8432
8433 #if defined (WINDOWSNT) || defined (CYGWIN)
8434
8435 Lisp_Object
8436 from_unicode (Lisp_Object str)
8437 {
       /* Convert STR with the utf-16le coding system and return the
          result.  NOTE(review): the final 0 passed to
          code_convert_string_norecord presumably selects decoding
          (to_unicode passes 1) -- confirm.  */
8438   CHECK_STRING (str);
       /* A unibyte string with an odd number of bytes loses its last
          byte before the conversion.  */
8439   if (!STRING_MULTIBYTE (str) &&
8440       SBYTES (str) & 1)
8441     {
8442       str = Fsubstring (str, make_number (0), make_number (-1));
8443     }
8444
8445   return code_convert_string_norecord (str, Qutf_16le, 0);
8446 }
8447
8448 Lisp_Object
8449 from_unicode_buffer (const wchar_t *wstr)
8450 {
       /* Make a unibyte Lisp string from the raw bytes of the
          null-terminated wide string WSTR, plus one extra byte, and
          pass it to from_unicode.  */
8451     return from_unicode (
8452         make_unibyte_string (
8453             (char *) wstr,
8454             /* we get one of the two final 0 bytes for free. */
8455             1 + sizeof (wchar_t) * wcslen (wstr)));
8456 }
8457
8458 wchar_t *
8459 to_unicode (Lisp_Object str, Lisp_Object *buf)
8460 {
       /* Convert STR with the utf-16le coding system, store a copy of
          the result with one extra zero byte appended in *BUF, and
          return WCSDATA of that copy.  NOTE(review): WCSDATA presumably
          yields a pointer into the data of *BUF, so the returned
          pointer would be valid only as long as *BUF is -- confirm.  */
8461   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8462   /* We need to make another copy (in addition to the one made by
8463      code_convert_string_norecord) to ensure that the final string is
8464      _doubly_ zero terminated --- that is, that the string is
8465      terminated by two zero bytes and one utf-16le null character.
8466      Because strings are already terminated with a single zero byte,
8467      we just add one additional zero. */
8468   str = make_uninit_string (SBYTES (*buf) + 1);
8469   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8470   SDATA (str) [SBYTES (*buf)] = '\0';
8471   *buf = str;
8472   return WCSDATA (*buf);
8473 }
8474
8475 #endif /* WINDOWSNT || CYGWIN */
8476
8477 \f
8478 #ifdef emacs
8479 /*** 11. Emacs Lisp library functions ***/
8480
8481 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8482        doc: /* Return t if OBJECT is nil or a coding-system.
8483 See the documentation of `define-coding-system' for information
8484 about coding-system objects.  */)
8485   (Lisp_Object object)
8486 {
       /* nil, and anything for which CODING_SYSTEM_ID yields a
          non-negative id, qualifies.  */
8487   if (NILP (object)
8488       || CODING_SYSTEM_ID (object) >= 0)
8489     return Qt;
       /* Otherwise OBJECT qualifies only if it is a symbol with a
          non-nil Qcoding_system_define_form property.  */
8490   if (! SYMBOLP (object)
8491       || NILP (Fget (object, Qcoding_system_define_form)))
8492     return Qnil;
8493   return Qt;
8494 }
8495
8496 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8497        Sread_non_nil_coding_system, 1, 1, 0,
8498        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8499   (Lisp_Object prompt)
8500 {
8501   Lisp_Object val;
       /* Ask again for as long as the answer is an empty string, then
          return the answer interned as a symbol.  */
8502   do
8503     {
8504       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8505                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8506     }
8507   while (SCHARS (val) == 0);
8508   return (Fintern (val, Qnil));
8509 }
8510
8511 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8512        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8513 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8514 Ignores case when completing coding systems (all Emacs coding systems
8515 are lower-case).  */)
8516   (Lisp_Object prompt, Lisp_Object default_coding_system)
8517 {
8518   Lisp_Object val;
8519   ptrdiff_t count = SPECPDL_INDEX ();
8520
       /* A symbol given as the default is passed on as its name.  */
8521   if (SYMBOLP (default_coding_system))
8522     default_coding_system = SYMBOL_NAME (default_coding_system);
       /* Bind Qcompletion_ignore_case to t around the read only; the
          binding is undone by unbind_to right after it.  */
8523   specbind (Qcompletion_ignore_case, Qt);
8524   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8525                           Qt, Qnil, Qcoding_system_history,
8526                           default_coding_system, Qnil);
8527   unbind_to (count, Qnil);
       /* An empty result yields nil; anything else is interned.  */
8528   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8529 }
8530
8531 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8532        1, 1, 0,
8533        doc: /* Check validity of CODING-SYSTEM.
8534 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8535 It is valid if it is nil or a symbol defined as a coding system by the
8536 function `define-coding-system'.  */)
8537   (Lisp_Object coding_system)
8538 {
8539   Lisp_Object define_form;
8540
       /* If the symbol carries a pending definition form, clear the
          property first and then evaluate the form, so that the form is
          evaluated at most once per symbol.  */
8541   define_form = Fget (coding_system, Qcoding_system_define_form);
8542   if (! NILP (define_form))
8543     {
8544       Fput (coding_system, Qcoding_system_define_form, Qnil);
8545       safe_eval (define_form);
8546     }
8547   if (!NILP (Fcoding_system_p (coding_system)))
8548     return coding_system;
8549   xsignal1 (Qcoding_system_error, coding_system);
8550 }
8551
8552 \f
8553 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8554    HIGHEST, return the coding system of the highest
8555    priority among the detected coding systems.  Otherwise return a
8556    list of detected coding systems sorted by their priorities.  If
8557    MULTIBYTEP, it is assumed that the bytes are in correct
8558    multibyte form but contain only ASCII and eight-bit chars.
8559    Otherwise, the bytes are raw bytes.
8560
8561    CODING-SYSTEM controls the detection as below:
8562
8563    If it is nil, detect both text-format and eol-format.  If the
8564    text-format part of CODING-SYSTEM is already specified
8565    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8566    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8567    detect only text-format.  */
8568
8569 Lisp_Object
8570 detect_coding_system (const unsigned char *src,
8571                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8572                       bool highest, bool multibytep,
8573                       Lisp_Object coding_system)
8574 {
8575   const unsigned char *src_end = src + src_bytes;
8576   Lisp_Object attrs, eol_type;
8577   Lisp_Object val = Qnil;
8578   struct coding_system coding;
8579   ptrdiff_t id;
8580   struct coding_detection_info detect_info;
8581   enum coding_category base_category;
8582   bool null_byte_found = 0, eight_bit_found = 0;
8583
8584   if (NILP (coding_system))
8585     coding_system = Qundecided;
8586   setup_coding_system (coding_system, &coding);
8587   attrs = CODING_ID_ATTRS (coding.id);
8588   eol_type = CODING_ID_EOL_TYPE (coding.id);
8589   coding_system = CODING_ATTR_BASE_NAME (attrs);
8590
8591   coding.source = src;
8592   coding.src_chars = src_chars;
8593   coding.src_bytes = src_bytes;
8594   coding.src_multibyte = multibytep;
8595   coding.consumed = 0;
8596   coding.mode |= CODING_MODE_LAST_BLOCK;
8597   coding.head_ascii = 0;
8598
8599   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8600
8601   /* At first, detect text-format if necessary.  */
8602   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8603   if (base_category == coding_category_undecided)
8604     {
8605       enum coding_category category IF_LINT (= 0);
8606       struct coding_system *this IF_LINT (= NULL);
8607       int c, i;
8608       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8609                                        inhibit_null_byte_detection);
8610       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8611                                        inhibit_iso_escape_detection);
8612       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8613
8614       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8615       for (; src < src_end; src++)
8616         {
8617           c = *src;
8618           if (c & 0x80)
8619             {
8620               eight_bit_found = 1;
8621               if (null_byte_found)
8622                 break;
8623             }
8624           else if (c < 0x20)
8625             {
8626               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8627                   && ! inhibit_ied
8628                   && ! detect_info.checked)
8629                 {
8630                   if (detect_coding_iso_2022 (&coding, &detect_info))
8631                     {
8632                       /* We have scanned the whole data.  */
8633                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8634                         {
8635                           /* We didn't find an 8-bit code.  We may
8636                              have found a null-byte, but it's very
8637                              rare that a binary file confirm to
8638                              ISO-2022.  */
8639                           src = src_end;
8640                           coding.head_ascii = src - coding.source;
8641                         }
8642                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8643                       break;
8644                     }
8645                 }
8646               else if (! c && !inhibit_nbd)
8647                 {
8648                   null_byte_found = 1;
8649                   if (eight_bit_found)
8650                     break;
8651                 }
8652               if (! eight_bit_found)
8653                 coding.head_ascii++;
8654             }
8655           else if (! eight_bit_found)
8656             coding.head_ascii++;
8657         }
8658
8659       if (null_byte_found || eight_bit_found
8660           || coding.head_ascii < coding.src_bytes
8661           || detect_info.found)
8662         {
8663           if (coding.head_ascii == coding.src_bytes)
8664             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8665             for (i = 0; i < coding_category_raw_text; i++)
8666               {
8667                 category = coding_priorities[i];
8668                 this = coding_categories + category;
8669                 if (detect_info.found & (1 << category))
8670                   break;
8671               }
8672           else
8673             {
8674               if (null_byte_found)
8675                 {
8676                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8677                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8678                 }
8679               else if (prefer_utf_8
8680                        && detect_coding_utf_8 (&coding, &detect_info))
8681                 {
8682                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8683                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8684                 }
8685               for (i = 0; i < coding_category_raw_text; i++)
8686                 {
8687                   category = coding_priorities[i];
8688                   this = coding_categories + category;
8689
8690                   if (this->id < 0)
8691                     {
8692                       /* No coding system of this category is defined.  */
8693                       detect_info.rejected |= (1 << category);
8694                     }
8695                   else if (category >= coding_category_raw_text)
8696                     continue;
8697                   else if (detect_info.checked & (1 << category))
8698                     {
8699                       if (highest
8700                           && (detect_info.found & (1 << category)))
8701                         break;
8702                     }
8703                   else if ((*(this->detector)) (&coding, &detect_info)
8704                            && highest
8705                            && (detect_info.found & (1 << category)))
8706                     {
8707                       if (category == coding_category_utf_16_auto)
8708                         {
8709                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8710                             category = coding_category_utf_16_le;
8711                           else
8712                             category = coding_category_utf_16_be;
8713                         }
8714                       break;
8715                     }
8716                 }
8717             }
8718         }
8719
8720       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8721           || null_byte_found)
8722         {
8723           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8724           id = CODING_SYSTEM_ID (Qno_conversion);
8725           val = list1 (make_number (id));
8726         }
8727       else if (! detect_info.rejected && ! detect_info.found)
8728         {
8729           detect_info.found = CATEGORY_MASK_ANY;
8730           id = coding_categories[coding_category_undecided].id;
8731           val = list1 (make_number (id));
8732         }
8733       else if (highest)
8734         {
8735           if (detect_info.found)
8736             {
8737               detect_info.found = 1 << category;
8738               val = list1 (make_number (this->id));
8739             }
8740           else
8741             for (i = 0; i < coding_category_raw_text; i++)
8742               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8743                 {
8744                   detect_info.found = 1 << coding_priorities[i];
8745                   id = coding_categories[coding_priorities[i]].id;
8746                   val = list1 (make_number (id));
8747                   break;
8748                 }
8749         }
8750       else
8751         {
8752           int mask = detect_info.rejected | detect_info.found;
8753           int found = 0;
8754
8755           for (i = coding_category_raw_text - 1; i >= 0; i--)
8756             {
8757               category = coding_priorities[i];
8758               if (! (mask & (1 << category)))
8759                 {
8760                   found |= 1 << category;
8761                   id = coding_categories[category].id;
8762                   if (id >= 0)
8763                     val = list1 (make_number (id));
8764                 }
8765             }
8766           for (i = coding_category_raw_text - 1; i >= 0; i--)
8767             {
8768               category = coding_priorities[i];
8769               if (detect_info.found & (1 << category))
8770                 {
8771                   id = coding_categories[category].id;
8772                   val = Fcons (make_number (id), val);
8773                 }
8774             }
8775           detect_info.found |= found;
8776         }
8777     }
8778   else if (base_category == coding_category_utf_8_auto)
8779     {
8780       if (detect_coding_utf_8 (&coding, &detect_info))
8781         {
8782           struct coding_system *this;
8783
8784           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8785             this = coding_categories + coding_category_utf_8_sig;
8786           else
8787             this = coding_categories + coding_category_utf_8_nosig;
8788           val = list1 (make_number (this->id));
8789         }
8790     }
8791   else if (base_category == coding_category_utf_16_auto)
8792     {
8793       if (detect_coding_utf_16 (&coding, &detect_info))
8794         {
8795           struct coding_system *this;
8796
8797           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8798             this = coding_categories + coding_category_utf_16_le;
8799           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8800             this = coding_categories + coding_category_utf_16_be;
8801           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8802             this = coding_categories + coding_category_utf_16_be_nosig;
8803           else
8804             this = coding_categories + coding_category_utf_16_le_nosig;
8805           val = list1 (make_number (this->id));
8806         }
8807     }
8808   else
8809     {
8810       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8811       val = list1 (make_number (coding.id));
8812     }
8813
8814   /* Then, detect eol-format if necessary.  */
8815   {
8816     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8817     Lisp_Object tail;
8818
8819     if (VECTORP (eol_type))
8820       {
8821         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8822           {
8823             if (null_byte_found)
8824               normal_eol = EOL_SEEN_LF;
8825             else
8826               normal_eol = detect_eol (coding.source, src_bytes,
8827                                        coding_category_raw_text);
8828           }
8829         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8830                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8831           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8832                                       coding_category_utf_16_be);
8833         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8834                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8835           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8836                                       coding_category_utf_16_le);
8837       }
8838     else
8839       {
8840         if (EQ (eol_type, Qunix))
8841           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8842         else if (EQ (eol_type, Qdos))
8843           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8844         else
8845           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8846       }
8847
8848     for (tail = val; CONSP (tail); tail = XCDR (tail))
8849       {
8850         enum coding_category category;
8851         int this_eol;
8852
8853         id = XINT (XCAR (tail));
8854         attrs = CODING_ID_ATTRS (id);
8855         category = XINT (CODING_ATTR_CATEGORY (attrs));
8856         eol_type = CODING_ID_EOL_TYPE (id);
8857         if (VECTORP (eol_type))
8858           {
8859             if (category == coding_category_utf_16_be
8860                 || category == coding_category_utf_16_be_nosig)
8861               this_eol = utf_16_be_eol;
8862             else if (category == coding_category_utf_16_le
8863                      || category == coding_category_utf_16_le_nosig)
8864               this_eol = utf_16_le_eol;
8865             else
8866               this_eol = normal_eol;
8867
8868             if (this_eol == EOL_SEEN_LF)
8869               XSETCAR (tail, AREF (eol_type, 0));
8870             else if (this_eol == EOL_SEEN_CRLF)
8871               XSETCAR (tail, AREF (eol_type, 1));
8872             else if (this_eol == EOL_SEEN_CR)
8873               XSETCAR (tail, AREF (eol_type, 2));
8874             else
8875               XSETCAR (tail, CODING_ID_NAME (id));
8876           }
8877         else
8878           XSETCAR (tail, CODING_ID_NAME (id));
8879       }
8880   }
8881
8882   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8883 }
8884
8885
8886 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8887        2, 3, 0,
8888        doc: /* Detect coding system of the text in the region between START and END.
8889 Return a list of possible coding systems ordered by priority.
8890 The coding systems to try and their priorities follows what
8891 the function `coding-system-priority-list' (which see) returns.
8892
8893 If only ASCII characters are found (except for such ISO-2022 control
8894 characters as ESC), it returns a list of single element `undecided'
8895 or its subsidiary coding system according to a detected end-of-line
8896 format.
8897
8898 If optional argument HIGHEST is non-nil, return the coding system of
8899 highest priority.  */)
8900   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8901 {
8902   ptrdiff_t from, to;
8903   ptrdiff_t from_byte, to_byte;
8904
8905   validate_region (&start, &end);
8906   from = XINT (start), to = XINT (end);
8907   from_byte = CHAR_TO_BYTE (from);
8908   to_byte = CHAR_TO_BYTE (to);
8909
8910   if (from < GPT && to >= GPT)
8911     move_gap_both (to, to_byte);
8912
8913   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8914                                to - from, to_byte - from_byte,
8915                                !NILP (highest),
8916                                !NILP (BVAR (current_buffer
8917                                       , enable_multibyte_characters)),
8918                                Qnil);
8919 }
8920
8921 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8922        1, 2, 0,
8923        doc: /* Detect coding system of the text in STRING.
8924 Return a list of possible coding systems ordered by priority.
8925 The coding systems to try and their priorities follows what
8926 the function `coding-system-priority-list' (which see) returns.
8927
8928 If only ASCII characters are found (except for such ISO-2022 control
8929 characters as ESC), it returns a list of single element `undecided'
8930 or its subsidiary coding system according to a detected end-of-line
8931 format.
8932
8933 If optional argument HIGHEST is non-nil, return the coding system of
8934 highest priority.  */)
8935   (Lisp_Object string, Lisp_Object highest)
8936 {
8937   CHECK_STRING (string);
8938
8939   return detect_coding_system (SDATA (string),
8940                                SCHARS (string), SBYTES (string),
8941                                !NILP (highest), STRING_MULTIBYTE (string),
8942                                Qnil);
8943 }
8944
8945
8946 static bool
8947 char_encodable_p (int c, Lisp_Object attrs)
8948 {
8949   Lisp_Object tail;
8950   struct charset *charset;
8951   Lisp_Object translation_table;
8952
8953   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8954   if (! NILP (translation_table))
8955     c = translate_char (translation_table, c);
8956   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8957        CONSP (tail); tail = XCDR (tail))
8958     {
8959       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8960       if (CHAR_CHARSET_P (c, charset))
8961         break;
8962     }
8963   return (! NILP (tail));
8964 }
8965
8966
8967 /* Return a list of coding systems that safely encode the text between
8968    START and END.  If EXCLUDE is non-nil, it is a list of coding
8969    systems not to check.  The returned list doesn't contain any such
8970    coding systems.  In any case, if the text contains only ASCII or is
8971    unibyte, return t.  */
8972
8973 DEFUN ("find-coding-systems-region-internal",
8974        Ffind_coding_systems_region_internal,
8975        Sfind_coding_systems_region_internal, 2, 3, 0,
8976        doc: /* Internal use only.  */)
8977   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8978 {
8979   Lisp_Object coding_attrs_list, safe_codings;
8980   ptrdiff_t start_byte, end_byte;
8981   const unsigned char *p, *pbeg, *pend;
8982   int c;
8983   Lisp_Object tail, elt, work_table;
8984
8985   if (STRINGP (start))
8986     {
8987       if (!STRING_MULTIBYTE (start)
8988           || SCHARS (start) == SBYTES (start))
8989         return Qt;
8990       start_byte = 0;
8991       end_byte = SBYTES (start);
8992     }
8993   else
8994     {
8995       CHECK_NUMBER_COERCE_MARKER (start);
8996       CHECK_NUMBER_COERCE_MARKER (end);
8997       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8998         args_out_of_range (start, end);
8999       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9000         return Qt;
9001       start_byte = CHAR_TO_BYTE (XINT (start));
9002       end_byte = CHAR_TO_BYTE (XINT (end));
9003       if (XINT (end) - XINT (start) == end_byte - start_byte)
9004         return Qt;
9005
9006       if (XINT (start) < GPT && XINT (end) > GPT)
9007         {
9008           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9009             move_gap_both (XINT (start), start_byte);
9010           else
9011             move_gap_both (XINT (end), end_byte);
9012         }
9013     }
9014
9015   coding_attrs_list = Qnil;
9016   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9017     if (NILP (exclude)
9018         || NILP (Fmemq (XCAR (tail), exclude)))
9019       {
9020         Lisp_Object attrs;
9021
9022         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9023         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9024           {
9025             ASET (attrs, coding_attr_trans_tbl,
9026                   get_translation_table (attrs, 1, NULL));
9027             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9028           }
9029       }
9030
9031   if (STRINGP (start))
9032     p = pbeg = SDATA (start);
9033   else
9034     p = pbeg = BYTE_POS_ADDR (start_byte);
9035   pend = p + (end_byte - start_byte);
9036
9037   while (p < pend && ASCII_CHAR_P (*p)) p++;
9038   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9039
9040   work_table = Fmake_char_table (Qnil, Qnil);
9041   while (p < pend)
9042     {
9043       if (ASCII_CHAR_P (*p))
9044         p++;
9045       else
9046         {
9047           c = STRING_CHAR_ADVANCE (p);
9048           if (!NILP (char_table_ref (work_table, c)))
9049             /* This character was already checked.  Ignore it.  */
9050             continue;
9051
9052           charset_map_loaded = 0;
9053           for (tail = coding_attrs_list; CONSP (tail);)
9054             {
9055               elt = XCAR (tail);
9056               if (NILP (elt))
9057                 tail = XCDR (tail);
9058               else if (char_encodable_p (c, elt))
9059                 tail = XCDR (tail);
9060               else if (CONSP (XCDR (tail)))
9061                 {
9062                   XSETCAR (tail, XCAR (XCDR (tail)));
9063                   XSETCDR (tail, XCDR (XCDR (tail)));
9064                 }
9065               else
9066                 {
9067                   XSETCAR (tail, Qnil);
9068                   tail = XCDR (tail);
9069                 }
9070             }
9071           if (charset_map_loaded)
9072             {
9073               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9074
9075               if (STRINGP (start))
9076                 pbeg = SDATA (start);
9077               else
9078                 pbeg = BYTE_POS_ADDR (start_byte);
9079               p = pbeg + p_offset;
9080               pend = pbeg + pend_offset;
9081             }
9082           char_table_set (work_table, c, Qt);
9083         }
9084     }
9085
9086   safe_codings = list2 (Qraw_text, Qno_conversion);
9087   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9088     if (! NILP (XCAR (tail)))
9089       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9090
9091   return safe_codings;
9092 }
9093
9094
9095 DEFUN ("unencodable-char-position", Funencodable_char_position,
9096        Sunencodable_char_position, 3, 5, 0,
9097        doc: /* Return position of first un-encodable character in a region.
9098 START and END specify the region and CODING-SYSTEM specifies the
9099 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9100
9101 If optional 4th argument COUNT is non-nil, it specifies at most how
9102 many un-encodable characters to search.  In this case, the value is a
9103 list of positions.
9104
9105 If optional 5th argument STRING is non-nil, it is a string to search
9106 for un-encodable characters.  In that case, START and END are indexes
9107 to the string and treated as in `substring'.  */)
9108   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9109    Lisp_Object count, Lisp_Object string)
9110 {
9111   EMACS_INT n;
9112   struct coding_system coding;
9113   Lisp_Object attrs, charset_list, translation_table;
9114   Lisp_Object positions;
9115   ptrdiff_t from, to;
9116   const unsigned char *p, *stop, *pend;
9117   bool ascii_compatible;
9118
9119   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9120   attrs = CODING_ID_ATTRS (coding.id);
9121   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9122     return Qnil;
9123   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9124   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9125   translation_table = get_translation_table (attrs, 1, NULL);
9126
9127   if (NILP (string))
9128     {
9129       validate_region (&start, &end);
9130       from = XINT (start);
9131       to = XINT (end);
9132       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9133           || (ascii_compatible
9134               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9135         return Qnil;
9136       p = CHAR_POS_ADDR (from);
9137       pend = CHAR_POS_ADDR (to);
9138       if (from < GPT && to >= GPT)
9139         stop = GPT_ADDR;
9140       else
9141         stop = pend;
9142     }
9143   else
9144     {
9145       CHECK_STRING (string);
9146       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9147       if (! STRING_MULTIBYTE (string))
9148         return Qnil;
9149       p = SDATA (string) + string_char_to_byte (string, from);
9150       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9151       if (ascii_compatible && (to - from) == (pend - p))
9152         return Qnil;
9153     }
9154
9155   if (NILP (count))
9156     n = 1;
9157   else
9158     {
9159       CHECK_NATNUM (count);
9160       n = XINT (count);
9161     }
9162
9163   positions = Qnil;
9164   charset_map_loaded = 0;
9165   while (1)
9166     {
9167       int c;
9168
9169       if (ascii_compatible)
9170         while (p < stop && ASCII_CHAR_P (*p))
9171           p++, from++;
9172       if (p >= stop)
9173         {
9174           if (p >= pend)
9175             break;
9176           stop = pend;
9177           p = GAP_END_ADDR;
9178         }
9179
9180       c = STRING_CHAR_ADVANCE (p);
9181       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9182           && ! char_charset (translate_char (translation_table, c),
9183                              charset_list, NULL))
9184         {
9185           positions = Fcons (make_number (from), positions);
9186           n--;
9187           if (n == 0)
9188             break;
9189         }
9190
9191       from++;
9192       if (charset_map_loaded && NILP (string))
9193         {
9194           p = CHAR_POS_ADDR (from);
9195           pend = CHAR_POS_ADDR (to);
9196           if (from < GPT && to >= GPT)
9197             stop = GPT_ADDR;
9198           else
9199             stop = pend;
9200           charset_map_loaded = 0;
9201         }
9202     }
9203
9204   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9205 }
9206
9207
9208 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9209        Scheck_coding_systems_region, 3, 3, 0,
9210        doc: /* Check if the region is encodable by coding systems.
9211
9212 START and END are buffer positions specifying the region.
9213 CODING-SYSTEM-LIST is a list of coding systems to check.
9214
9215 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9216 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9217 whole region, POS0, POS1, ... are buffer positions where non-encodable
9218 characters are found.
9219
9220 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9221 value is nil.
9222
9223 START may be a string.  In that case, check if the string is
9224 encodable, and the value contains indices to the string instead of
9225 buffer positions.  END is ignored.
9226
9227 If the current buffer (or START if it is a string) is unibyte, the value
9228 is nil.  */)
9229   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9230 {
9231   Lisp_Object list;
9232   ptrdiff_t start_byte, end_byte;
9233   ptrdiff_t pos;
9234   const unsigned char *p, *pbeg, *pend;
9235   int c;
9236   Lisp_Object tail, elt, attrs;
9237
9238   if (STRINGP (start))
9239     {
9240       if (!STRING_MULTIBYTE (start)
9241           || SCHARS (start) == SBYTES (start))
9242         return Qnil;
9243       start_byte = 0;
9244       end_byte = SBYTES (start);
9245       pos = 0;
9246     }
9247   else
9248     {
9249       CHECK_NUMBER_COERCE_MARKER (start);
9250       CHECK_NUMBER_COERCE_MARKER (end);
9251       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9252         args_out_of_range (start, end);
9253       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9254         return Qnil;
9255       start_byte = CHAR_TO_BYTE (XINT (start));
9256       end_byte = CHAR_TO_BYTE (XINT (end));
9257       if (XINT (end) - XINT (start) == end_byte - start_byte)
9258         return Qnil;
9259
9260       if (XINT (start) < GPT && XINT (end) > GPT)
9261         {
9262           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9263             move_gap_both (XINT (start), start_byte);
9264           else
9265             move_gap_both (XINT (end), end_byte);
9266         }
9267       pos = XINT (start);
9268     }
9269
9270   list = Qnil;
9271   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9272     {
9273       elt = XCAR (tail);
9274       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9275       ASET (attrs, coding_attr_trans_tbl,
9276             get_translation_table (attrs, 1, NULL));
9277       list = Fcons (list2 (elt, attrs), list);
9278     }
9279
9280   if (STRINGP (start))
9281     p = pbeg = SDATA (start);
9282   else
9283     p = pbeg = BYTE_POS_ADDR (start_byte);
9284   pend = p + (end_byte - start_byte);
9285
9286   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9287   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9288
9289   while (p < pend)
9290     {
9291       if (ASCII_CHAR_P (*p))
9292         p++;
9293       else
9294         {
9295           c = STRING_CHAR_ADVANCE (p);
9296
9297           charset_map_loaded = 0;
9298           for (tail = list; CONSP (tail); tail = XCDR (tail))
9299             {
9300               elt = XCDR (XCAR (tail));
9301               if (! char_encodable_p (c, XCAR (elt)))
9302                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9303             }
9304           if (charset_map_loaded)
9305             {
9306               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9307
9308               if (STRINGP (start))
9309                 pbeg = SDATA (start);
9310               else
9311                 pbeg = BYTE_POS_ADDR (start_byte);
9312               p = pbeg + p_offset;
9313               pend = pbeg + pend_offset;
9314             }
9315         }
9316       pos++;
9317     }
9318
9319   tail = list;
9320   list = Qnil;
9321   for (; CONSP (tail); tail = XCDR (tail))
9322     {
9323       elt = XCAR (tail);
9324       if (CONSP (XCDR (XCDR (elt))))
9325         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9326                       list);
9327     }
9328
9329   return list;
9330 }
9331
9332
9333 static Lisp_Object
9334 code_convert_region (Lisp_Object start, Lisp_Object end,
9335                      Lisp_Object coding_system, Lisp_Object dst_object,
9336                      bool encodep, bool norecord)
9337 {
9338   struct coding_system coding;
9339   ptrdiff_t from, from_byte, to, to_byte;
9340   Lisp_Object src_object;
9341
9342   if (NILP (coding_system))
9343     coding_system = Qno_conversion;
9344   else
9345     CHECK_CODING_SYSTEM (coding_system);
9346   src_object = Fcurrent_buffer ();
9347   if (NILP (dst_object))
9348     dst_object = src_object;
9349   else if (! EQ (dst_object, Qt))
9350     CHECK_BUFFER (dst_object);
9351
9352   validate_region (&start, &end);
9353   from = XFASTINT (start);
9354   from_byte = CHAR_TO_BYTE (from);
9355   to = XFASTINT (end);
9356   to_byte = CHAR_TO_BYTE (to);
9357
9358   setup_coding_system (coding_system, &coding);
9359   coding.mode |= CODING_MODE_LAST_BLOCK;
9360
9361   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9362     {
9363       struct buffer *buf = XBUFFER (dst_object);
9364       ptrdiff_t buf_pt = BUF_PT (buf);
9365
9366       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9367     }
9368
9369   if (encodep)
9370     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9371                           dst_object);
9372   else
9373     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9374                           dst_object);
9375   if (! norecord)
9376     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9377
9378   return (BUFFERP (dst_object)
9379           ? make_number (coding.produced_char)
9380           : coding.dst_object);
9381 }
9382
9383
9384 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9385        3, 4, "r\nzCoding system: ",
9386        doc: /* Decode the current region from the specified coding system.
9387 When called from a program, takes four arguments:
9388         START, END, CODING-SYSTEM, and DESTINATION.
9389 START and END are buffer positions.
9390
9391 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9392 If nil, the region between START and END is replaced by the decoded text.
9393 If buffer, the decoded text is inserted in that buffer after point (point
9394 does not move).
9395 In those cases, the length of the decoded text is returned.
9396 If DESTINATION is t, the decoded text is returned.
9397
9398 This function sets `last-coding-system-used' to the precise coding system
9399 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9400 not fully specified.)  */)
9401   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9402 {
9403   return code_convert_region (start, end, coding_system, destination, 0, 0);
9404 }
9405
9406 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9407        3, 4, "r\nzCoding system: ",
9408        doc: /* Encode the current region by specified coding system.
9409 When called from a program, takes four arguments:
9410         START, END, CODING-SYSTEM and DESTINATION.
9411 START and END are buffer positions.
9412
9413 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9414 If nil, the region between START and END is replace by the encoded text.
9415 If buffer, the encoded text is inserted in that buffer after point (point
9416 does not move).
9417 In those cases, the length of the encoded text is returned.
9418 If DESTINATION is t, the encoded text is returned.
9419
9420 This function sets `last-coding-system-used' to the precise coding system
9421 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9422 not fully specified.)  */)
9423   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9424 {
9425   return code_convert_region (start, end, coding_system, destination, 1, 0);
9426 }
9427
9428 Lisp_Object
9429 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9430                      Lisp_Object dst_object, bool encodep, bool nocopy,
9431                      bool norecord)
9432 {
9433   struct coding_system coding;
9434   ptrdiff_t chars, bytes;
9435
9436   CHECK_STRING (string);
9437   if (NILP (coding_system))
9438     {
9439       if (! norecord)
9440         Vlast_coding_system_used = Qno_conversion;
9441       if (NILP (dst_object))
9442         return (nocopy ? Fcopy_sequence (string) : string);
9443     }
9444
9445   if (NILP (coding_system))
9446     coding_system = Qno_conversion;
9447   else
9448     CHECK_CODING_SYSTEM (coding_system);
9449   if (NILP (dst_object))
9450     dst_object = Qt;
9451   else if (! EQ (dst_object, Qt))
9452     CHECK_BUFFER (dst_object);
9453
9454   setup_coding_system (coding_system, &coding);
9455   coding.mode |= CODING_MODE_LAST_BLOCK;
9456   chars = SCHARS (string);
9457   bytes = SBYTES (string);
9458
9459   if (BUFFERP (dst_object))
9460     {
9461       struct buffer *buf = XBUFFER (dst_object);
9462       ptrdiff_t buf_pt = BUF_PT (buf);
9463
9464       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9465     }
9466
9467   if (encodep)
9468     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9469   else
9470     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9471   if (! norecord)
9472     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9473
9474   return (BUFFERP (dst_object)
9475           ? make_number (coding.produced_char)
9476           : coding.dst_object);
9477 }
9478
9479
9480 /* Encode or decode STRING according to CODING_SYSTEM.
9481    Do not set Vlast_coding_system_used.
9482
9483    This function is called only from macros DECODE_FILE and
9484    ENCODE_FILE, thus we ignore character composition.  */
9485
9486 Lisp_Object
9487 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9488                               bool encodep)
9489 {
9490   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9491 }
9492
9493 /* Encode or decode a file name, to or from a unibyte string suitable
9494    for passing to C library functions.  */
9495 Lisp_Object
9496 decode_file_name (Lisp_Object fname)
9497 {
9498 #ifdef WINDOWSNT
9499   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9500      converts the file names either to UTF-16LE or to the system ANSI
9501      codepage internally, depending on the underlying OS; see w32.c.  */
9502   if (! NILP (Fcoding_system_p (Qutf_8)))
9503     return code_convert_string_norecord (fname, Qutf_8, 0);
9504   return fname;
9505 #else  /* !WINDOWSNT */
9506   if (! NILP (Vfile_name_coding_system))
9507     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9508   else if (! NILP (Vdefault_file_name_coding_system))
9509     return code_convert_string_norecord (fname,
9510                                          Vdefault_file_name_coding_system, 0);
9511   else
9512     return fname;
9513 #endif
9514 }
9515
9516 Lisp_Object
9517 encode_file_name (Lisp_Object fname)
9518 {
9519   /* This is especially important during bootstrap and dumping, when
9520      file-name encoding is not yet known, and therefore any non-ASCII
9521      file names are unibyte strings, and could only be thrashed if we
9522      try to encode them.  */
9523   if (!STRING_MULTIBYTE (fname))
9524     return fname;
9525 #ifdef WINDOWSNT
9526   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9527      converts the file names either to UTF-16LE or to the system ANSI
9528      codepage internally, depending on the underlying OS; see w32.c.  */
9529   if (! NILP (Fcoding_system_p (Qutf_8)))
9530     return code_convert_string_norecord (fname, Qutf_8, 1);
9531   return fname;
9532 #else  /* !WINDOWSNT */
9533   if (! NILP (Vfile_name_coding_system))
9534     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9535   else if (! NILP (Vdefault_file_name_coding_system))
9536     return code_convert_string_norecord (fname,
9537                                          Vdefault_file_name_coding_system, 1);
9538   else
9539     return fname;
9540 #endif
9541 }
9542
9543 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9544        2, 4, 0,
9545        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9546
9547 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9548 if the decoding operation is trivial.
9549
9550 Optional fourth arg BUFFER non-nil means that the decoded text is
9551 inserted in that buffer after point (point does not move).  In this
9552 case, the return value is the length of the decoded text.
9553
9554 This function sets `last-coding-system-used' to the precise coding system
9555 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9556 not fully specified.)  */)
9557   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9558 {
9559   return code_convert_string (string, coding_system, buffer,
9560                               0, ! NILP (nocopy), 0);
9561 }
9562
9563 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9564        2, 4, 0,
9565        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9566
9567 Optional third arg NOCOPY non-nil means it is OK to return STRING
9568 itself if the encoding operation is trivial.
9569
9570 Optional fourth arg BUFFER non-nil means that the encoded text is
9571 inserted in that buffer after point (point does not move).  In this
9572 case, the return value is the length of the encoded text.
9573
9574 This function sets `last-coding-system-used' to the precise coding system
9575 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9576 not fully specified.)  */)
9577   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9578 {
9579   return code_convert_string (string, coding_system, buffer,
9580                               1, ! NILP (nocopy), 0);
9581 }
9582
9583 \f
9584 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9585        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9586 Return the corresponding character.  */)
9587   (Lisp_Object code)
9588 {
9589   Lisp_Object spec, attrs, val;
9590   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9591   EMACS_INT ch;
9592   int c;
9593
9594   CHECK_NATNUM (code);
9595   ch = XFASTINT (code);
9596   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9597   attrs = AREF (spec, 0);
9598
9599   if (ASCII_CHAR_P (ch)
9600       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9601     return code;
9602
9603   val = CODING_ATTR_CHARSET_LIST (attrs);
9604   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9605   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9606   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9607
9608   if (ch <= 0x7F)
9609     {
9610       c = ch;
9611       charset = charset_roman;
9612     }
9613   else if (ch >= 0xA0 && ch < 0xDF)
9614     {
9615       c = ch - 0x80;
9616       charset = charset_kana;
9617     }
9618   else
9619     {
9620       EMACS_INT c1 = ch >> 8;
9621       int c2 = ch & 0xFF;
9622
9623       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9624           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9625         error ("Invalid code: %"pI"d", ch);
9626       c = ch;
9627       SJIS_TO_JIS (c);
9628       charset = charset_kanji;
9629     }
9630   c = DECODE_CHAR (charset, c);
9631   if (c < 0)
9632     error ("Invalid code: %"pI"d", ch);
9633   return make_number (c);
9634 }
9635
9636
9637 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9638        doc: /* Encode a Japanese character CH to shift_jis encoding.
9639 Return the corresponding code in SJIS.  */)
9640   (Lisp_Object ch)
9641 {
9642   Lisp_Object spec, attrs, charset_list;
9643   int c;
9644   struct charset *charset;
9645   unsigned code;
9646
9647   CHECK_CHARACTER (ch);
9648   c = XFASTINT (ch);
9649   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9650   attrs = AREF (spec, 0);
9651
9652   if (ASCII_CHAR_P (c)
9653       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9654     return ch;
9655
9656   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9657   charset = char_charset (c, charset_list, &code);
9658   if (code == CHARSET_INVALID_CODE (charset))
9659     error ("Can't encode by shift_jis encoding: %c", c);
9660   JIS_TO_SJIS (code);
9661
9662   return make_number (code);
9663 }
9664
9665 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9666        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9667 Return the corresponding character.  */)
9668   (Lisp_Object code)
9669 {
9670   Lisp_Object spec, attrs, val;
9671   struct charset *charset_roman, *charset_big5, *charset;
9672   EMACS_INT ch;
9673   int c;
9674
9675   CHECK_NATNUM (code);
9676   ch = XFASTINT (code);
9677   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9678   attrs = AREF (spec, 0);
9679
9680   if (ASCII_CHAR_P (ch)
9681       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9682     return code;
9683
9684   val = CODING_ATTR_CHARSET_LIST (attrs);
9685   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9686   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9687
9688   if (ch <= 0x7F)
9689     {
9690       c = ch;
9691       charset = charset_roman;
9692     }
9693   else
9694     {
9695       EMACS_INT b1 = ch >> 8;
9696       int b2 = ch & 0x7F;
9697       if (b1 < 0xA1 || b1 > 0xFE
9698           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9699         error ("Invalid code: %"pI"d", ch);
9700       c = ch;
9701       charset = charset_big5;
9702     }
9703   c = DECODE_CHAR (charset, c);
9704   if (c < 0)
9705     error ("Invalid code: %"pI"d", ch);
9706   return make_number (c);
9707 }
9708
9709 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9710        doc: /* Encode the Big5 character CH to BIG5 coding system.
9711 Return the corresponding character code in Big5.  */)
9712   (Lisp_Object ch)
9713 {
9714   Lisp_Object spec, attrs, charset_list;
9715   struct charset *charset;
9716   int c;
9717   unsigned code;
9718
9719   CHECK_CHARACTER (ch);
9720   c = XFASTINT (ch);
9721   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9722   attrs = AREF (spec, 0);
9723   if (ASCII_CHAR_P (c)
9724       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9725     return ch;
9726
9727   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9728   charset = char_charset (c, charset_list, &code);
9729   if (code == CHARSET_INVALID_CODE (charset))
9730     error ("Can't encode by Big5 encoding: %c", c);
9731
9732   return make_number (code);
9733 }
9734
9735 \f
9736 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9737        Sset_terminal_coding_system_internal, 1, 2, 0,
9738        doc: /* Internal use only.  */)
9739   (Lisp_Object coding_system, Lisp_Object terminal)
9740 {
9741   struct terminal *term = get_terminal (terminal, 1);
9742   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9743   CHECK_SYMBOL (coding_system);
9744   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9745   /* We had better not send unsafe characters to terminal.  */
9746   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9747   /* Character composition should be disabled.  */
9748   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9749   terminal_coding->src_multibyte = 1;
9750   terminal_coding->dst_multibyte = 0;
9751   tset_charset_list
9752     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9753             ? coding_charset_list (terminal_coding)
9754             : list1 (make_number (charset_ascii))));
9755   return Qnil;
9756 }
9757
9758 DEFUN ("set-safe-terminal-coding-system-internal",
9759        Fset_safe_terminal_coding_system_internal,
9760        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9761        doc: /* Internal use only.  */)
9762   (Lisp_Object coding_system)
9763 {
9764   CHECK_SYMBOL (coding_system);
9765   setup_coding_system (Fcheck_coding_system (coding_system),
9766                        &safe_terminal_coding);
9767   /* Character composition should be disabled.  */
9768   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9769   safe_terminal_coding.src_multibyte = 1;
9770   safe_terminal_coding.dst_multibyte = 0;
9771   return Qnil;
9772 }
9773
9774 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9775        Sterminal_coding_system, 0, 1, 0,
9776        doc: /* Return coding system specified for terminal output on the given terminal.
9777 TERMINAL may be a terminal object, a frame, or nil for the selected
9778 frame's terminal device.  */)
9779   (Lisp_Object terminal)
9780 {
9781   struct coding_system *terminal_coding
9782     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9783   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9784
9785   /* For backward compatibility, return nil if it is `undecided'.  */
9786   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9787 }
9788
9789 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9790        Sset_keyboard_coding_system_internal, 1, 2, 0,
9791        doc: /* Internal use only.  */)
9792   (Lisp_Object coding_system, Lisp_Object terminal)
9793 {
9794   struct terminal *t = get_terminal (terminal, 1);
9795   CHECK_SYMBOL (coding_system);
9796   if (NILP (coding_system))
9797     coding_system = Qno_conversion;
9798   else
9799     Fcheck_coding_system (coding_system);
9800   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9801   /* Character composition should be disabled.  */
9802   TERMINAL_KEYBOARD_CODING (t)->common_flags
9803     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9804   return Qnil;
9805 }
9806
9807 DEFUN ("keyboard-coding-system",
9808        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9809        doc: /* Return coding system specified for decoding keyboard input.  */)
9810   (Lisp_Object terminal)
9811 {
9812   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9813                          (get_terminal (terminal, 1))->id);
9814 }
9815
9816 \f
9817 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9818        Sfind_operation_coding_system,  1, MANY, 0,
9819        doc: /* Choose a coding system for an operation based on the target name.
9820 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9821 DECODING-SYSTEM is the coding system to use for decoding
9822 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9823 for encoding (in case OPERATION does encoding).
9824
9825 The first argument OPERATION specifies an I/O primitive:
9826   For file I/O, `insert-file-contents' or `write-region'.
9827   For process I/O, `call-process', `call-process-region', or `start-process'.
9828   For network I/O, `open-network-stream'.
9829
9830 The remaining arguments should be the same arguments that were passed
9831 to the primitive.  Depending on which primitive, one of those arguments
9832 is selected as the TARGET.  For example, if OPERATION does file I/O,
9833 whichever argument specifies the file name is TARGET.
9834
9835 TARGET has a meaning which depends on OPERATION:
9836   For file I/O, TARGET is a file name (except for the special case below).
9837   For process I/O, TARGET is a process name.
9838   For network I/O, TARGET is a service name or a port number.
9839
9840 This function looks up what is specified for TARGET in
9841 `file-coding-system-alist', `process-coding-system-alist',
9842 or `network-coding-system-alist' depending on OPERATION.
9843 They may specify a coding system, a cons of coding systems,
9844 or a function symbol to call.
9845 In the last case, we call the function with one argument,
9846 which is a list of all the arguments given to this function.
9847 If the function can't decide a coding system, it can return
9848 `undecided' so that the normal code-detection is performed.
9849
9850 If OPERATION is `insert-file-contents', the argument corresponding to
9851 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9852 file name to look up, and BUFFER is a buffer that contains the file's
9853 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9854 function to call for FILENAME, that function should examine the
9855 contents of BUFFER instead of reading the file.
9856
9857 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9858   (ptrdiff_t nargs, Lisp_Object *args)
9859 {
9860   Lisp_Object operation, target_idx, target, val;
9861   register Lisp_Object chain;
9862
9863   if (nargs < 2)
9864     error ("Too few arguments");
9865   operation = args[0];
9866   if (!SYMBOLP (operation)
9867       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9868     error ("Invalid first argument");
9869   if (nargs <= 1 + XFASTINT (target_idx))
9870     error ("Too few arguments for operation `%s'",
9871            SDATA (SYMBOL_NAME (operation)));
9872   target = args[XFASTINT (target_idx) + 1];
9873   if (!(STRINGP (target)
9874         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9875             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9876         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9877     error ("Invalid argument %"pI"d of operation `%s'",
9878            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9879   if (CONSP (target))
9880     target = XCAR (target);
9881
9882   chain = ((EQ (operation, Qinsert_file_contents)
9883             || EQ (operation, Qwrite_region))
9884            ? Vfile_coding_system_alist
9885            : (EQ (operation, Qopen_network_stream)
9886               ? Vnetwork_coding_system_alist
9887               : Vprocess_coding_system_alist));
9888   if (NILP (chain))
9889     return Qnil;
9890
9891   for (; CONSP (chain); chain = XCDR (chain))
9892     {
9893       Lisp_Object elt;
9894
9895       elt = XCAR (chain);
9896       if (CONSP (elt)
9897           && ((STRINGP (target)
9898                && STRINGP (XCAR (elt))
9899                && fast_string_match (XCAR (elt), target) >= 0)
9900               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9901         {
9902           val = XCDR (elt);
9903           /* Here, if VAL is both a valid coding system and a valid
9904              function symbol, we return VAL as a coding system.  */
9905           if (CONSP (val))
9906             return val;
9907           if (! SYMBOLP (val))
9908             return Qnil;
9909           if (! NILP (Fcoding_system_p (val)))
9910             return Fcons (val, val);
9911           if (! NILP (Ffboundp (val)))
9912             {
9913               /* We use call1 rather than safe_call1
9914                  so as to get bug reports about functions called here
9915                  which don't handle the current interface.  */
9916               val = call1 (val, Flist (nargs, args));
9917               if (CONSP (val))
9918                 return val;
9919               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9920                 return Fcons (val, val);
9921             }
9922           return Qnil;
9923         }
9924     }
9925   return Qnil;
9926 }
9927
9928 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9929        Sset_coding_system_priority, 0, MANY, 0,
9930        doc: /* Assign higher priority to the coding systems given as arguments.
9931 If multiple coding systems belong to the same category,
9932 all but the first one are ignored.
9933
9934 usage: (set-coding-system-priority &rest coding-systems)  */)
9935   (ptrdiff_t nargs, Lisp_Object *args)
9936 {
9937   ptrdiff_t i, j;
9938   bool changed[coding_category_max];
9939   enum coding_category priorities[coding_category_max];
9940
9941   memset (changed, 0, sizeof changed);
9942
9943   for (i = j = 0; i < nargs; i++)
9944     {
9945       enum coding_category category;
9946       Lisp_Object spec, attrs;
9947
9948       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9949       attrs = AREF (spec, 0);
9950       category = XINT (CODING_ATTR_CATEGORY (attrs));
9951       if (changed[category])
9952         /* Ignore this coding system because a coding system of the
9953            same category already had a higher priority.  */
9954         continue;
9955       changed[category] = 1;
9956       priorities[j++] = category;
9957       if (coding_categories[category].id >= 0
9958           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9959         setup_coding_system (args[i], &coding_categories[category]);
9960       Fset (AREF (Vcoding_category_table, category), args[i]);
9961     }
9962
9963   /* Now we have decided top J priorities.  Reflect the order of the
9964      original priorities to the remaining priorities.  */
9965
9966   for (i = j, j = 0; i < coding_category_max; i++, j++)
9967     {
9968       while (j < coding_category_max
9969              && changed[coding_priorities[j]])
9970         j++;
9971       if (j == coding_category_max)
9972         emacs_abort ();
9973       priorities[i] = coding_priorities[j];
9974     }
9975
9976   memcpy (coding_priorities, priorities, sizeof priorities);
9977
9978   /* Update `coding-category-list'.  */
9979   Vcoding_category_list = Qnil;
9980   for (i = coding_category_max; i-- > 0; )
9981     Vcoding_category_list
9982       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9983                Vcoding_category_list);
9984
9985   return Qnil;
9986 }
9987
9988 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9989        Scoding_system_priority_list, 0, 1, 0,
9990        doc: /* Return a list of coding systems ordered by their priorities.
9991 The list contains a subset of coding systems; i.e. coding systems
9992 assigned to each coding category (see `coding-category-list').
9993
9994 HIGHESTP non-nil means just return the highest priority one.  */)
9995   (Lisp_Object highestp)
9996 {
9997   int i;
9998   Lisp_Object val;
9999
10000   for (i = 0, val = Qnil; i < coding_category_max; i++)
10001     {
10002       enum coding_category category = coding_priorities[i];
10003       int id = coding_categories[category].id;
10004       Lisp_Object attrs;
10005
10006       if (id < 0)
10007         continue;
10008       attrs = CODING_ID_ATTRS (id);
10009       if (! NILP (highestp))
10010         return CODING_ATTR_BASE_NAME (attrs);
10011       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10012     }
10013   return Fnreverse (val);
10014 }
10015
10016 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10017
10018 static Lisp_Object
10019 make_subsidiaries (Lisp_Object base)
10020 {
10021   Lisp_Object subsidiaries;
10022   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10023   char *buf = alloca (base_name_len + 6);
10024   int i;
10025
10026   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10027   subsidiaries = make_uninit_vector (3);
10028   for (i = 0; i < 3; i++)
10029     {
10030       strcpy (buf + base_name_len, suffixes[i]);
10031       ASET (subsidiaries, i, intern (buf));
10032     }
10033   return subsidiaries;
10034 }
10035
10036
10037 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10038        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10039        doc: /* For internal use only.
10040 usage: (define-coding-system-internal ...)  */)
10041   (ptrdiff_t nargs, Lisp_Object *args)
10042 {
10043   Lisp_Object name;
10044   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10045   Lisp_Object attrs;            /* Vector of attributes.  */
10046   Lisp_Object eol_type;
10047   Lisp_Object aliases;
10048   Lisp_Object coding_type, charset_list, safe_charsets;
10049   enum coding_category category;
10050   Lisp_Object tail, val;
10051   int max_charset_id = 0;
10052   int i;
10053
10054   if (nargs < coding_arg_max)
10055     goto short_args;
10056
10057   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10058
10059   name = args[coding_arg_name];
10060   CHECK_SYMBOL (name);
10061   ASET (attrs, coding_attr_base_name, name);
10062
10063   val = args[coding_arg_mnemonic];
10064   if (! STRINGP (val))
10065     CHECK_CHARACTER (val);
10066   ASET (attrs, coding_attr_mnemonic, val);
10067
10068   coding_type = args[coding_arg_coding_type];
10069   CHECK_SYMBOL (coding_type);
10070   ASET (attrs, coding_attr_type, coding_type);
10071
10072   charset_list = args[coding_arg_charset_list];
10073   if (SYMBOLP (charset_list))
10074     {
10075       if (EQ (charset_list, Qiso_2022))
10076         {
10077           if (! EQ (coding_type, Qiso_2022))
10078             error ("Invalid charset-list");
10079           charset_list = Viso_2022_charset_list;
10080         }
10081       else if (EQ (charset_list, Qemacs_mule))
10082         {
10083           if (! EQ (coding_type, Qemacs_mule))
10084             error ("Invalid charset-list");
10085           charset_list = Vemacs_mule_charset_list;
10086         }
10087       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10088         {
10089           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10090             error ("Invalid charset-list");
10091           if (max_charset_id < XFASTINT (XCAR (tail)))
10092             max_charset_id = XFASTINT (XCAR (tail));
10093         }
10094     }
10095   else
10096     {
10097       charset_list = Fcopy_sequence (charset_list);
10098       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10099         {
10100           struct charset *charset;
10101
10102           val = XCAR (tail);
10103           CHECK_CHARSET_GET_CHARSET (val, charset);
10104           if (EQ (coding_type, Qiso_2022)
10105               ? CHARSET_ISO_FINAL (charset) < 0
10106               : EQ (coding_type, Qemacs_mule)
10107               ? CHARSET_EMACS_MULE_ID (charset) < 0
10108               : 0)
10109             error ("Can't handle charset `%s'",
10110                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10111
10112           XSETCAR (tail, make_number (charset->id));
10113           if (max_charset_id < charset->id)
10114             max_charset_id = charset->id;
10115         }
10116     }
10117   ASET (attrs, coding_attr_charset_list, charset_list);
10118
10119   safe_charsets = make_uninit_string (max_charset_id + 1);
10120   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10121   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10122     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10123   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10124
10125   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10126
10127   val = args[coding_arg_decode_translation_table];
10128   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10129     CHECK_SYMBOL (val);
10130   ASET (attrs, coding_attr_decode_tbl, val);
10131
10132   val = args[coding_arg_encode_translation_table];
10133   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10134     CHECK_SYMBOL (val);
10135   ASET (attrs, coding_attr_encode_tbl, val);
10136
10137   val = args[coding_arg_post_read_conversion];
10138   CHECK_SYMBOL (val);
10139   ASET (attrs, coding_attr_post_read, val);
10140
10141   val = args[coding_arg_pre_write_conversion];
10142   CHECK_SYMBOL (val);
10143   ASET (attrs, coding_attr_pre_write, val);
10144
10145   val = args[coding_arg_default_char];
10146   if (NILP (val))
10147     ASET (attrs, coding_attr_default_char, make_number (' '));
10148   else
10149     {
10150       CHECK_CHARACTER (val);
10151       ASET (attrs, coding_attr_default_char, val);
10152     }
10153
10154   val = args[coding_arg_for_unibyte];
10155   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10156
10157   val = args[coding_arg_plist];
10158   CHECK_LIST (val);
10159   ASET (attrs, coding_attr_plist, val);
10160
10161   if (EQ (coding_type, Qcharset))
10162     {
10163       /* Generate a lisp vector of 256 elements.  Each element is nil,
10164          integer, or a list of charset IDs.
10165
10166          If Nth element is nil, the byte code N is invalid in this
10167          coding system.
10168
10169          If Nth element is a number NUM, N is the first byte of a
10170          charset whose ID is NUM.
10171
10172          If Nth element is a list of charset IDs, N is the first byte
10173          of one of them.  The list is sorted by dimensions of the
10174          charsets.  A charset of smaller dimension comes first. */
10175       val = Fmake_vector (make_number (256), Qnil);
10176
10177       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10178         {
10179           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10180           int dim = CHARSET_DIMENSION (charset);
10181           int idx = (dim - 1) * 4;
10182
10183           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10184             ASET (attrs, coding_attr_ascii_compat, Qt);
10185
10186           for (i = charset->code_space[idx];
10187                i <= charset->code_space[idx + 1]; i++)
10188             {
10189               Lisp_Object tmp, tmp2;
10190               int dim2;
10191
10192               tmp = AREF (val, i);
10193               if (NILP (tmp))
10194                 tmp = XCAR (tail);
10195               else if (NUMBERP (tmp))
10196                 {
10197                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10198                   if (dim < dim2)
10199                     tmp = list2 (XCAR (tail), tmp);
10200                   else
10201                     tmp = list2 (tmp, XCAR (tail));
10202                 }
10203               else
10204                 {
10205                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10206                     {
10207                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10208                       if (dim < dim2)
10209                         break;
10210                     }
10211                   if (NILP (tmp2))
10212                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10213                   else
10214                     {
10215                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10216                       XSETCAR (tmp2, XCAR (tail));
10217                     }
10218                 }
10219               ASET (val, i, tmp);
10220             }
10221         }
10222       ASET (attrs, coding_attr_charset_valids, val);
10223       category = coding_category_charset;
10224     }
10225   else if (EQ (coding_type, Qccl))
10226     {
10227       Lisp_Object valids;
10228
10229       if (nargs < coding_arg_ccl_max)
10230         goto short_args;
10231
10232       val = args[coding_arg_ccl_decoder];
10233       CHECK_CCL_PROGRAM (val);
10234       if (VECTORP (val))
10235         val = Fcopy_sequence (val);
10236       ASET (attrs, coding_attr_ccl_decoder, val);
10237
10238       val = args[coding_arg_ccl_encoder];
10239       CHECK_CCL_PROGRAM (val);
10240       if (VECTORP (val))
10241         val = Fcopy_sequence (val);
10242       ASET (attrs, coding_attr_ccl_encoder, val);
10243
10244       val = args[coding_arg_ccl_valids];
10245       valids = Fmake_string (make_number (256), make_number (0));
10246       for (tail = val; CONSP (tail); tail = XCDR (tail))
10247         {
10248           int from, to;
10249
10250           val = XCAR (tail);
10251           if (INTEGERP (val))
10252             {
10253               if (! (0 <= XINT (val) && XINT (val) <= 255))
10254                 args_out_of_range_3 (val, make_number (0), make_number (255));
10255               from = to = XINT (val);
10256             }
10257           else
10258             {
10259               CHECK_CONS (val);
10260               CHECK_NATNUM_CAR (val);
10261               CHECK_NUMBER_CDR (val);
10262               if (XINT (XCAR (val)) > 255)
10263                 args_out_of_range_3 (XCAR (val),
10264                                      make_number (0), make_number (255));
10265               from = XINT (XCAR (val));
10266               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10267                 args_out_of_range_3 (XCDR (val),
10268                                      XCAR (val), make_number (255));
10269               to = XINT (XCDR (val));
10270             }
10271           for (i = from; i <= to; i++)
10272             SSET (valids, i, 1);
10273         }
10274       ASET (attrs, coding_attr_ccl_valids, valids);
10275
10276       category = coding_category_ccl;
10277     }
10278   else if (EQ (coding_type, Qutf_16))
10279     {
10280       Lisp_Object bom, endian;
10281
10282       ASET (attrs, coding_attr_ascii_compat, Qnil);
10283
10284       if (nargs < coding_arg_utf16_max)
10285         goto short_args;
10286
10287       bom = args[coding_arg_utf16_bom];
10288       if (! NILP (bom) && ! EQ (bom, Qt))
10289         {
10290           CHECK_CONS (bom);
10291           val = XCAR (bom);
10292           CHECK_CODING_SYSTEM (val);
10293           val = XCDR (bom);
10294           CHECK_CODING_SYSTEM (val);
10295         }
10296       ASET (attrs, coding_attr_utf_bom, bom);
10297
10298       endian = args[coding_arg_utf16_endian];
10299       CHECK_SYMBOL (endian);
10300       if (NILP (endian))
10301         endian = Qbig;
10302       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10303         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10304       ASET (attrs, coding_attr_utf_16_endian, endian);
10305
10306       category = (CONSP (bom)
10307                   ? coding_category_utf_16_auto
10308                   : NILP (bom)
10309                   ? (EQ (endian, Qbig)
10310                      ? coding_category_utf_16_be_nosig
10311                      : coding_category_utf_16_le_nosig)
10312                   : (EQ (endian, Qbig)
10313                      ? coding_category_utf_16_be
10314                      : coding_category_utf_16_le));
10315     }
10316   else if (EQ (coding_type, Qiso_2022))
10317     {
10318       Lisp_Object initial, reg_usage, request, flags;
10319
10320       if (nargs < coding_arg_iso2022_max)
10321         goto short_args;
10322
10323       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10324       CHECK_VECTOR (initial);
10325       for (i = 0; i < 4; i++)
10326         {
10327           val = AREF (initial, i);
10328           if (! NILP (val))
10329             {
10330               struct charset *charset;
10331
10332               CHECK_CHARSET_GET_CHARSET (val, charset);
10333               ASET (initial, i, make_number (CHARSET_ID (charset)));
10334               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10335                 ASET (attrs, coding_attr_ascii_compat, Qt);
10336             }
10337           else
10338             ASET (initial, i, make_number (-1));
10339         }
10340
10341       reg_usage = args[coding_arg_iso2022_reg_usage];
10342       CHECK_CONS (reg_usage);
10343       CHECK_NUMBER_CAR (reg_usage);
10344       CHECK_NUMBER_CDR (reg_usage);
10345
10346       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10347       for (tail = request; CONSP (tail); tail = XCDR (tail))
10348         {
10349           int id;
10350           Lisp_Object tmp1;
10351
10352           val = XCAR (tail);
10353           CHECK_CONS (val);
10354           tmp1 = XCAR (val);
10355           CHECK_CHARSET_GET_ID (tmp1, id);
10356           CHECK_NATNUM_CDR (val);
10357           if (XINT (XCDR (val)) >= 4)
10358             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10359           XSETCAR (val, make_number (id));
10360         }
10361
10362       flags = args[coding_arg_iso2022_flags];
10363       CHECK_NATNUM (flags);
10364       i = XINT (flags) & INT_MAX;
10365       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10366         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10367       flags = make_number (i);
10368
10369       ASET (attrs, coding_attr_iso_initial, initial);
10370       ASET (attrs, coding_attr_iso_usage, reg_usage);
10371       ASET (attrs, coding_attr_iso_request, request);
10372       ASET (attrs, coding_attr_iso_flags, flags);
10373       setup_iso_safe_charsets (attrs);
10374
10375       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10376         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10377                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10378                     ? coding_category_iso_7_else
10379                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10380                     ? coding_category_iso_7
10381                     : coding_category_iso_7_tight);
10382       else
10383         {
10384           int id = XINT (AREF (initial, 1));
10385
10386           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10387                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10388                        || id < 0)
10389                       ? coding_category_iso_8_else
10390                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10391                       ? coding_category_iso_8_1
10392                       : coding_category_iso_8_2);
10393         }
10394       if (category != coding_category_iso_8_1
10395           && category != coding_category_iso_8_2)
10396         ASET (attrs, coding_attr_ascii_compat, Qnil);
10397     }
10398   else if (EQ (coding_type, Qemacs_mule))
10399     {
10400       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10401         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10402       ASET (attrs, coding_attr_ascii_compat, Qt);
10403       category = coding_category_emacs_mule;
10404     }
10405   else if (EQ (coding_type, Qshift_jis))
10406     {
10407
10408       struct charset *charset;
10409
10410       if (XINT (Flength (charset_list)) != 3
10411           && XINT (Flength (charset_list)) != 4)
10412         error ("There should be three or four charsets");
10413
10414       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10415       if (CHARSET_DIMENSION (charset) != 1)
10416         error ("Dimension of charset %s is not one",
10417                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10418       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10419         ASET (attrs, coding_attr_ascii_compat, Qt);
10420
10421       charset_list = XCDR (charset_list);
10422       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10423       if (CHARSET_DIMENSION (charset) != 1)
10424         error ("Dimension of charset %s is not one",
10425                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10426
10427       charset_list = XCDR (charset_list);
10428       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10429       if (CHARSET_DIMENSION (charset) != 2)
10430         error ("Dimension of charset %s is not two",
10431                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10432
10433       charset_list = XCDR (charset_list);
10434       if (! NILP (charset_list))
10435         {
10436           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10437           if (CHARSET_DIMENSION (charset) != 2)
10438             error ("Dimension of charset %s is not two",
10439                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10440         }
10441
10442       category = coding_category_sjis;
10443       Vsjis_coding_system = name;
10444     }
10445   else if (EQ (coding_type, Qbig5))
10446     {
10447       struct charset *charset;
10448
10449       if (XINT (Flength (charset_list)) != 2)
10450         error ("There should be just two charsets");
10451
10452       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10453       if (CHARSET_DIMENSION (charset) != 1)
10454         error ("Dimension of charset %s is not one",
10455                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10456       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10457         ASET (attrs, coding_attr_ascii_compat, Qt);
10458
10459       charset_list = XCDR (charset_list);
10460       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10461       if (CHARSET_DIMENSION (charset) != 2)
10462         error ("Dimension of charset %s is not two",
10463                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10464
10465       category = coding_category_big5;
10466       Vbig5_coding_system = name;
10467     }
10468   else if (EQ (coding_type, Qraw_text))
10469     {
10470       category = coding_category_raw_text;
10471       ASET (attrs, coding_attr_ascii_compat, Qt);
10472     }
10473   else if (EQ (coding_type, Qutf_8))
10474     {
10475       Lisp_Object bom;
10476
10477       if (nargs < coding_arg_utf8_max)
10478         goto short_args;
10479
10480       bom = args[coding_arg_utf8_bom];
10481       if (! NILP (bom) && ! EQ (bom, Qt))
10482         {
10483           CHECK_CONS (bom);
10484           val = XCAR (bom);
10485           CHECK_CODING_SYSTEM (val);
10486           val = XCDR (bom);
10487           CHECK_CODING_SYSTEM (val);
10488         }
10489       ASET (attrs, coding_attr_utf_bom, bom);
10490       if (NILP (bom))
10491         ASET (attrs, coding_attr_ascii_compat, Qt);
10492
10493       category = (CONSP (bom) ? coding_category_utf_8_auto
10494                   : NILP (bom) ? coding_category_utf_8_nosig
10495                   : coding_category_utf_8_sig);
10496     }
10497   else if (EQ (coding_type, Qundecided))
10498     {
10499       if (nargs < coding_arg_undecided_max)
10500         goto short_args;
10501       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10502             args[coding_arg_undecided_inhibit_null_byte_detection]);
10503       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10504             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10505       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10506             args[coding_arg_undecided_prefer_utf_8]);
10507       category = coding_category_undecided;
10508     }
10509   else
10510     error ("Invalid coding system type: %s",
10511            SDATA (SYMBOL_NAME (coding_type)));
10512
10513   ASET (attrs, coding_attr_category, make_number (category));
10514   ASET (attrs, coding_attr_plist,
10515         Fcons (QCcategory,
10516                Fcons (AREF (Vcoding_category_table, category),
10517                       CODING_ATTR_PLIST (attrs))));
10518   ASET (attrs, coding_attr_plist,
10519         Fcons (QCascii_compatible_p,
10520                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10521                       CODING_ATTR_PLIST (attrs))));
10522
10523   eol_type = args[coding_arg_eol_type];
10524   if (! NILP (eol_type)
10525       && ! EQ (eol_type, Qunix)
10526       && ! EQ (eol_type, Qdos)
10527       && ! EQ (eol_type, Qmac))
10528     error ("Invalid eol-type");
10529
10530   aliases = list1 (name);
10531
10532   if (NILP (eol_type))
10533     {
10534       eol_type = make_subsidiaries (name);
10535       for (i = 0; i < 3; i++)
10536         {
10537           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10538
10539           this_name = AREF (eol_type, i);
10540           this_aliases = list1 (this_name);
10541           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10542           this_spec = make_uninit_vector (3);
10543           ASET (this_spec, 0, attrs);
10544           ASET (this_spec, 1, this_aliases);
10545           ASET (this_spec, 2, this_eol_type);
10546           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10547           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10548           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10549           if (NILP (val))
10550             Vcoding_system_alist
10551               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10552                        Vcoding_system_alist);
10553         }
10554     }
10555
10556   spec_vec = make_uninit_vector (3);
10557   ASET (spec_vec, 0, attrs);
10558   ASET (spec_vec, 1, aliases);
10559   ASET (spec_vec, 2, eol_type);
10560
10561   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10562   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10563   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10564   if (NILP (val))
10565     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10566                                   Vcoding_system_alist);
10567
10568   {
10569     int id = coding_categories[category].id;
10570
10571     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10572       setup_coding_system (name, &coding_categories[category]);
10573   }
10574
10575   return Qnil;
10576
10577  short_args:
10578   return Fsignal (Qwrong_number_of_arguments,
10579                   Fcons (intern ("define-coding-system-internal"),
10580                          make_number (nargs)));
10581 }
10582
10583
10584 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10585        3, 3, 0,
10586        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10587   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10588 {
10589   Lisp_Object spec, attrs;
10590
10591   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10592   attrs = AREF (spec, 0);
10593   if (EQ (prop, QCmnemonic))
10594     {
10595       if (! STRINGP (val))
10596         CHECK_CHARACTER (val);
10597       ASET (attrs, coding_attr_mnemonic, val);
10598     }
10599   else if (EQ (prop, QCdefault_char))
10600     {
10601       if (NILP (val))
10602         val = make_number (' ');
10603       else
10604         CHECK_CHARACTER (val);
10605       ASET (attrs, coding_attr_default_char, val);
10606     }
10607   else if (EQ (prop, QCdecode_translation_table))
10608     {
10609       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10610         CHECK_SYMBOL (val);
10611       ASET (attrs, coding_attr_decode_tbl, val);
10612     }
10613   else if (EQ (prop, QCencode_translation_table))
10614     {
10615       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10616         CHECK_SYMBOL (val);
10617       ASET (attrs, coding_attr_encode_tbl, val);
10618     }
10619   else if (EQ (prop, QCpost_read_conversion))
10620     {
10621       CHECK_SYMBOL (val);
10622       ASET (attrs, coding_attr_post_read, val);
10623     }
10624   else if (EQ (prop, QCpre_write_conversion))
10625     {
10626       CHECK_SYMBOL (val);
10627       ASET (attrs, coding_attr_pre_write, val);
10628     }
10629   else if (EQ (prop, QCascii_compatible_p))
10630     {
10631       ASET (attrs, coding_attr_ascii_compat, val);
10632     }
10633
10634   ASET (attrs, coding_attr_plist,
10635         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10636   return val;
10637 }
10638
10639
10640 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10641        Sdefine_coding_system_alias, 2, 2, 0,
10642        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10643   (Lisp_Object alias, Lisp_Object coding_system)
10644 {
10645   Lisp_Object spec, aliases, eol_type, val;
10646
10647   CHECK_SYMBOL (alias);
10648   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10649   aliases = AREF (spec, 1);
10650   /* ALIASES should be a list of length more than zero, and the first
10651      element is a base coding system.  Append ALIAS at the tail of the
10652      list.  */
10653   while (!NILP (XCDR (aliases)))
10654     aliases = XCDR (aliases);
10655   XSETCDR (aliases, list1 (alias));
10656
10657   eol_type = AREF (spec, 2);
10658   if (VECTORP (eol_type))
10659     {
10660       Lisp_Object subsidiaries;
10661       int i;
10662
10663       subsidiaries = make_subsidiaries (alias);
10664       for (i = 0; i < 3; i++)
10665         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10666                                      AREF (eol_type, i));
10667     }
10668
10669   Fputhash (alias, spec, Vcoding_system_hash_table);
10670   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10671   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10672   if (NILP (val))
10673     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10674                                   Vcoding_system_alist);
10675
10676   return Qnil;
10677 }
10678
10679 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10680        1, 1, 0,
10681        doc: /* Return the base of CODING-SYSTEM.
10682 Any alias or subsidiary coding system is not a base coding system.  */)
10683   (Lisp_Object coding_system)
10684 {
10685   Lisp_Object spec, attrs;
10686
10687   if (NILP (coding_system))
10688     return (Qno_conversion);
10689   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10690   attrs = AREF (spec, 0);
10691   return CODING_ATTR_BASE_NAME (attrs);
10692 }
10693
10694 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10695        1, 1, 0,
10696        doc: "Return the property list of CODING-SYSTEM.")
10697   (Lisp_Object coding_system)
10698 {
10699   Lisp_Object spec, attrs;
10700
10701   if (NILP (coding_system))
10702     coding_system = Qno_conversion;
10703   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10704   attrs = AREF (spec, 0);
10705   return CODING_ATTR_PLIST (attrs);
10706 }
10707
10708
10709 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10710        1, 1, 0,
10711        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10712   (Lisp_Object coding_system)
10713 {
10714   Lisp_Object spec;
10715
10716   if (NILP (coding_system))
10717     coding_system = Qno_conversion;
10718   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10719   return AREF (spec, 1);
10720 }
10721
10722 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10723        Scoding_system_eol_type, 1, 1, 0,
10724        doc: /* Return eol-type of CODING-SYSTEM.
10725 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10726
10727 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10728 and CR respectively.
10729
10730 A vector value indicates that a format of end-of-line should be
10731 detected automatically.  Nth element of the vector is the subsidiary
10732 coding system whose eol-type is N.  */)
10733   (Lisp_Object coding_system)
10734 {
10735   Lisp_Object spec, eol_type;
10736   int n;
10737
10738   if (NILP (coding_system))
10739     coding_system = Qno_conversion;
10740   if (! CODING_SYSTEM_P (coding_system))
10741     return Qnil;
10742   spec = CODING_SYSTEM_SPEC (coding_system);
10743   eol_type = AREF (spec, 2);
10744   if (VECTORP (eol_type))
10745     return Fcopy_sequence (eol_type);
10746   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10747   return make_number (n);
10748 }
10749
10750 #endif /* emacs */
10751
10752 \f
/*** 12. Postamble ***/
10754
10755 void
10756 init_coding_once (void)
10757 {
10758   int i;
10759
10760   for (i = 0; i < coding_category_max; i++)
10761     {
10762       coding_categories[i].id = -1;
10763       coding_priorities[i] = i;
10764     }
10765
10766   /* ISO2022 specific initialize routine.  */
10767   for (i = 0; i < 0x20; i++)
10768     iso_code_class[i] = ISO_control_0;
10769   for (i = 0x21; i < 0x7F; i++)
10770     iso_code_class[i] = ISO_graphic_plane_0;
10771   for (i = 0x80; i < 0xA0; i++)
10772     iso_code_class[i] = ISO_control_1;
10773   for (i = 0xA1; i < 0xFF; i++)
10774     iso_code_class[i] = ISO_graphic_plane_1;
10775   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10776   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10777   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10778   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10779   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10780   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10781   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10782   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10783   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10784
10785   for (i = 0; i < 256; i++)
10786     {
10787       emacs_mule_bytes[i] = 1;
10788     }
10789   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10790   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10791   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10792   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10793 }
10794
10795 #ifdef emacs
10796
10797 void
10798 syms_of_coding (void)
10799 {
10800 #include "coding.x"
10801
10802   staticpro (&Vcoding_system_hash_table);
10803   {
10804     Lisp_Object args[2];
10805     args[0] = QCtest;
10806     args[1] = Qeq;
10807     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10808   }
10809
10810   staticpro (&Vsjis_coding_system);
10811   Vsjis_coding_system = Qnil;
10812
10813   staticpro (&Vbig5_coding_system);
10814   Vbig5_coding_system = Qnil;
10815
10816   staticpro (&Vcode_conversion_reused_workbuf);
10817   Vcode_conversion_reused_workbuf = Qnil;
10818
10819   staticpro (&Vcode_conversion_workbuf_name);
10820   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10821
10822   reused_workbuf_in_use = 0;
10823
10824   DEFSYM (Qcharset, "charset");
10825   DEFSYM (Qtarget_idx, "target-idx");
10826   DEFSYM (Qcoding_system_history, "coding-system-history");
10827   Fset (Qcoding_system_history, Qnil);
10828
10829   /* Target FILENAME is the first argument.  */
10830   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10831   /* Target FILENAME is the third argument.  */
10832   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10833
10834   DEFSYM (Qcall_process, "call-process");
10835   /* Target PROGRAM is the first argument.  */
10836   Fput (Qcall_process, Qtarget_idx, make_number (0));
10837
10838   DEFSYM (Qcall_process_region, "call-process-region");
10839   /* Target PROGRAM is the third argument.  */
10840   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10841
10842   DEFSYM (Qstart_process, "start-process");
10843   /* Target PROGRAM is the third argument.  */
10844   Fput (Qstart_process, Qtarget_idx, make_number (2));
10845
10846   DEFSYM (Qopen_network_stream, "open-network-stream");
10847   /* Target SERVICE is the fourth argument.  */
10848   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10849
10850   DEFSYM (Qcoding_system, "coding-system");
10851   DEFSYM (Qcoding_aliases, "coding-aliases");
10852
10853   DEFSYM (Qeol_type, "eol-type");
10854   DEFSYM (Qunix, "unix");
10855   DEFSYM (Qdos, "dos");
10856   DEFSYM (Qmac, "mac");
10857
10858   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10859   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10860   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10861   DEFSYM (Qdefault_char, "default-char");
10862   DEFSYM (Qundecided, "undecided");
10863   DEFSYM (Qno_conversion, "no-conversion");
10864   DEFSYM (Qraw_text, "raw-text");
10865
10866   DEFSYM (Qiso_2022, "iso-2022");
10867
10868   DEFSYM (Qutf_8, "utf-8");
10869   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10870
10871 #if defined (WINDOWSNT) || defined (CYGWIN)
10872   /* No, not utf-16-le: that one has a BOM.  */
10873   DEFSYM (Qutf_16le, "utf-16le");
10874 #endif
10875
10876   DEFSYM (Qutf_16, "utf-16");
10877   DEFSYM (Qbig, "big");
10878   DEFSYM (Qlittle, "little");
10879
10880   DEFSYM (Qshift_jis, "shift-jis");
10881   DEFSYM (Qbig5, "big5");
10882
10883   DEFSYM (Qcoding_system_p, "coding-system-p");
10884
10885   DEFSYM (Qcoding_system_error, "coding-system-error");
10886   Fput (Qcoding_system_error, Qerror_conditions,
10887         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10888   Fput (Qcoding_system_error, Qerror_message,
10889         build_pure_c_string ("Invalid coding system"));
10890
10891   DEFSYM (Qtranslation_table, "translation-table");
10892   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10893   DEFSYM (Qtranslation_table_id, "translation-table-id");
10894   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10895   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10896
10897   DEFSYM (Qvalid_codes, "valid-codes");
10898
10899   DEFSYM (Qemacs_mule, "emacs-mule");
10900
10901   DEFSYM (QCcategory, ":category");
10902   DEFSYM (QCmnemonic, ":mnemonic");
10903   DEFSYM (QCdefault_char, ":default-char");
10904   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10905   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10906   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10907   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10908   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10909
10910   Vcoding_category_table
10911     = Fmake_vector (make_number (coding_category_max), Qnil);
10912   staticpro (&Vcoding_category_table);
10913   /* Followings are target of code detection.  */
10914   ASET (Vcoding_category_table, coding_category_iso_7,
10915         intern_c_string ("coding-category-iso-7"));
10916   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10917         intern_c_string ("coding-category-iso-7-tight"));
10918   ASET (Vcoding_category_table, coding_category_iso_8_1,
10919         intern_c_string ("coding-category-iso-8-1"));
10920   ASET (Vcoding_category_table, coding_category_iso_8_2,
10921         intern_c_string ("coding-category-iso-8-2"));
10922   ASET (Vcoding_category_table, coding_category_iso_7_else,
10923         intern_c_string ("coding-category-iso-7-else"));
10924   ASET (Vcoding_category_table, coding_category_iso_8_else,
10925         intern_c_string ("coding-category-iso-8-else"));
10926   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10927         intern_c_string ("coding-category-utf-8-auto"));
10928   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10929         intern_c_string ("coding-category-utf-8"));
10930   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10931         intern_c_string ("coding-category-utf-8-sig"));
10932   ASET (Vcoding_category_table, coding_category_utf_16_be,
10933         intern_c_string ("coding-category-utf-16-be"));
10934   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10935         intern_c_string ("coding-category-utf-16-auto"));
10936   ASET (Vcoding_category_table, coding_category_utf_16_le,
10937         intern_c_string ("coding-category-utf-16-le"));
10938   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10939         intern_c_string ("coding-category-utf-16-be-nosig"));
10940   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10941         intern_c_string ("coding-category-utf-16-le-nosig"));
10942   ASET (Vcoding_category_table, coding_category_charset,
10943         intern_c_string ("coding-category-charset"));
10944   ASET (Vcoding_category_table, coding_category_sjis,
10945         intern_c_string ("coding-category-sjis"));
10946   ASET (Vcoding_category_table, coding_category_big5,
10947         intern_c_string ("coding-category-big5"));
10948   ASET (Vcoding_category_table, coding_category_ccl,
10949         intern_c_string ("coding-category-ccl"));
10950   ASET (Vcoding_category_table, coding_category_emacs_mule,
10951         intern_c_string ("coding-category-emacs-mule"));
10952   /* Followings are NOT target of code detection.  */
10953   ASET (Vcoding_category_table, coding_category_raw_text,
10954         intern_c_string ("coding-category-raw-text"));
10955   ASET (Vcoding_category_table, coding_category_undecided,
10956         intern_c_string ("coding-category-undecided"));
10957
10958   DEFSYM (Qinsufficient_source, "insufficient-source");
10959   DEFSYM (Qinvalid_source, "invalid-source");
10960   DEFSYM (Qinterrupted, "interrupted");
10961   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10962
10963   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10964                doc: /* List of coding systems.
10965
10966 Do not alter the value of this variable manually.  This variable should be
10967 updated by the functions `define-coding-system' and
10968 `define-coding-system-alias'.  */);
10969   Vcoding_system_list = Qnil;
10970
10971   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10972                doc: /* Alist of coding system names.
10973 Each element is one element list of coding system name.
10974 This variable is given to `completing-read' as COLLECTION argument.
10975
10976 Do not alter the value of this variable manually.  This variable should be
10977 updated by the functions `make-coding-system' and
10978 `define-coding-system-alias'.  */);
10979   Vcoding_system_alist = Qnil;
10980
10981   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10982                doc: /* List of coding-categories (symbols) ordered by priority.
10983
10984 On detecting a coding system, Emacs tries code detection algorithms
10985 associated with each coding-category one by one in this order.  When
10986 one algorithm agrees with a byte sequence of source text, the coding
10987 system bound to the corresponding coding-category is selected.
10988
10989 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10990   {
10991     int i;
10992
10993     Vcoding_category_list = Qnil;
10994     for (i = coding_category_max - 1; i >= 0; i--)
10995       Vcoding_category_list
10996         = Fcons (AREF (Vcoding_category_table, i),
10997                  Vcoding_category_list);
10998   }
10999
11000   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11001                doc: /* Specify the coding system for read operations.
11002 It is useful to bind this variable with `let', but do not set it globally.
11003 If the value is a coding system, it is used for decoding on read operation.
11004 If not, an appropriate element is used from one of the coding system alists.
11005 There are three such tables: `file-coding-system-alist',
11006 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11007   Vcoding_system_for_read = Qnil;
11008
11009   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11010                doc: /* Specify the coding system for write operations.
11011 Programs bind this variable with `let', but you should not set it globally.
11012 If the value is a coding system, it is used for encoding of output,
11013 when writing it to a file and when sending it to a file or subprocess.
11014
11015 If this does not specify a coding system, an appropriate element
11016 is used from one of the coding system alists.
11017 There are three such tables: `file-coding-system-alist',
11018 `process-coding-system-alist', and `network-coding-system-alist'.
11019 For output to files, if the above procedure does not specify a coding system,
11020 the value of `buffer-file-coding-system' is used.  */);
11021   Vcoding_system_for_write = Qnil;
11022
11023   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11024                doc: /*
11025 Coding system used in the latest file or process I/O.  */);
11026   Vlast_coding_system_used = Qnil;
11027
11028   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11029                doc: /*
11030 Error status of the last code conversion.
11031
11032 When an error was detected in the last code conversion, this variable
11033 is set to one of the following symbols.
11034   `insufficient-source'
11035   `inconsistent-eol'
11036   `invalid-source'
11037   `interrupted'
11038   `insufficient-memory'
11039 When no error was detected, the value doesn't change.  So, to check
11040 the error status of a code conversion by this variable, you must
11041 explicitly set this variable to nil before performing code
11042 conversion.  */);
11043   Vlast_code_conversion_error = Qnil;
11044
11045   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11046                doc: /*
11047 *Non-nil means always inhibit code conversion of end-of-line format.
11048 See info node `Coding Systems' and info node `Text and Binary' concerning
11049 such conversion.  */);
11050   inhibit_eol_conversion = 0;
11051
11052   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11053                doc: /*
11054 Non-nil means process buffer inherits coding system of process output.
11055 Bind it to t if the process output is to be treated as if it were a file
11056 read from some filesystem.  */);
11057   inherit_process_coding_system = 0;
11058
11059   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11060                doc: /*
11061 Alist to decide a coding system to use for a file I/O operation.
11062 The format is ((PATTERN . VAL) ...),
11063 where PATTERN is a regular expression matching a file name,
11064 VAL is a coding system, a cons of coding systems, or a function symbol.
11065 If VAL is a coding system, it is used for both decoding and encoding
11066 the file contents.
11067 If VAL is a cons of coding systems, the car part is used for decoding,
11068 and the cdr part is used for encoding.
11069 If VAL is a function symbol, the function must return a coding system
11070 or a cons of coding systems which are used as above.  The function is
11071 called with an argument that is a list of the arguments with which
11072 `find-operation-coding-system' was called.  If the function can't decide
11073 a coding system, it can return `undecided' so that the normal
11074 code-detection is performed.
11075
11076 See also the function `find-operation-coding-system'
11077 and the variable `auto-coding-alist'.  */);
11078   Vfile_coding_system_alist = Qnil;
11079
11080   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11081                doc: /*
11082 Alist to decide a coding system to use for a process I/O operation.
11083 The format is ((PATTERN . VAL) ...),
11084 where PATTERN is a regular expression matching a program name,
11085 VAL is a coding system, a cons of coding systems, or a function symbol.
11086 If VAL is a coding system, it is used for both decoding what received
11087 from the program and encoding what sent to the program.
11088 If VAL is a cons of coding systems, the car part is used for decoding,
11089 and the cdr part is used for encoding.
11090 If VAL is a function symbol, the function must return a coding system
11091 or a cons of coding systems which are used as above.
11092
11093 See also the function `find-operation-coding-system'.  */);
11094   Vprocess_coding_system_alist = Qnil;
11095
11096   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11097                doc: /*
11098 Alist to decide a coding system to use for a network I/O operation.
11099 The format is ((PATTERN . VAL) ...),
11100 where PATTERN is a regular expression matching a network service name
11101 or is a port number to connect to,
11102 VAL is a coding system, a cons of coding systems, or a function symbol.
11103 If VAL is a coding system, it is used for both decoding what received
11104 from the network stream and encoding what sent to the network stream.
11105 If VAL is a cons of coding systems, the car part is used for decoding,
11106 and the cdr part is used for encoding.
11107 If VAL is a function symbol, the function must return a coding system
11108 or a cons of coding systems which are used as above.
11109
11110 See also the function `find-operation-coding-system'.  */);
11111   Vnetwork_coding_system_alist = Qnil;
11112
11113   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11114                doc: /* Coding system to use with system messages.
11115 Also used for decoding keyboard input on X Window system.  */);
11116   Vlocale_coding_system = Qnil;
11117
11118   /* The eol mnemonics are reset in startup.el system-dependently.  */
11119   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11120                doc: /*
11121 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11122   eol_mnemonic_unix = build_pure_c_string (":");
11123
11124   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11125                doc: /*
11126 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11127   eol_mnemonic_dos = build_pure_c_string ("\\");
11128
11129   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11130                doc: /*
11131 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11132   eol_mnemonic_mac = build_pure_c_string ("/");
11133
11134   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11135                doc: /*
11136 *String displayed in mode line when end-of-line format is not yet determined.  */);
11137   eol_mnemonic_undecided = build_pure_c_string (":");
11138
11139   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11140                doc: /*
11141 *Non-nil enables character translation while encoding and decoding.  */);
11142   Venable_character_translation = Qt;
11143
11144   DEFVAR_LISP ("standard-translation-table-for-decode",
11145                Vstandard_translation_table_for_decode,
11146                doc: /* Table for translating characters while decoding.  */);
11147   Vstandard_translation_table_for_decode = Qnil;
11148
11149   DEFVAR_LISP ("standard-translation-table-for-encode",
11150                Vstandard_translation_table_for_encode,
11151                doc: /* Table for translating characters while encoding.  */);
11152   Vstandard_translation_table_for_encode = Qnil;
11153
11154   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11155                doc: /* Alist of charsets vs revision numbers.
11156 While encoding, if a charset (car part of an element) is found,
11157 designate it with the escape sequence identifying revision (cdr part
11158 of the element).  */);
11159   Vcharset_revision_table = Qnil;
11160
11161   DEFVAR_LISP ("default-process-coding-system",
11162                Vdefault_process_coding_system,
11163                doc: /* Cons of coding systems used for process I/O by default.
11164 The car part is used for decoding a process output,
11165 the cdr part is used for encoding a text to be sent to a process.  */);
11166   Vdefault_process_coding_system = Qnil;
11167
11168   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11169                doc: /*
11170 Table of extra Latin codes in the range 128..159 (inclusive).
11171 This is a vector of length 256.
11172 If Nth element is non-nil, the existence of code N in a file
11173 \(or output of subprocess) doesn't prevent it to be detected as
11174 a coding system of ISO 2022 variant which has a flag
11175 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11176 or reading output of a subprocess.
11177 Only 128th through 159th elements have a meaning.  */);
11178   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11179
11180   DEFVAR_LISP ("select-safe-coding-system-function",
11181                Vselect_safe_coding_system_function,
11182                doc: /*
11183 Function to call to select safe coding system for encoding a text.
11184
11185 If set, this function is called to force a user to select a proper
11186 coding system which can encode the text in the case that a default
11187 coding system used in each operation can't encode the text.  The
11188 function should take care that the buffer is not modified while
11189 the coding system is being selected.
11190
11191 The default value is `select-safe-coding-system' (which see).  */);
11192   Vselect_safe_coding_system_function = Qnil;
11193
11194   DEFVAR_BOOL ("coding-system-require-warning",
11195                coding_system_require_warning,
11196                doc: /* Internal use only.
11197 If non-nil, on writing a file, `select-safe-coding-system-function' is
11198 called even if `coding-system-for-write' is non-nil.  The command
11199 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11200   coding_system_require_warning = 0;
11201
11202
11203   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11204                inhibit_iso_escape_detection,
11205                doc: /*
11206 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11207
11208 When Emacs reads text, it tries to detect how the text is encoded.
11209 This code detection is sensitive to escape sequences.  If Emacs sees
11210 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11211 of the ISO2022 encodings, and decodes text by the corresponding coding
11212 system (e.g. `iso-2022-7bit').
11213
11214 However, there may be a case that you want to read escape sequences in
11215 a file as is.  In such a case, you can set this variable to non-nil.
11216 Then the code detection will ignore any escape sequences, and no text is
11217 detected as encoded in some ISO-2022 encoding.  The result is that all
11218 escape sequences become visible in a buffer.
11219
11220 The default value is nil, and it is strongly recommended not to change
11221 it.  That is because many Emacs Lisp source files that contain
11222 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11223 in Emacs's distribution, and they won't be decoded correctly on
11224 reading if you suppress escape sequence detection.
11225
11226 The other way to read escape sequences in a file without decoding is
11227 to explicitly specify some coding system that doesn't use ISO-2022
11228 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11229   inhibit_iso_escape_detection = 0;
11230
11231   DEFVAR_BOOL ("inhibit-null-byte-detection",
11232                inhibit_null_byte_detection,
11233                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11234 By default, Emacs treats it as binary data, and does not attempt to
11235 decode it.  The effect is as if you specified `no-conversion' for
11236 reading that text.
11237
11238 Set this to non-nil when a regular text happens to include null bytes.
11239 Examples are Index nodes of Info files and null-byte delimited output
11240 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11241 decode text as usual.  */);
11242   inhibit_null_byte_detection = 0;
11243
11244   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11245                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11246 Internal use only.  Removed after the experimental optimizer gets stable. */);
11247   disable_ascii_optimization = 0;
11248
11249   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11250                doc: /* Char table for translating self-inserting characters.
11251 This is applied to the result of input methods, not their input.
11252 See also `keyboard-translate-table'.
11253
11254 Use of this variable for character code unification was rendered
11255 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11256 internal character representation.  */);
11257     Vtranslation_table_for_input = Qnil;
11258
11259   {
11260     Lisp_Object args[coding_arg_undecided_max];
11261     Lisp_Object plist[16];
11262     int i;
11263
11264     for (i = 0; i < coding_arg_undecided_max; i++)
11265       args[i] = Qnil;
11266
11267     plist[0] = intern_c_string (":name");
11268     plist[1] = args[coding_arg_name] = Qno_conversion;
11269     plist[2] = intern_c_string (":mnemonic");
11270     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11271     plist[4] = intern_c_string (":coding-type");
11272     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11273     plist[6] = intern_c_string (":ascii-compatible-p");
11274     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11275     plist[8] = intern_c_string (":default-char");
11276     plist[9] = args[coding_arg_default_char] = make_number (0);
11277     plist[10] = intern_c_string (":for-unibyte");
11278     plist[11] = args[coding_arg_for_unibyte] = Qt;
11279     plist[12] = intern_c_string (":docstring");
11280     plist[13] = build_pure_c_string ("Do no conversion.\n\
11281 \n\
11282 When you visit a file with this coding, the file is read into a\n\
11283 unibyte buffer as is, thus each byte of a file is treated as a\n\
11284 character.");
11285     plist[14] = intern_c_string (":eol-type");
11286     plist[15] = args[coding_arg_eol_type] = Qunix;
11287     args[coding_arg_plist] = Flist (16, plist);
11288     Fdefine_coding_system_internal (coding_arg_max, args);
11289
11290     plist[1] = args[coding_arg_name] = Qundecided;
11291     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11292     plist[5] = args[coding_arg_coding_type] = Qundecided;
11293     /* This is already set.
11294        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11295     plist[8] = intern_c_string (":charset-list");
11296     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11297     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11298     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11299     plist[15] = args[coding_arg_eol_type] = Qnil;
11300     args[coding_arg_plist] = Flist (16, plist);
11301     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11302     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11303     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11304   }
11305
11306   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11307
11308   {
11309     int i;
11310
11311     for (i = 0; i < coding_category_max; i++)
11312       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11313   }
11314 #if defined (DOS_NT)
11315   system_eol_type = Qdos;
11316 #else
11317   system_eol_type = Qunix;
11318 #endif
11319   staticpro (&system_eol_type);
11320 }
11321
11322 char *
11323 emacs_strerror (int error_number)
11324 {
11325   char *str;
11326
11327   synchronize_system_messages_locale ();
11328   str = strerror (error_number);
11329
11330   if (! NILP (Vlocale_coding_system))
11331     {
11332       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11333                                                       Vlocale_coding_system,
11334                                                       0);
11335       str = SSDATA (dec);
11336     }
11337
11338   return str;
11339 }
11340
11341 #endif /* emacs */