src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2012 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #include "lisp.h"
 290 #include "character.h"
 291 #include "buffer.h"
 292 #include "charset.h"
 293 #include "ccl.h"
 294 #include "composite.h"
 295 #include "coding.h"
 296 #include "window.h"
 297 #include "frame.h"
 298 #include "termhooks.h"
 299
 300 Lisp_Object Vcoding_system_hash_table;
 301
 302 static Lisp_Object Qcoding_system, Qeol_type;
 303 static Lisp_Object Qcoding_aliases;
 304 Lisp_Object Qunix, Qdos;
 305 Lisp_Object Qbuffer_file_coding_system;
 306 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 307 static Lisp_Object Qdefault_char;
 308 Lisp_Object Qno_conversion, Qundecided;
 309 Lisp_Object Qcharset, Qutf_8;
 310 static Lisp_Object Qiso_2022;
 311 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 312 static Lisp_Object Qbig, Qlittle;
 313 static Lisp_Object Qcoding_system_history;
 314 static Lisp_Object Qvalid_codes;
 315 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 316 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 317 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 318 static Lisp_Object QCascii_compatible_p;
 319
 320 Lisp_Object Qcall_process, Qcall_process_region;
 321 Lisp_Object Qstart_process, Qopen_network_stream;
 322 static Lisp_Object Qtarget_idx;
 323
 324 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 325 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 326
 327 /* If a symbol has this property, evaluate the value to define the
 328    symbol as a coding system.  */
 329 static Lisp_Object Qcoding_system_define_form;
 330
 331 /* Format of end-of-line decided by system.  This is Qunix on
 332    Unix and Mac, Qdos on DOS/Windows.
 333    This has an effect only for external encoding (i.e. for output to
 334    file and process), not for in-buffer or Lisp string encoding.  */
 335 static Lisp_Object system_eol_type;
 336
 337 #ifdef emacs
 338
 339 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 340
 341 /* Coding system emacs-mule and raw-text are for converting only
 342    end-of-line format.  */
 343 Lisp_Object Qemacs_mule, Qraw_text;
 344 Lisp_Object Qutf_8_emacs;
 345
 346 #if defined (WINDOWSNT) || defined (HAVE_NTGUI)
 347 static Lisp_Object Qutf_16le;
 348 #endif
 349
 350 /* Coding-systems are handed between Emacs Lisp programs and C internal
 351    routines by the following three variables.  */
 352 /* Coding system to be used to encode text for terminal display when
 353    terminal coding system is nil.  */
 354 struct coding_system safe_terminal_coding;
 355
 356 #endif /* emacs */
 357
 358 Lisp_Object Qtranslation_table;
 359 Lisp_Object Qtranslation_table_id;
 360 static Lisp_Object Qtranslation_table_for_decode;
 361 static Lisp_Object Qtranslation_table_for_encode;
 362
 363 /* Two special coding systems.  */
 364 static Lisp_Object Vsjis_coding_system;
 365 static Lisp_Object Vbig5_coding_system;
 366
 367 /* ISO2022 section */
 368
 369 #define CODING_ISO_INITIAL(coding, reg)                 \
 370   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 371                      coding_attr_iso_initial),          \
 372                reg)))
 373
 374
 375 #define CODING_ISO_REQUEST(coding, charset_id)          \
 376   (((charset_id) <= (coding)->max_charset_id            \
 377     ? ((coding)->safe_charsets[charset_id] != 255       \
 378        ? (coding)->safe_charsets[charset_id]            \
 379        : -1)                                            \
 380     : -1))
 381
 382
 383 #define CODING_ISO_FLAGS(coding)        \
 384   ((coding)->spec.iso_2022.flags)
 385 #define CODING_ISO_DESIGNATION(coding, reg)     \
 386   ((coding)->spec.iso_2022.current_designation[reg])
 387 #define CODING_ISO_INVOCATION(coding, plane)    \
 388   ((coding)->spec.iso_2022.current_invocation[plane])
 389 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 390   ((coding)->spec.iso_2022.single_shifting)
 391 #define CODING_ISO_BOL(coding)  \
 392   ((coding)->spec.iso_2022.bol)
 393 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 394   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 395 #define CODING_ISO_CMP_STATUS(coding)   \
 396   (&(coding)->spec.iso_2022.cmp_status)
 397 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 398   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 399 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 400   ((coding)->spec.iso_2022.embedded_utf_8)
 401
 402 /* Control characters of ISO2022.  */
 403                         /* code */      /* function */
 404 #define ISO_CODE_SO     0x0E            /* shift-out */
 405 #define ISO_CODE_SI     0x0F            /* shift-in */
 406 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 407 #define ISO_CODE_ESC    0x1B            /* escape */
 408 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 409 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 410 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 411
 412 /* All code (1-byte) of ISO2022 is classified into one of the
 413    followings.  */
 414 enum iso_code_class_type
 415   {
 416     ISO_control_0,              /* Control codes in the range
 417                                    0x00..0x1F and 0x7F, except for the
 418                                    following 5 codes.  */
 419     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 420     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 421     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 422     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 423     ISO_control_1,              /* Control codes in the range
 424                                    0x80..0x9F, except for the
 425                                    following 3 codes.  */
 426     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 427     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 428     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 429     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 430     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 431     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 432     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 433   };
 434
 435 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 436     `iso-flags' attribute of an iso2022 coding system.  */
 437
 438 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 439    instead of the correct short-form sequence (e.g. ESC $ A).  */
 440 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 441
 442 /* If set, reset graphic planes and registers at end-of-line to the
 443    initial state.  */
 444 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 445
 446 /* If set, reset graphic planes and registers before any control
 447    characters to the initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 449
 450 /* If set, encode by 7-bit environment.  */
 451 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 452
 453 /* If set, use locking-shift function.  */
 454 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 455
 456 /* If set, use single-shift function.  Overwrite
 457    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 458 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 459
 460 /* If set, use designation escape sequence.  */
 461 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 462
 463 /* If set, produce revision number sequence.  */
 464 #define CODING_ISO_FLAG_REVISION        0x0080
 465
 466 /* If set, produce ISO6429's direction specifying sequence.  */
 467 #define CODING_ISO_FLAG_DIRECTION       0x0100
 468
 469 /* If set, assume designation states are reset at beginning of line on
 470    output.  */
 471 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 472
 473 /* If set, designation sequence should be placed at beginning of line
 474    on output.  */
 475 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 476
 477 /* If set, do not encode unsafe characters on output.  */
 478 #define CODING_ISO_FLAG_SAFE            0x0800
 479
 480 /* If set, extra latin codes (128..159) are accepted as a valid code
 481    on input.  */
 482 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 483
 484 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 485
 486 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 487
 488 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 489
 490 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 491
 492 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 493
 494 /* A character to be produced on output if encoding of the original
 495    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 496 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 497
 498 /* UTF-8 section */
 499 #define CODING_UTF_8_BOM(coding)        \
 500   ((coding)->spec.utf_8_bom)
 501
 502 /* UTF-16 section */
 503 #define CODING_UTF_16_BOM(coding)       \
 504   ((coding)->spec.utf_16.bom)
 505
 506 #define CODING_UTF_16_ENDIAN(coding)    \
 507   ((coding)->spec.utf_16.endian)
 508
 509 #define CODING_UTF_16_SURROGATE(coding) \
 510   ((coding)->spec.utf_16.surrogate)
 511
 512
 513 /* CCL section */
 514 #define CODING_CCL_DECODER(coding)      \
 515   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 516 #define CODING_CCL_ENCODER(coding)      \
 517   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 518 #define CODING_CCL_VALIDS(coding)                                          \
 519   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 520
 521 /* Index for each coding category in `coding_categories' */
 522
 523 enum coding_category
 524   {
 525     coding_category_iso_7,
 526     coding_category_iso_7_tight,
 527     coding_category_iso_8_1,
 528     coding_category_iso_8_2,
 529     coding_category_iso_7_else,
 530     coding_category_iso_8_else,
 531     coding_category_utf_8_auto,
 532     coding_category_utf_8_nosig,
 533     coding_category_utf_8_sig,
 534     coding_category_utf_16_auto,
 535     coding_category_utf_16_be,
 536     coding_category_utf_16_le,
 537     coding_category_utf_16_be_nosig,
 538     coding_category_utf_16_le_nosig,
 539     coding_category_charset,
 540     coding_category_sjis,
 541     coding_category_big5,
 542     coding_category_ccl,
 543     coding_category_emacs_mule,
 544     /* All above are targets of code detection.  */
 545     coding_category_raw_text,
 546     coding_category_undecided,
 547     coding_category_max
 548   };
 549
 550 /* Definitions of flag bits used in detect_coding_XXXX.  */
 551 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 552 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 553 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 554 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 555 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 556 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 557 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 558 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 559 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 560 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 561 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 562 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 563 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 564 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 565 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 566 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 567 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 568 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 569 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 570 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 571
 572 /* This value is returned if detect_coding_mask () find nothing other
 573    than ASCII characters.  */
 574 #define CATEGORY_MASK_ANY               \
 575   (CATEGORY_MASK_ISO_7                  \
 576    | CATEGORY_MASK_ISO_7_TIGHT          \
 577    | CATEGORY_MASK_ISO_8_1              \
 578    | CATEGORY_MASK_ISO_8_2              \
 579    | CATEGORY_MASK_ISO_7_ELSE           \
 580    | CATEGORY_MASK_ISO_8_ELSE           \
 581    | CATEGORY_MASK_UTF_8_AUTO           \
 582    | CATEGORY_MASK_UTF_8_NOSIG          \
 583    | CATEGORY_MASK_UTF_8_SIG            \
 584    | CATEGORY_MASK_UTF_16_AUTO          \
 585    | CATEGORY_MASK_UTF_16_BE            \
 586    | CATEGORY_MASK_UTF_16_LE            \
 587    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 588    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 589    | CATEGORY_MASK_CHARSET              \
 590    | CATEGORY_MASK_SJIS                 \
 591    | CATEGORY_MASK_BIG5                 \
 592    | CATEGORY_MASK_CCL                  \
 593    | CATEGORY_MASK_EMACS_MULE)
 594
 595
 596 #define CATEGORY_MASK_ISO_7BIT \
 597   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 598
 599 #define CATEGORY_MASK_ISO_8BIT \
 600   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 601
 602 #define CATEGORY_MASK_ISO_ELSE \
 603   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 604
 605 #define CATEGORY_MASK_ISO_ESCAPE        \
 606   (CATEGORY_MASK_ISO_7                  \
 607    | CATEGORY_MASK_ISO_7_TIGHT          \
 608    | CATEGORY_MASK_ISO_7_ELSE           \
 609    | CATEGORY_MASK_ISO_8_ELSE)
 610
 611 #define CATEGORY_MASK_ISO       \
 612   (  CATEGORY_MASK_ISO_7BIT     \
 613      | CATEGORY_MASK_ISO_8BIT   \
 614      | CATEGORY_MASK_ISO_ELSE)
 615
 616 #define CATEGORY_MASK_UTF_16            \
 617   (CATEGORY_MASK_UTF_16_AUTO            \
 618    | CATEGORY_MASK_UTF_16_BE            \
 619    | CATEGORY_MASK_UTF_16_LE            \
 620    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 621    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 622
 623 #define CATEGORY_MASK_UTF_8     \
 624   (CATEGORY_MASK_UTF_8_AUTO     \
 625    | CATEGORY_MASK_UTF_8_NOSIG  \
 626    | CATEGORY_MASK_UTF_8_SIG)
 627
 628 /* Table of coding categories (Lisp symbols).  This variable is for
 629    internal use only.  */
 630 static Lisp_Object Vcoding_category_table;
 631
 632 /* Table of coding-categories ordered by priority.  */
 633 static enum coding_category coding_priorities[coding_category_max];
 634
 635 /* Nth element is a coding context for the coding system bound to the
 636    Nth coding category.  */
 637 static struct coding_system coding_categories[coding_category_max];
 638
 639 /*** Commonly used macros and functions ***/
 640
 641 #ifndef min
 642 #define min(a, b) ((a) < (b) ? (a) : (b))
 643 #endif
 644 #ifndef max
 645 #define max(a, b) ((a) > (b) ? (a) : (b))
 646 #endif
 647
 648 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 649   do {                                                  \
 650     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 651     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 652   } while (0)
 653
 654
 655 /* Safely get one byte from the source text pointed by SRC which ends
 656    at SRC_END, and set C to that byte.  If there are not enough bytes
 657    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 658    and a multibyte character is found at SRC, set C to the
 659    negative value of the character code.  The caller should declare
 660    and set these variables appropriately in advance:
 661         src, src_end, multibytep */
 662
 663 #define ONE_MORE_BYTE(c)                                \
 664   do {                                                  \
 665     if (src == src_end)                                 \
 666       {                                                 \
 667         if (src_base < src)                             \
 668           record_conversion_result                      \
 669             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 670         goto no_more_source;                            \
 671       }                                                 \
 672     c = *src++;                                         \
 673     if (multibytep && (c & 0x80))                       \
 674       {                                                 \
 675         if ((c & 0xFE) == 0xC0)                         \
 676           c = ((c & 1) << 6) | *src++;                  \
 677         else                                            \
 678           {                                             \
 679             src--;                                      \
 680             c = - string_char (src, &src, NULL);        \
 681             record_conversion_result                    \
 682               (coding, CODING_RESULT_INVALID_SRC);      \
 683           }                                             \
 684       }                                                 \
 685     consumed_chars++;                                   \
 686   } while (0)
 687
 688 /* Safely get two bytes from the source text pointed by SRC which ends
 689    at SRC_END, and set C1 and C2 to those bytes while skipping the
 690    heading multibyte characters.  If there are not enough bytes in the
 691    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 692    a multibyte character is found for C2, set C2 to the negative value
 693    of the character code.  The caller should declare and set these
 694    variables appropriately in advance:
 695         src, src_end, multibytep
 696    It is intended that this macro is used in detect_coding_utf_16.  */
 697
 698 #define TWO_MORE_BYTES(c1, c2)                          \
 699   do {                                                  \
 700     do {                                                \
 701       if (src == src_end)                               \
 702         goto no_more_source;                            \
 703       c1 = *src++;                                      \
 704       if (multibytep && (c1 & 0x80))                    \
 705         {                                               \
 706           if ((c1 & 0xFE) == 0xC0)                      \
 707             c1 = ((c1 & 1) << 6) | *src++;              \
 708           else                                          \
 709             {                                           \
 710               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 711               c1 = -1;                                  \
 712             }                                           \
 713         }                                               \
 714     } while (c1 < 0);                                   \
 715     if (src == src_end)                                 \
 716       goto no_more_source;                              \
 717     c2 = *src++;                                        \
 718     if (multibytep && (c2 & 0x80))                      \
 719       {                                                 \
 720         if ((c2 & 0xFE) == 0xC0)                        \
 721           c2 = ((c2 & 1) << 6) | *src++;                \
 722         else                                            \
 723           c2 = -1;                                      \
 724       }                                                 \
 725   } while (0)
 726
 727
 728 /* Store a byte C in the place pointed by DST and increment DST to the
 729    next free point, and increment PRODUCED_CHARS.  The caller should
 730    assure that C is 0..127, and declare and set the variable `dst'
 731    appropriately in advance.
 732 */
 733
 734
 735 #define EMIT_ONE_ASCII_BYTE(c)  \
 736   do {                          \
 737     produced_chars++;           \
 738     *dst++ = (c);               \
 739   } while (0)
 740
 741
 742 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 743
 744 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 745   do {                                  \
 746     produced_chars += 2;                \
 747     *dst++ = (c1), *dst++ = (c2);       \
 748   } while (0)
 749
 750
 751 /* Store a byte C in the place pointed by DST and increment DST to the
 752    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 753    store in an appropriate multibyte form.  The caller should
 754    declare and set the variables `dst' and `multibytep' appropriately
 755    in advance.  */
 756
 757 #define EMIT_ONE_BYTE(c)                \
 758   do {                                  \
 759     produced_chars++;                   \
 760     if (multibytep)                     \
 761       {                                 \
 762         unsigned ch = (c);              \
 763         if (ch >= 0x80)                 \
 764           ch = BYTE8_TO_CHAR (ch);      \
 765         CHAR_STRING_ADVANCE (ch, dst);  \
 766       }                                 \
 767     else                                \
 768       *dst++ = (c);                     \
 769   } while (0)
 770
 771
 772 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 773
 774 #define EMIT_TWO_BYTES(c1, c2)          \
 775   do {                                  \
 776     produced_chars += 2;                \
 777     if (multibytep)                     \
 778       {                                 \
 779         unsigned ch;                    \
 780                                         \
 781         ch = (c1);                      \
 782         if (ch >= 0x80)                 \
 783           ch = BYTE8_TO_CHAR (ch);      \
 784         CHAR_STRING_ADVANCE (ch, dst);  \
 785         ch = (c2);                      \
 786         if (ch >= 0x80)                 \
 787           ch = BYTE8_TO_CHAR (ch);      \
 788         CHAR_STRING_ADVANCE (ch, dst);  \
 789       }                                 \
 790     else                                \
 791       {                                 \
 792         *dst++ = (c1);                  \
 793         *dst++ = (c2);                  \
 794       }                                 \
 795   } while (0)
 796
 797
 798 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 799   do {                                  \
 800     EMIT_ONE_BYTE (c1);                 \
 801     EMIT_TWO_BYTES (c2, c3);            \
 802   } while (0)
 803
 804
 805 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 806   do {                                          \
 807     EMIT_TWO_BYTES (c1, c2);                    \
 808     EMIT_TWO_BYTES (c3, c4);                    \
 809   } while (0)
 810
 811
 812 static void
 813 record_conversion_result (struct coding_system *coding,
 814                           enum coding_result_code result)
 815 {
 816   coding->result = result;
 817   switch (result)
 818     {
 819     case CODING_RESULT_INSUFFICIENT_SRC:
 820       Vlast_code_conversion_error = Qinsufficient_source;
 821       break;
 822     case CODING_RESULT_INCONSISTENT_EOL:
 823       Vlast_code_conversion_error = Qinconsistent_eol;
 824       break;
 825     case CODING_RESULT_INVALID_SRC:
 826       Vlast_code_conversion_error = Qinvalid_source;
 827       break;
 828     case CODING_RESULT_INTERRUPT:
 829       Vlast_code_conversion_error = Qinterrupted;
 830       break;
 831     case CODING_RESULT_INSUFFICIENT_MEM:
 832       Vlast_code_conversion_error = Qinsufficient_memory;
 833       break;
 834     case CODING_RESULT_INSUFFICIENT_DST:
 835       /* Don't record this error in Vlast_code_conversion_error
 836          because it happens just temporarily and is resolved when the
 837          whole conversion is finished.  */
 838       break;
 839     case CODING_RESULT_SUCCESS:
 840       break;
 841     default:
 842       Vlast_code_conversion_error = intern ("Unknown error");
 843     }
 844 }
 845
 846 /* These wrapper macros are used to preserve validity of pointers into
 847    buffer text across calls to decode_char, encode_char, etc, which
 848    could cause relocation of buffers if it loads a charset map,
 849    because loading a charset map allocates large structures.  */
 850
 851 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 852   do {                                                                       \
 853     ptrdiff_t offset;                                                        \
 854                                                                              \
 855     charset_map_loaded = 0;                                                  \
 856     c = DECODE_CHAR (charset, code);                                         \
 857     if (charset_map_loaded                                                   \
 858         && (offset = coding_change_source (coding)))                         \
 859       {                                                                      \
 860         src += offset;                                                       \
 861         src_base += offset;                                                  \
 862         src_end += offset;                                                   \
 863       }                                                                      \
 864   } while (0)
 865
 866 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     code = ENCODE_CHAR (charset, c);                                    \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     charset = char_charset (c, charset_list, code_return);              \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 895   do {                                                                  \
 896     ptrdiff_t offset;                                                   \
 897                                                                         \
 898     charset_map_loaded = 0;                                             \
 899     result = CHAR_CHARSET_P (c, charset);                               \
 900     if (charset_map_loaded                                              \
 901         && (offset = coding_change_destination (coding)))               \
 902       {                                                                 \
 903         dst += offset;                                                  \
 904         dst_end += offset;                                              \
 905       }                                                                 \
 906   } while (0)
 907
 908
 909 /* If there are at least BYTES length of room at dst, allocate memory
 910    for coding->destination and update dst and dst_end.  We don't have
 911    to take care of coding->source which will be relocated.  It is
 912    handled by calling coding_set_source in encode_coding.  */
 913
 914 #define ASSURE_DESTINATION(bytes)                               \
 915   do {                                                          \
 916     if (dst + (bytes) >= dst_end)                               \
 917       {                                                         \
 918         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 919                                                                 \
 920         dst = alloc_destination (coding, more_bytes, dst);      \
 921         dst_end = coding->destination + coding->dst_bytes;      \
 922       }                                                         \
 923   } while (0)
 924
 925
 926 /* Store multibyte form of the character C in P, and advance P to the
 927    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 928    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 929    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 930
 931 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 932
 933 /* Return the character code of character whose multibyte form is at
 934    P, and advance P to the end of the multibyte form.  This used to be
 935    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 936    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 937
 938 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 939
 940 /* Set coding->source from coding->src_object.  */
 941
 942 static void
 943 coding_set_source (struct coding_system *coding)
 944 {
 945   if (BUFFERP (coding->src_object))
 946     {
 947       struct buffer *buf = XBUFFER (coding->src_object);
 948
 949       if (coding->src_pos < 0)
 950         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 951       else
 952         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 953     }
 954   else if (STRINGP (coding->src_object))
 955     {
 956       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 957     }
 958   else
 959     {
 960       /* Otherwise, the source is C string and is never relocated
 961          automatically.  Thus we don't have to update anything.  */
 962     }
 963 }
 964
 965
 966 /* Set coding->source from coding->src_object, and return how many
 967    bytes coding->source was changed.  */
 968
 969 static ptrdiff_t
 970 coding_change_source (struct coding_system *coding)
 971 {
 972   const unsigned char *orig = coding->source;
 973   coding_set_source (coding);
 974   return coding->source - orig;
 975 }
 976
 977
 978 /* Set coding->destination from coding->dst_object.  */
 979
 980 static void
 981 coding_set_destination (struct coding_system *coding)
 982 {
 983   if (BUFFERP (coding->dst_object))
 984     {
 985       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 986         {
 987           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 988           coding->dst_bytes = (GAP_END_ADDR
 989                                - (coding->src_bytes - coding->consumed)
 990                                - coding->destination);
 991         }
 992       else
 993         {
 994           /* We are sure that coding->dst_pos_byte is before the gap
 995              of the buffer. */
 996           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 997                                  + coding->dst_pos_byte - BEG_BYTE);
 998           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 999                                - coding->destination);
1000         }
1001     }
1002   else
1003     {
1004       /* Otherwise, the destination is C string and is never relocated
1005          automatically.  Thus we don't have to update anything.  */
1006     }
1007 }
1008
1009
1010 /* Set coding->destination from coding->dst_object, and return how
1011    many bytes coding->destination was changed.  */
1012
1013 static ptrdiff_t
1014 coding_change_destination (struct coding_system *coding)
1015 {
1016   const unsigned char *orig = coding->destination;
1017   coding_set_destination (coding);
1018   return coding->destination - orig;
1019 }
1020
1021
1022 static void
1023 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1024 {
1025   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1026     string_overflow ();
1027   coding->destination = xrealloc (coding->destination,
1028                                   coding->dst_bytes + bytes);
1029   coding->dst_bytes += bytes;
1030 }
1031
1032 static void
1033 coding_alloc_by_making_gap (struct coding_system *coding,
1034                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1035 {
1036   if (EQ (coding->src_object, coding->dst_object))
1037     {
1038       /* The gap may contain the produced data at the head and not-yet
1039          consumed data at the tail.  To preserve those data, we at
1040          first make the gap size to zero, then increase the gap
1041          size.  */
1042       ptrdiff_t add = GAP_SIZE;
1043
1044       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1045       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1046       make_gap (bytes);
1047       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1048       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1049     }
1050   else
1051     {
1052       Lisp_Object this_buffer;
1053
1054       this_buffer = Fcurrent_buffer ();
1055       set_buffer_internal (XBUFFER (coding->dst_object));
1056       make_gap (bytes);
1057       set_buffer_internal (XBUFFER (this_buffer));
1058     }
1059 }
1060
1061
1062 static unsigned char *
1063 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1064                    unsigned char *dst)
1065 {
1066   ptrdiff_t offset = dst - coding->destination;
1067
1068   if (BUFFERP (coding->dst_object))
1069     {
1070       struct buffer *buf = XBUFFER (coding->dst_object);
1071
1072       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1073     }
1074   else
1075     coding_alloc_by_realloc (coding, nbytes);
1076   coding_set_destination (coding);
1077   dst = coding->destination + offset;
1078   return dst;
1079 }
1080
1081 /** Macros for annotations.  */
1082
1083 /* An annotation data is stored in the array coding->charbuf in this
1084    format:
1085      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1086    LENGTH is the number of elements in the annotation.
1087    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1088    NCHARS is the number of characters in the text annotated.
1089
1090    The format of the following elements depend on ANNOTATION_MASK.
1091
1092    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1093    follows:
1094      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1095
1096    NBYTES is the number of bytes specified in the header part of
1097    old-style emacs-mule encoding, or 0 for the other kind of
1098    composition.
1099
1100    METHOD is one of enum composition_method.
1101
1102    Optional COMPOSITION-COMPONENTS are characters and composition
1103    rules.
1104
1105    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1106    follows.
1107
1108    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1109    recover from an invalid annotation, and should be skipped by
1110    produce_annotation.  */
1111
1112 /* Maximum length of the header of annotation data.  */
1113 #define MAX_ANNOTATION_LENGTH 5
1114
1115 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1116   do {                                                  \
1117     *(buf)++ = -(len);                                  \
1118     *(buf)++ = (mask);                                  \
1119     *(buf)++ = (nchars);                                \
1120     coding->annotated = 1;                              \
1121   } while (0);
1122
1123 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1124   do {                                                                      \
1125     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1126     *buf++ = nbytes;                                                        \
1127     *buf++ = method;                                                        \
1128   } while (0)
1129
1130
1131 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1132   do {                                                                  \
1133     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1134     *buf++ = id;                                                        \
1135   } while (0)
1136
1137 \f
1138 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1139
1140
1141
1142 \f
1143 /*** 3. UTF-8 ***/
1144
1145 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1146    Return true if a text is encoded in UTF-8.  */
1147
1148 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1149 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1150 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1151 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1152 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1153 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1154
1155 #define UTF_8_BOM_1 0xEF
1156 #define UTF_8_BOM_2 0xBB
1157 #define UTF_8_BOM_3 0xBF
1158
1159 static bool
1160 detect_coding_utf_8 (struct coding_system *coding,
1161                      struct coding_detection_info *detect_info)
1162 {
1163   const unsigned char *src = coding->source, *src_base;
1164   const unsigned char *src_end = coding->source + coding->src_bytes;
1165   bool multibytep = coding->src_multibyte;
1166   ptrdiff_t consumed_chars = 0;
1167   bool bom_found = 0;
1168   bool found = 0;
1169
1170   detect_info->checked |= CATEGORY_MASK_UTF_8;
1171   /* A coding system of this category is always ASCII compatible.  */
1172   src += coding->head_ascii;
1173
1174   while (1)
1175     {
1176       int c, c1, c2, c3, c4;
1177
1178       src_base = src;
1179       ONE_MORE_BYTE (c);
1180       if (c < 0 || UTF_8_1_OCTET_P (c))
1181         continue;
1182       ONE_MORE_BYTE (c1);
1183       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1184         break;
1185       if (UTF_8_2_OCTET_LEADING_P (c))
1186         {
1187           found = 1;
1188           continue;
1189         }
1190       ONE_MORE_BYTE (c2);
1191       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1192         break;
1193       if (UTF_8_3_OCTET_LEADING_P (c))
1194         {
1195           found = 1;
1196           if (src_base == coding->source
1197               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1198             bom_found = 1;
1199           continue;
1200         }
1201       ONE_MORE_BYTE (c3);
1202       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1203         break;
1204       if (UTF_8_4_OCTET_LEADING_P (c))
1205         {
1206           found = 1;
1207           continue;
1208         }
1209       ONE_MORE_BYTE (c4);
1210       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1211         break;
1212       if (UTF_8_5_OCTET_LEADING_P (c))
1213         {
1214           found = 1;
1215           continue;
1216         }
1217       break;
1218     }
1219   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1220   return 0;
1221
1222  no_more_source:
1223   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1224     {
1225       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1226       return 0;
1227     }
1228   if (bom_found)
1229     {
1230       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1231       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1232     }
1233   else
1234     {
1235       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1236       if (found)
1237         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1238     }
1239   return 1;
1240 }
1241
1242
1243 static void
1244 decode_coding_utf_8 (struct coding_system *coding)
1245 {
1246   const unsigned char *src = coding->source + coding->consumed;
1247   const unsigned char *src_end = coding->source + coding->src_bytes;
1248   const unsigned char *src_base;
1249   int *charbuf = coding->charbuf + coding->charbuf_used;
1250   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1251   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1252   bool multibytep = coding->src_multibyte;
1253   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1254   bool eol_dos
1255     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1256   int byte_after_cr = -1;
1257
1258   if (bom != utf_without_bom)
1259     {
1260       int c1, c2, c3;
1261
1262       src_base = src;
1263       ONE_MORE_BYTE (c1);
1264       if (! UTF_8_3_OCTET_LEADING_P (c1))
1265         src = src_base;
1266       else
1267         {
1268           ONE_MORE_BYTE (c2);
1269           if (! UTF_8_EXTRA_OCTET_P (c2))
1270             src = src_base;
1271           else
1272             {
1273               ONE_MORE_BYTE (c3);
1274               if (! UTF_8_EXTRA_OCTET_P (c3))
1275                 src = src_base;
1276               else
1277                 {
1278                   if ((c1 != UTF_8_BOM_1)
1279                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1280                     src = src_base;
1281                   else
1282                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1283                 }
1284             }
1285         }
1286     }
1287   CODING_UTF_8_BOM (coding) = utf_without_bom;
1288
1289   while (1)
1290     {
1291       int c, c1, c2, c3, c4, c5;
1292
1293       src_base = src;
1294       consumed_chars_base = consumed_chars;
1295
1296       if (charbuf >= charbuf_end)
1297         {
1298           if (byte_after_cr >= 0)
1299             src_base--;
1300           break;
1301         }
1302
1303       if (byte_after_cr >= 0)
1304         c1 = byte_after_cr, byte_after_cr = -1;
1305       else
1306         ONE_MORE_BYTE (c1);
1307       if (c1 < 0)
1308         {
1309           c = - c1;
1310         }
1311       else if (UTF_8_1_OCTET_P (c1))
1312         {
1313           if (eol_dos && c1 == '\r')
1314             ONE_MORE_BYTE (byte_after_cr);
1315           c = c1;
1316         }
1317       else
1318         {
1319           ONE_MORE_BYTE (c2);
1320           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1321             goto invalid_code;
1322           if (UTF_8_2_OCTET_LEADING_P (c1))
1323             {
1324               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1325               /* Reject overlong sequences here and below.  Encoders
1326                  producing them are incorrect, they can be misleading,
1327                  and they mess up read/write invariance.  */
1328               if (c < 128)
1329                 goto invalid_code;
1330             }
1331           else
1332             {
1333               ONE_MORE_BYTE (c3);
1334               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1335                 goto invalid_code;
1336               if (UTF_8_3_OCTET_LEADING_P (c1))
1337                 {
1338                   c = (((c1 & 0xF) << 12)
1339                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1340                   if (c < 0x800
1341                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1342                     goto invalid_code;
1343                 }
1344               else
1345                 {
1346                   ONE_MORE_BYTE (c4);
1347                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1348                     goto invalid_code;
1349                   if (UTF_8_4_OCTET_LEADING_P (c1))
1350                     {
1351                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1352                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1353                     if (c < 0x10000)
1354                       goto invalid_code;
1355                     }
1356                   else
1357                     {
1358                       ONE_MORE_BYTE (c5);
1359                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1360                         goto invalid_code;
1361                       if (UTF_8_5_OCTET_LEADING_P (c1))
1362                         {
1363                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1364                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1365                                | (c5 & 0x3F));
1366                           if ((c > MAX_CHAR) || (c < 0x200000))
1367                             goto invalid_code;
1368                         }
1369                       else
1370                         goto invalid_code;
1371                     }
1372                 }
1373             }
1374         }
1375
1376       *charbuf++ = c;
1377       continue;
1378
1379     invalid_code:
1380       src = src_base;
1381       consumed_chars = consumed_chars_base;
1382       ONE_MORE_BYTE (c);
1383       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1384       coding->errors++;
1385     }
1386
1387  no_more_source:
1388   coding->consumed_char += consumed_chars_base;
1389   coding->consumed = src_base - coding->source;
1390   coding->charbuf_used = charbuf - coding->charbuf;
1391 }
1392
1393
1394 static bool
1395 encode_coding_utf_8 (struct coding_system *coding)
1396 {
1397   bool multibytep = coding->dst_multibyte;
1398   int *charbuf = coding->charbuf;
1399   int *charbuf_end = charbuf + coding->charbuf_used;
1400   unsigned char *dst = coding->destination + coding->produced;
1401   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1402   ptrdiff_t produced_chars = 0;
1403   int c;
1404
1405   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1406     {
1407       ASSURE_DESTINATION (3);
1408       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1409       CODING_UTF_8_BOM (coding) = utf_without_bom;
1410     }
1411
1412   if (multibytep)
1413     {
1414       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1415
1416       while (charbuf < charbuf_end)
1417         {
1418           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1419
1420           ASSURE_DESTINATION (safe_room);
1421           c = *charbuf++;
1422           if (CHAR_BYTE8_P (c))
1423             {
1424               c = CHAR_TO_BYTE8 (c);
1425               EMIT_ONE_BYTE (c);
1426             }
1427           else
1428             {
1429               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1430               for (p = str; p < pend; p++)
1431                 EMIT_ONE_BYTE (*p);
1432             }
1433         }
1434     }
1435   else
1436     {
1437       int safe_room = MAX_MULTIBYTE_LENGTH;
1438
1439       while (charbuf < charbuf_end)
1440         {
1441           ASSURE_DESTINATION (safe_room);
1442           c = *charbuf++;
1443           if (CHAR_BYTE8_P (c))
1444             *dst++ = CHAR_TO_BYTE8 (c);
1445           else
1446             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1447           produced_chars++;
1448         }
1449     }
1450   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1451   coding->produced_char += produced_chars;
1452   coding->produced = dst - coding->destination;
1453   return 0;
1454 }
1455
1456
1457 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1458    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1459
1460 #define UTF_16_HIGH_SURROGATE_P(val) \
1461   (((val) & 0xFC00) == 0xD800)
1462
1463 #define UTF_16_LOW_SURROGATE_P(val) \
1464   (((val) & 0xFC00) == 0xDC00)
1465
1466
1467 static bool
1468 detect_coding_utf_16 (struct coding_system *coding,
1469                       struct coding_detection_info *detect_info)
1470 {
1471   const unsigned char *src = coding->source;
1472   const unsigned char *src_end = coding->source + coding->src_bytes;
1473   bool multibytep = coding->src_multibyte;
1474   int c1, c2;
1475
1476   detect_info->checked |= CATEGORY_MASK_UTF_16;
1477   if (coding->mode & CODING_MODE_LAST_BLOCK
1478       && (coding->src_chars & 1))
1479     {
1480       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1481       return 0;
1482     }
1483
1484   TWO_MORE_BYTES (c1, c2);
1485   if ((c1 == 0xFF) && (c2 == 0xFE))
1486     {
1487       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1488                              | CATEGORY_MASK_UTF_16_AUTO);
1489       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1490                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1491                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1492     }
1493   else if ((c1 == 0xFE) && (c2 == 0xFF))
1494     {
1495       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1496                              | CATEGORY_MASK_UTF_16_AUTO);
1497       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1498                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1499                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1500     }
1501   else if (c2 < 0)
1502     {
1503       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1504       return 0;
1505     }
1506   else
1507     {
1508       /* We check the dispersion of Eth and Oth bytes where E is even and
1509          O is odd.  If both are high, we assume binary data.*/
1510       unsigned char e[256], o[256];
1511       unsigned e_num = 1, o_num = 1;
1512
1513       memset (e, 0, 256);
1514       memset (o, 0, 256);
1515       e[c1] = 1;
1516       o[c2] = 1;
1517
1518       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1519                                 |CATEGORY_MASK_UTF_16_BE
1520                                 | CATEGORY_MASK_UTF_16_LE);
1521
1522       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1523              != CATEGORY_MASK_UTF_16)
1524         {
1525           TWO_MORE_BYTES (c1, c2);
1526           if (c2 < 0)
1527             break;
1528           if (! e[c1])
1529             {
1530               e[c1] = 1;
1531               e_num++;
1532               if (e_num >= 128)
1533                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1534             }
1535           if (! o[c2])
1536             {
1537               o[c2] = 1;
1538               o_num++;
1539               if (o_num >= 128)
1540                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1541             }
1542         }
1543       return 0;
1544     }
1545
1546  no_more_source:
1547   return 1;
1548 }
1549
1550 static void
1551 decode_coding_utf_16 (struct coding_system *coding)
1552 {
1553   const unsigned char *src = coding->source + coding->consumed;
1554   const unsigned char *src_end = coding->source + coding->src_bytes;
1555   const unsigned char *src_base;
1556   int *charbuf = coding->charbuf + coding->charbuf_used;
1557   /* We may produces at most 3 chars in one loop.  */
1558   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1559   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1560   bool multibytep = coding->src_multibyte;
1561   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1562   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1563   int surrogate = CODING_UTF_16_SURROGATE (coding);
1564   bool eol_dos
1565     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1566   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1567
1568   if (bom == utf_with_bom)
1569     {
1570       int c, c1, c2;
1571
1572       src_base = src;
1573       ONE_MORE_BYTE (c1);
1574       ONE_MORE_BYTE (c2);
1575       c = (c1 << 8) | c2;
1576
1577       if (endian == utf_16_big_endian
1578           ? c != 0xFEFF : c != 0xFFFE)
1579         {
1580           /* The first two bytes are not BOM.  Treat them as bytes
1581              for a normal character.  */
1582           src = src_base;
1583           coding->errors++;
1584         }
1585       CODING_UTF_16_BOM (coding) = utf_without_bom;
1586     }
1587   else if (bom == utf_detect_bom)
1588     {
1589       /* We have already tried to detect BOM and failed in
1590          detect_coding.  */
1591       CODING_UTF_16_BOM (coding) = utf_without_bom;
1592     }
1593
1594   while (1)
1595     {
1596       int c, c1, c2;
1597
1598       src_base = src;
1599       consumed_chars_base = consumed_chars;
1600
1601       if (charbuf >= charbuf_end)
1602         {
1603           if (byte_after_cr1 >= 0)
1604             src_base -= 2;
1605           break;
1606         }
1607
1608       if (byte_after_cr1 >= 0)
1609         c1 = byte_after_cr1, byte_after_cr1 = -1;
1610       else
1611         ONE_MORE_BYTE (c1);
1612       if (c1 < 0)
1613         {
1614           *charbuf++ = -c1;
1615           continue;
1616         }
1617       if (byte_after_cr2 >= 0)
1618         c2 = byte_after_cr2, byte_after_cr2 = -1;
1619       else
1620         ONE_MORE_BYTE (c2);
1621       if (c2 < 0)
1622         {
1623           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1624           *charbuf++ = -c2;
1625           continue;
1626         }
1627       c = (endian == utf_16_big_endian
1628            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1629
1630       if (surrogate)
1631         {
1632           if (! UTF_16_LOW_SURROGATE_P (c))
1633             {
1634               if (endian == utf_16_big_endian)
1635                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1636               else
1637                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1638               *charbuf++ = c1;
1639               *charbuf++ = c2;
1640               coding->errors++;
1641               if (UTF_16_HIGH_SURROGATE_P (c))
1642                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1643               else
1644                 *charbuf++ = c;
1645             }
1646           else
1647             {
1648               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1649               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1650               *charbuf++ = 0x10000 + c;
1651             }
1652         }
1653       else
1654         {
1655           if (UTF_16_HIGH_SURROGATE_P (c))
1656             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1657           else
1658             {
1659               if (eol_dos && c == '\r')
1660                 {
1661                   ONE_MORE_BYTE (byte_after_cr1);
1662                   ONE_MORE_BYTE (byte_after_cr2);
1663                 }
1664               *charbuf++ = c;
1665             }
1666         }
1667     }
1668
1669  no_more_source:
1670   coding->consumed_char += consumed_chars_base;
1671   coding->consumed = src_base - coding->source;
1672   coding->charbuf_used = charbuf - coding->charbuf;
1673 }
1674
1675 static bool
1676 encode_coding_utf_16 (struct coding_system *coding)
1677 {
1678   bool multibytep = coding->dst_multibyte;
1679   int *charbuf = coding->charbuf;
1680   int *charbuf_end = charbuf + coding->charbuf_used;
1681   unsigned char *dst = coding->destination + coding->produced;
1682   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1683   int safe_room = 8;
1684   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1685   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1686   ptrdiff_t produced_chars = 0;
1687   int c;
1688
1689   if (bom != utf_without_bom)
1690     {
1691       ASSURE_DESTINATION (safe_room);
1692       if (big_endian)
1693         EMIT_TWO_BYTES (0xFE, 0xFF);
1694       else
1695         EMIT_TWO_BYTES (0xFF, 0xFE);
1696       CODING_UTF_16_BOM (coding) = utf_without_bom;
1697     }
1698
1699   while (charbuf < charbuf_end)
1700     {
1701       ASSURE_DESTINATION (safe_room);
1702       c = *charbuf++;
1703       if (c > MAX_UNICODE_CHAR)
1704         c = coding->default_char;
1705
1706       if (c < 0x10000)
1707         {
1708           if (big_endian)
1709             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1710           else
1711             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1712         }
1713       else
1714         {
1715           int c1, c2;
1716
1717           c -= 0x10000;
1718           c1 = (c >> 10) + 0xD800;
1719           c2 = (c & 0x3FF) + 0xDC00;
1720           if (big_endian)
1721             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1722           else
1723             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1724         }
1725     }
1726   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1727   coding->produced = dst - coding->destination;
1728   coding->produced_char += produced_chars;
1729   return 0;
1730 }
1731
1732 \f
1733 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1734
1735 /* Emacs' internal format for representation of multiple character
1736    sets is a kind of multi-byte encoding, i.e. characters are
1737    represented by variable-length sequences of one-byte codes.
1738
1739    ASCII characters and control characters (e.g. `tab', `newline') are
1740    represented by one-byte sequences which are their ASCII codes, in
1741    the range 0x00 through 0x7F.
1742
1743    8-bit characters of the range 0x80..0x9F are represented by
1744    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1745    code + 0x20).
1746
1747    8-bit characters of the range 0xA0..0xFF are represented by
1748    one-byte sequences which are their 8-bit code.
1749
1750    The other characters are represented by a sequence of `base
1751    leading-code', optional `extended leading-code', and one or two
1752    `position-code's.  The length of the sequence is determined by the
1753    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1754    whereas extended leading-code and position-code take the range 0xA0
1755    through 0xFF.  See `charset.h' for more details about leading-code
1756    and position-code.
1757
1758    --- CODE RANGE of Emacs' internal format ---
1759    character set        range
1760    -------------        -----
1761    ascii                0x00..0x7F
1762    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1763    eight-bit-graphic    0xA0..0xBF
1764    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1765    ---------------------------------------------
1766
1767    As this is the internal character representation, the format is
1768    usually not used externally (i.e. in a file or in a data sent to a
1769    process).  But, it is possible to have a text externally in this
1770    format (i.e. by encoding by the coding system `emacs-mule').
1771
1772    In that case, a sequence of one-byte codes has a slightly different
1773    form.
1774
1775    At first, all characters in eight-bit-control are represented by
1776    one-byte sequences which are their 8-bit code.
1777
1778    Next, character composition data are represented by the byte
1779    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1780    where,
1781         METHOD is 0xF2 plus one of composition method (enum
1782         composition_method),
1783
1784         BYTES is 0xA0 plus a byte length of this composition data,
1785
1786         CHARS is 0xA0 plus a number of characters composed by this
1787         data,
1788
1789         COMPONENTs are characters of multibyte form or composition
1790         rules encoded by two-byte of ASCII codes.
1791
1792    In addition, for backward compatibility, the following formats are
1793    also recognized as composition data on decoding.
1794
1795    0x80 MSEQ ...
1796    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1797
1798    Here,
1799         MSEQ is a multibyte form but in these special format:
1800           ASCII: 0xA0 ASCII_CODE+0x80,
1801           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1802         RULE is a one byte code of the range 0xA0..0xF0 that
1803         represents a composition rule.
1804   */
1805
1806 char emacs_mule_bytes[256];
1807
1808
1809 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1810    Return true if a text is encoded in 'emacs-mule'.  */
1811
1812 static bool
1813 detect_coding_emacs_mule (struct coding_system *coding,
1814                           struct coding_detection_info *detect_info)
1815 {
1816   const unsigned char *src = coding->source, *src_base;
1817   const unsigned char *src_end = coding->source + coding->src_bytes;
1818   bool multibytep = coding->src_multibyte;
1819   ptrdiff_t consumed_chars = 0;
1820   int c;
1821   int found = 0;
1822
1823   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1824   /* A coding system of this category is always ASCII compatible.  */
1825   src += coding->head_ascii;
1826
1827   while (1)
1828     {
1829       src_base = src;
1830       ONE_MORE_BYTE (c);
1831       if (c < 0)
1832         continue;
1833       if (c == 0x80)
1834         {
1835           /* Perhaps the start of composite character.  We simply skip
1836              it because analyzing it is too heavy for detecting.  But,
1837              at least, we check that the composite character
1838              constitutes of more than 4 bytes.  */
1839           const unsigned char *src_start;
1840
1841         repeat:
1842           src_start = src;
1843           do
1844             {
1845               ONE_MORE_BYTE (c);
1846             }
1847           while (c >= 0xA0);
1848
1849           if (src - src_start <= 4)
1850             break;
1851           found = CATEGORY_MASK_EMACS_MULE;
1852           if (c == 0x80)
1853             goto repeat;
1854         }
1855
1856       if (c < 0x80)
1857         {
1858           if (c < 0x20
1859               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1860             break;
1861         }
1862       else
1863         {
1864           int more_bytes = emacs_mule_bytes[c] - 1;
1865
1866           while (more_bytes > 0)
1867             {
1868               ONE_MORE_BYTE (c);
1869               if (c < 0xA0)
1870                 {
1871                   src--;        /* Unread the last byte.  */
1872                   break;
1873                 }
1874               more_bytes--;
1875             }
1876           if (more_bytes != 0)
1877             break;
1878           found = CATEGORY_MASK_EMACS_MULE;
1879         }
1880     }
1881   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1882   return 0;
1883
1884  no_more_source:
1885   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1886     {
1887       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1888       return 0;
1889     }
1890   detect_info->found |= found;
1891   return 1;
1892 }
1893
1894
1895 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1896    character.  If CMP_STATUS indicates that we must expect MSEQ or
1897    RULE described above, decode it and return the negative value of
1898    the decoded character or rule.  If an invalid byte is found, return
1899    -1.  If SRC is too short, return -2.  */
1900
1901 static int
1902 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1903                  int *nbytes, int *nchars, int *id,
1904                  struct composition_status *cmp_status)
1905 {
1906   const unsigned char *src_end = coding->source + coding->src_bytes;
1907   const unsigned char *src_base = src;
1908   bool multibytep = coding->src_multibyte;
1909   int charset_ID;
1910   unsigned code;
1911   int c;
1912   int consumed_chars = 0;
1913   bool mseq_found = 0;
1914
1915   ONE_MORE_BYTE (c);
1916   if (c < 0)
1917     {
1918       c = -c;
1919       charset_ID = emacs_mule_charset[0];
1920     }
1921   else
1922     {
1923       if (c >= 0xA0)
1924         {
1925           if (cmp_status->state != COMPOSING_NO
1926               && cmp_status->old_form)
1927             {
1928               if (cmp_status->state == COMPOSING_CHAR)
1929                 {
1930                   if (c == 0xA0)
1931                     {
1932                       ONE_MORE_BYTE (c);
1933                       c -= 0x80;
1934                       if (c < 0)
1935                         goto invalid_code;
1936                     }
1937                   else
1938                     c -= 0x20;
1939                   mseq_found = 1;
1940                 }
1941               else
1942                 {
1943                   *nbytes = src - src_base;
1944                   *nchars = consumed_chars;
1945                   return -c;
1946                 }
1947             }
1948           else
1949             goto invalid_code;
1950         }
1951
1952       switch (emacs_mule_bytes[c])
1953         {
1954         case 2:
1955           if ((charset_ID = emacs_mule_charset[c]) < 0)
1956             goto invalid_code;
1957           ONE_MORE_BYTE (c);
1958           if (c < 0xA0)
1959             goto invalid_code;
1960           code = c & 0x7F;
1961           break;
1962
1963         case 3:
1964           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1965               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1966             {
1967               ONE_MORE_BYTE (c);
1968               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1969                 goto invalid_code;
1970               ONE_MORE_BYTE (c);
1971               if (c < 0xA0)
1972                 goto invalid_code;
1973               code = c & 0x7F;
1974             }
1975           else
1976             {
1977               if ((charset_ID = emacs_mule_charset[c]) < 0)
1978                 goto invalid_code;
1979               ONE_MORE_BYTE (c);
1980               if (c < 0xA0)
1981                 goto invalid_code;
1982               code = (c & 0x7F) << 8;
1983               ONE_MORE_BYTE (c);
1984               if (c < 0xA0)
1985                 goto invalid_code;
1986               code |= c & 0x7F;
1987             }
1988           break;
1989
1990         case 4:
1991           ONE_MORE_BYTE (c);
1992           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1993             goto invalid_code;
1994           ONE_MORE_BYTE (c);
1995           if (c < 0xA0)
1996             goto invalid_code;
1997           code = (c & 0x7F) << 8;
1998           ONE_MORE_BYTE (c);
1999           if (c < 0xA0)
2000             goto invalid_code;
2001           code |= c & 0x7F;
2002           break;
2003
2004         case 1:
2005           code = c;
2006           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2007           break;
2008
2009         default:
2010           emacs_abort ();
2011         }
2012       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2013                           CHARSET_FROM_ID (charset_ID), code, c);
2014       if (c < 0)
2015         goto invalid_code;
2016     }
2017   *nbytes = src - src_base;
2018   *nchars = consumed_chars;
2019   if (id)
2020     *id = charset_ID;
2021   return (mseq_found ? -c : c);
2022
2023  no_more_source:
2024   return -2;
2025
2026  invalid_code:
2027   return -1;
2028 }
2029
2030
2031 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2032
2033 /* Handle these composition sequence ('|': the end of header elements,
2034    BYTES and CHARS >= 0xA0):
2035
2036    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2037    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2038    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2039
2040    and these old form:
2041
2042    (4) relative composition: 0x80 | MSEQ ... MSEQ
2043    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2044
2045    When the starter 0x80 and the following header elements are found,
2046    this annotation header is produced.
2047
2048         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2049
2050    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2051    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2052
2053    Then, upon reading the following elements, these codes are produced
2054    until the composition end is found:
2055
2056    (1) CHAR ... CHAR
2057    (2) ALT ... ALT CHAR ... CHAR
2058    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2059    (4) CHAR ... CHAR
2060    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2061
2062    When the composition end is found, LENGTH and NCHARS in the
2063    annotation header is updated as below:
2064
2065    (1) LENGTH: unchanged, NCHARS: unchanged
2066    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2067    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2068    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2069    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2070
2071    If an error is found while composing, the annotation header is
2072    changed to the original composition header (plus filler -1s) as
2073    below:
2074
2075    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2076    (5)          [ 0x80 0xFF -1 -1- -1 ]
2077
2078    and the sequence [ -2 DECODED-RULE ] is changed to the original
2079    byte sequence as below:
2080         o the original byte sequence is B: [ B -1 ]
2081         o the original byte sequence is B1 B2: [ B1 B2 ]
2082
2083    Most of the routines are implemented by macros because many
2084    variables and labels in the caller decode_coding_emacs_mule must be
2085    accessible, and they are usually called just once (thus doesn't
2086    increase the size of compiled object).  */
2087
2088 /* Decode a composition rule represented by C as a component of
2089    composition sequence of Emacs 20 style.  Set RULE to the decoded
2090    rule. */
2091
2092 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2093   do {                                                  \
2094     int gref, nref;                                     \
2095                                                         \
2096     c -= 0xA0;                                          \
2097     if (c < 0 || c >= 81)                               \
2098       goto invalid_code;                                \
2099     gref = c / 9, nref = c % 9;                         \
2100     if (gref == 4) gref = 10;                           \
2101     if (nref == 4) nref = 10;                           \
2102     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2103   } while (0)
2104
2105
2106 /* Decode a composition rule represented by C and the following byte
2107    at SRC as a component of composition sequence of Emacs 21 style.
2108    Set RULE to the decoded rule.  */
2109
2110 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2111   do {                                                  \
2112     int gref, nref;                                     \
2113                                                         \
2114     gref = c - 0x20;                                    \
2115     if (gref < 0 || gref >= 81)                         \
2116       goto invalid_code;                                \
2117     ONE_MORE_BYTE (c);                                  \
2118     nref = c - 0x20;                                    \
2119     if (nref < 0 || nref >= 81)                         \
2120       goto invalid_code;                                \
2121     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2122   } while (0)
2123
2124
2125 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2126    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2127    byte length of this composition information, CHARS is the number of
2128    characters composed by this composition.  */
2129
2130 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2131   do {                                                                  \
2132     enum composition_method method = c - 0xF2;                          \
2133     int nbytes, nchars;                                                 \
2134                                                                         \
2135     ONE_MORE_BYTE (c);                                                  \
2136     if (c < 0)                                                          \
2137       goto invalid_code;                                                \
2138     nbytes = c - 0xA0;                                                  \
2139     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2140       goto invalid_code;                                                \
2141     ONE_MORE_BYTE (c);                                                  \
2142     nchars = c - 0xA0;                                                  \
2143     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2144       goto invalid_code;                                                \
2145     cmp_status->old_form = 0;                                           \
2146     cmp_status->method = method;                                        \
2147     if (method == COMPOSITION_RELATIVE)                                 \
2148       cmp_status->state = COMPOSING_CHAR;                               \
2149     else                                                                \
2150       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2151     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2152     cmp_status->nchars = nchars;                                        \
2153     cmp_status->ncomps = nbytes - 4;                                    \
2154     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2155   } while (0)
2156
2157
2158 /* Start of Emacs 20 style format for relative composition.  */
2159
2160 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2161   do {                                                          \
2162     cmp_status->old_form = 1;                                   \
2163     cmp_status->method = COMPOSITION_RELATIVE;                  \
2164     cmp_status->state = COMPOSING_CHAR;                         \
2165     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2166     cmp_status->nchars = cmp_status->ncomps = 0;                \
2167     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2168   } while (0)
2169
2170
2171 /* Start of Emacs 20 style format for rule-base composition.  */
2172
2173 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2174   do {                                                          \
2175     cmp_status->old_form = 1;                                   \
2176     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2177     cmp_status->state = COMPOSING_CHAR;                         \
2178     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2179     cmp_status->nchars = cmp_status->ncomps = 0;                \
2180     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2181   } while (0)
2182
2183
2184 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2185   do {                                                  \
2186     const unsigned char *current_src = src;             \
2187                                                         \
2188     ONE_MORE_BYTE (c);                                  \
2189     if (c < 0)                                          \
2190       goto invalid_code;                                \
2191     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2192         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2193       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2194     else if (c < 0xA0)                                  \
2195       goto invalid_code;                                \
2196     else if (c < 0xC0)                                  \
2197       {                                                 \
2198         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2199         /* Re-read C as a composition component.  */    \
2200         src = current_src;                              \
2201       }                                                 \
2202     else if (c == 0xFF)                                 \
2203       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2204     else                                                \
2205       goto invalid_code;                                \
2206   } while (0)
2207
2208 #define EMACS_MULE_COMPOSITION_END()                            \
2209   do {                                                          \
2210     int idx = - cmp_status->length;                             \
2211                                                                 \
2212     if (cmp_status->old_form)                                   \
2213       charbuf[idx + 2] = cmp_status->nchars;                    \
2214     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2215       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2216     cmp_status->state = COMPOSING_NO;                           \
2217   } while (0)
2218
2219
2220 static int
2221 emacs_mule_finish_composition (int *charbuf,
2222                                struct composition_status *cmp_status)
2223 {
2224   int idx = - cmp_status->length;
2225   int new_chars;
2226
2227   if (cmp_status->old_form && cmp_status->nchars > 0)
2228     {
2229       charbuf[idx + 2] = cmp_status->nchars;
2230       new_chars = 0;
2231       if (cmp_status->method == COMPOSITION_WITH_RULE
2232           && cmp_status->state == COMPOSING_CHAR)
2233         {
2234           /* The last rule was invalid.  */
2235           int rule = charbuf[-1] + 0xA0;
2236
2237           charbuf[-2] = BYTE8_TO_CHAR (rule);
2238           charbuf[-1] = -1;
2239           new_chars = 1;
2240         }
2241     }
2242   else
2243     {
2244       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2245
2246       if (cmp_status->method == COMPOSITION_WITH_RULE)
2247         {
2248           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2249           charbuf[idx++] = -3;
2250           charbuf[idx++] = 0;
2251           new_chars = 1;
2252         }
2253       else
2254         {
2255           int nchars = charbuf[idx + 1] + 0xA0;
2256           int nbytes = charbuf[idx + 2] + 0xA0;
2257
2258           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2259           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2260           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2261           charbuf[idx++] = -1;
2262           new_chars = 4;
2263         }
2264     }
2265   cmp_status->state = COMPOSING_NO;
2266   return new_chars;
2267 }
2268
2269 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2270   do {                                                                    \
2271     if (cmp_status->state != COMPOSING_NO)                                \
2272       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2273   } while (0)
2274
2275
2276 static void
2277 decode_coding_emacs_mule (struct coding_system *coding)
2278 {
2279   const unsigned char *src = coding->source + coding->consumed;
2280   const unsigned char *src_end = coding->source + coding->src_bytes;
2281   const unsigned char *src_base;
2282   int *charbuf = coding->charbuf + coding->charbuf_used;
2283   /* We may produce two annotations (charset and composition) in one
2284      loop and one more charset annotation at the end.  */
2285   int *charbuf_end
2286     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2287       /* We can produce up to 2 characters in a loop.  */
2288       - 1;
2289   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2290   bool multibytep = coding->src_multibyte;
2291   ptrdiff_t char_offset = coding->produced_char;
2292   ptrdiff_t last_offset = char_offset;
2293   int last_id = charset_ascii;
2294   bool eol_dos
2295     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2296   int byte_after_cr = -1;
2297   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2298
2299   if (cmp_status->state != COMPOSING_NO)
2300     {
2301       int i;
2302
2303       if (charbuf_end - charbuf < cmp_status->length)
2304         emacs_abort ();
2305       for (i = 0; i < cmp_status->length; i++)
2306         *charbuf++ = cmp_status->carryover[i];
2307       coding->annotated = 1;
2308     }
2309
2310   while (1)
2311     {
2312       int c, id IF_LINT (= 0);
2313
2314       src_base = src;
2315       consumed_chars_base = consumed_chars;
2316
2317       if (charbuf >= charbuf_end)
2318         {
2319           if (byte_after_cr >= 0)
2320             src_base--;
2321           break;
2322         }
2323
2324       if (byte_after_cr >= 0)
2325         c = byte_after_cr, byte_after_cr = -1;
2326       else
2327         ONE_MORE_BYTE (c);
2328
2329       if (c < 0 || c == 0x80)
2330         {
2331           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2332           if (c < 0)
2333             {
2334               *charbuf++ = -c;
2335               char_offset++;
2336             }
2337           else
2338             DECODE_EMACS_MULE_COMPOSITION_START ();
2339           continue;
2340         }
2341
2342       if (c < 0x80)
2343         {
2344           if (eol_dos && c == '\r')
2345             ONE_MORE_BYTE (byte_after_cr);
2346           id = charset_ascii;
2347           if (cmp_status->state != COMPOSING_NO)
2348             {
2349               if (cmp_status->old_form)
2350                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2351               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2352                 cmp_status->ncomps--;
2353             }
2354         }
2355       else
2356         {
2357           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2358           /* emacs_mule_char can load a charset map from a file, which
2359              allocates a large structure and might cause buffer text
2360              to be relocated as result.  Thus, we need to remember the
2361              original pointer to buffer text, and fix up all related
2362              pointers after the call.  */
2363           const unsigned char *orig = coding->source;
2364           ptrdiff_t offset;
2365
2366           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2367                                cmp_status);
2368           offset = coding->source - orig;
2369           if (offset)
2370             {
2371               src += offset;
2372               src_base += offset;
2373               src_end += offset;
2374             }
2375           if (c < 0)
2376             {
2377               if (c == -1)
2378                 goto invalid_code;
2379               if (c == -2)
2380                 break;
2381             }
2382           src = src_base + nbytes;
2383           consumed_chars = consumed_chars_base + nchars;
2384           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2385             cmp_status->ncomps -= nchars;
2386         }
2387
2388       /* Now if C >= 0, we found a normally encoded character, if C <
2389          0, we found an old-style composition component character or
2390          rule.  */
2391
2392       if (cmp_status->state == COMPOSING_NO)
2393         {
2394           if (last_id != id)
2395             {
2396               if (last_id != charset_ascii)
2397                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2398                                   last_id);
2399               last_id = id;
2400               last_offset = char_offset;
2401             }
2402           *charbuf++ = c;
2403           char_offset++;
2404         }
2405       else if (cmp_status->state == COMPOSING_CHAR)
2406         {
2407           if (cmp_status->old_form)
2408             {
2409               if (c >= 0)
2410                 {
2411                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2412                   *charbuf++ = c;
2413                   char_offset++;
2414                 }
2415               else
2416                 {
2417                   *charbuf++ = -c;
2418                   cmp_status->nchars++;
2419                   cmp_status->length++;
2420                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2421                     EMACS_MULE_COMPOSITION_END ();
2422                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2423                     cmp_status->state = COMPOSING_RULE;
2424                 }
2425             }
2426           else
2427             {
2428               *charbuf++ = c;
2429               cmp_status->length++;
2430               cmp_status->nchars--;
2431               if (cmp_status->nchars == 0)
2432                 EMACS_MULE_COMPOSITION_END ();
2433             }
2434         }
2435       else if (cmp_status->state == COMPOSING_RULE)
2436         {
2437           int rule;
2438
2439           if (c >= 0)
2440             {
2441               EMACS_MULE_COMPOSITION_END ();
2442               *charbuf++ = c;
2443               char_offset++;
2444             }
2445           else
2446             {
2447               c = -c;
2448               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2449               if (rule < 0)
2450                 goto invalid_code;
2451               *charbuf++ = -2;
2452               *charbuf++ = rule;
2453               cmp_status->length += 2;
2454               cmp_status->state = COMPOSING_CHAR;
2455             }
2456         }
2457       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2458         {
2459           *charbuf++ = c;
2460           cmp_status->length++;
2461           if (cmp_status->ncomps == 0)
2462             cmp_status->state = COMPOSING_CHAR;
2463           else if (cmp_status->ncomps > 0)
2464             {
2465               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2466                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2467             }
2468           else
2469             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2470         }
2471       else                      /* COMPOSING_COMPONENT_RULE */
2472         {
2473           int rule;
2474
2475           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2476           if (rule < 0)
2477             goto invalid_code;
2478           *charbuf++ = -2;
2479           *charbuf++ = rule;
2480           cmp_status->length += 2;
2481           cmp_status->ncomps--;
2482           if (cmp_status->ncomps > 0)
2483             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2484           else
2485             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2486         }
2487       continue;
2488
2489     invalid_code:
2490       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2491       src = src_base;
2492       consumed_chars = consumed_chars_base;
2493       ONE_MORE_BYTE (c);
2494       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2495       char_offset++;
2496       coding->errors++;
2497     }
2498
2499  no_more_source:
2500   if (cmp_status->state != COMPOSING_NO)
2501     {
2502       if (coding->mode & CODING_MODE_LAST_BLOCK)
2503         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2504       else
2505         {
2506           int i;
2507
2508           charbuf -= cmp_status->length;
2509           for (i = 0; i < cmp_status->length; i++)
2510             cmp_status->carryover[i] = charbuf[i];
2511         }
2512     }
2513   if (last_id != charset_ascii)
2514     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2515   coding->consumed_char += consumed_chars_base;
2516   coding->consumed = src_base - coding->source;
2517   coding->charbuf_used = charbuf - coding->charbuf;
2518 }
2519
2520
2521 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2522   do {                                          \
2523     if (id < 0xA0)                              \
2524       codes[0] = id, codes[1] = 0;              \
2525     else if (id < 0xE0)                         \
2526       codes[0] = 0x9A, codes[1] = id;           \
2527     else if (id < 0xF0)                         \
2528       codes[0] = 0x9B, codes[1] = id;           \
2529     else if (id < 0xF5)                         \
2530       codes[0] = 0x9C, codes[1] = id;           \
2531     else                                        \
2532       codes[0] = 0x9D, codes[1] = id;           \
2533   } while (0);
2534
2535
2536 static bool
2537 encode_coding_emacs_mule (struct coding_system *coding)
2538 {
2539   bool multibytep = coding->dst_multibyte;
2540   int *charbuf = coding->charbuf;
2541   int *charbuf_end = charbuf + coding->charbuf_used;
2542   unsigned char *dst = coding->destination + coding->produced;
2543   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2544   int safe_room = 8;
2545   ptrdiff_t produced_chars = 0;
2546   Lisp_Object attrs, charset_list;
2547   int c;
2548   int preferred_charset_id = -1;
2549
2550   CODING_GET_INFO (coding, attrs, charset_list);
2551   if (! EQ (charset_list, Vemacs_mule_charset_list))
2552     {
2553       charset_list = Vemacs_mule_charset_list;
2554       ASET (attrs, coding_attr_charset_list, charset_list);
2555     }
2556
2557   while (charbuf < charbuf_end)
2558     {
2559       ASSURE_DESTINATION (safe_room);
2560       c = *charbuf++;
2561
2562       if (c < 0)
2563         {
2564           /* Handle an annotation.  */
2565           switch (*charbuf)
2566             {
2567             case CODING_ANNOTATE_COMPOSITION_MASK:
2568               /* Not yet implemented.  */
2569               break;
2570             case CODING_ANNOTATE_CHARSET_MASK:
2571               preferred_charset_id = charbuf[3];
2572               if (preferred_charset_id >= 0
2573                   && NILP (Fmemq (make_number (preferred_charset_id),
2574                                   charset_list)))
2575                 preferred_charset_id = -1;
2576               break;
2577             default:
2578               emacs_abort ();
2579             }
2580           charbuf += -c - 1;
2581           continue;
2582         }
2583
2584       if (ASCII_CHAR_P (c))
2585         EMIT_ONE_ASCII_BYTE (c);
2586       else if (CHAR_BYTE8_P (c))
2587         {
2588           c = CHAR_TO_BYTE8 (c);
2589           EMIT_ONE_BYTE (c);
2590         }
2591       else
2592         {
2593           struct charset *charset;
2594           unsigned code;
2595           int dimension;
2596           int emacs_mule_id;
2597           unsigned char leading_codes[2];
2598
2599           if (preferred_charset_id >= 0)
2600             {
2601               bool result;
2602
2603               charset = CHARSET_FROM_ID (preferred_charset_id);
2604               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2605               if (result)
2606                 code = ENCODE_CHAR (charset, c);
2607               else
2608                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2609                                      &code, charset);
2610             }
2611           else
2612             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2613                                  &code, charset);
2614           if (! charset)
2615             {
2616               c = coding->default_char;
2617               if (ASCII_CHAR_P (c))
2618                 {
2619                   EMIT_ONE_ASCII_BYTE (c);
2620                   continue;
2621                 }
2622               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2623                                    &code, charset);
2624             }
2625           dimension = CHARSET_DIMENSION (charset);
2626           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2627           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2628           EMIT_ONE_BYTE (leading_codes[0]);
2629           if (leading_codes[1])
2630             EMIT_ONE_BYTE (leading_codes[1]);
2631           if (dimension == 1)
2632             EMIT_ONE_BYTE (code | 0x80);
2633           else
2634             {
2635               code |= 0x8080;
2636               EMIT_ONE_BYTE (code >> 8);
2637               EMIT_ONE_BYTE (code & 0xFF);
2638             }
2639         }
2640     }
2641   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2642   coding->produced_char += produced_chars;
2643   coding->produced = dst - coding->destination;
2644   return 0;
2645 }
2646
2647 \f
2648 /*** 7. ISO2022 handlers ***/
2649
2650 /* The following note describes the coding system ISO2022 briefly.
2651    Since the intention of this note is to help understand the
2652    functions in this file, some parts are NOT ACCURATE or are OVERLY
2653    SIMPLIFIED.  For thorough understanding, please refer to the
2654    original document of ISO2022.  This is equivalent to the standard
2655    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2656
2657    ISO2022 provides many mechanisms to encode several character sets
2658    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2659    is encoded using bytes less than 128.  This may make the encoded
2660    text a little bit longer, but the text passes more easily through
2661    several types of gateway, some of which strip off the MSB (Most
2662    Significant Bit).
2663
2664    There are two kinds of character sets: control character sets and
2665    graphic character sets.  The former contain control characters such
2666    as `newline' and `escape' to provide control functions (control
2667    functions are also provided by escape sequences).  The latter
2668    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2669    two control character sets and many graphic character sets.
2670
2671    Graphic character sets are classified into one of the following
2672    four classes, according to the number of bytes (DIMENSION) and
2673    number of characters in one dimension (CHARS) of the set:
2674    - DIMENSION1_CHARS94
2675    - DIMENSION1_CHARS96
2676    - DIMENSION2_CHARS94
2677    - DIMENSION2_CHARS96
2678
2679    In addition, each character set is assigned an identification tag,
2680    unique for each set, called the "final character" (denoted as <F>
2681    hereafter).  The <F> of each character set is decided by ECMA(*)
2682    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2683    (0x30..0x3F are for private use only).
2684
2685    Note (*): ECMA = European Computer Manufacturers Association
2686
2687    Here are examples of graphic character sets [NAME(<F>)]:
2688         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2689         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2690         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2691         o DIMENSION2_CHARS96 -- none for the moment
2692
2693    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2694         C0 [0x00..0x1F] -- control character plane 0
2695         GL [0x20..0x7F] -- graphic character plane 0
2696         C1 [0x80..0x9F] -- control character plane 1
2697         GR [0xA0..0xFF] -- graphic character plane 1
2698
2699    A control character set is directly designated and invoked to C0 or
2700    C1 by an escape sequence.  The most common case is that:
2701    - ISO646's  control character set is designated/invoked to C0, and
2702    - ISO6429's control character set is designated/invoked to C1,
2703    and usually these designations/invocations are omitted in encoded
2704    text.  In a 7-bit environment, only C0 can be used, and a control
2705    character for C1 is encoded by an appropriate escape sequence to
2706    fit into the environment.  All control characters for C1 are
2707    defined to have corresponding escape sequences.
2708
2709    A graphic character set is at first designated to one of four
2710    graphic registers (G0 through G3), then these graphic registers are
2711    invoked to GL or GR.  These designations and invocations can be
2712    done independently.  The most common case is that G0 is invoked to
2713    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2714    these invocations and designations are omitted in encoded text.
2715    In a 7-bit environment, only GL can be used.
2716
2717    When a graphic character set of CHARS94 is invoked to GL, codes
2718    0x20 and 0x7F of the GL area work as control characters SPACE and
2719    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2720    be used.
2721
2722    There are two ways of invocation: locking-shift and single-shift.
2723    With locking-shift, the invocation lasts until the next different
2724    invocation, whereas with single-shift, the invocation affects the
2725    following character only and doesn't affect the locking-shift
2726    state.  Invocations are done by the following control characters or
2727    escape sequences:
2728
2729    ----------------------------------------------------------------------
2730    abbrev  function                  cntrl escape seq   description
2731    ----------------------------------------------------------------------
2732    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2733    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2734    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2735    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2736    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2737    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2738    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2739    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2740    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2741    ----------------------------------------------------------------------
2742    (*) These are not used by any known coding system.
2743
2744    Control characters for these functions are defined by macros
2745    ISO_CODE_XXX in `coding.h'.
2746
2747    Designations are done by the following escape sequences:
2748    ----------------------------------------------------------------------
2749    escape sequence      description
2750    ----------------------------------------------------------------------
2751    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2752    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2753    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2754    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2755    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2756    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2757    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2758    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2759    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2760    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2761    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2762    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2763    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2764    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2765    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2766    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2767    ----------------------------------------------------------------------
2768
2769    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2770    of dimension 1, chars 94, and final character <F>, etc...
2771
2772    Note (*): Although these designations are not allowed in ISO2022,
2773    Emacs accepts them on decoding, and produces them on encoding
2774    CHARS96 character sets in a coding system which is characterized as
2775    7-bit environment, non-locking-shift, and non-single-shift.
2776
2777    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2778    '(' must be omitted.  We refer to this as "short-form" hereafter.
2779
2780    Now you may notice that there are a lot of ways of encoding the
2781    same multilingual text in ISO2022.  Actually, there exist many
2782    coding systems such as Compound Text (used in X11's inter client
2783    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2784    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2785    localized platforms), and all of these are variants of ISO2022.
2786
2787    In addition to the above, Emacs handles two more kinds of escape
2788    sequences: ISO6429's direction specification and Emacs' private
2789    sequence for specifying character composition.
2790
2791    ISO6429's direction specification takes the following form:
2792         o CSI ']'      -- end of the current direction
2793         o CSI '0' ']'  -- end of the current direction
2794         o CSI '1' ']'  -- start of left-to-right text
2795         o CSI '2' ']'  -- start of right-to-left text
2796    The control character CSI (0x9B: control sequence introducer) is
2797    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2798
2799    Character composition specification takes the following form:
2800         o ESC '0' -- start relative composition
2801         o ESC '1' -- end composition
2802         o ESC '2' -- start rule-base composition (*)
2803         o ESC '3' -- start relative composition with alternate chars  (**)
2804         o ESC '4' -- start rule-base composition with alternate chars  (**)
2805   Since these are not standard escape sequences of any ISO standard,
2806   the use of them with these meanings is restricted to Emacs only.
2807
2808   (*) This form is used only in Emacs 20.7 and older versions,
2809   but newer versions can safely decode it.
2810   (**) This form is used only in Emacs 21.1 and newer versions,
2811   and older versions can't decode it.
2812
2813   Here's a list of example usages of these composition escape
2814   sequences (categorized by `enum composition_method').
2815
2816   COMPOSITION_RELATIVE:
2817         ESC 0 CHAR [ CHAR ] ESC 1
2818   COMPOSITION_WITH_RULE:
2819         ESC 2 CHAR [ RULE CHAR ] ESC 1
2820   COMPOSITION_WITH_ALTCHARS:
2821         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2822   COMPOSITION_WITH_RULE_ALTCHARS:
2823         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2824
2825 static enum iso_code_class_type iso_code_class[256];
2826
2827 #define SAFE_CHARSET_P(coding, id)      \
2828   ((id) <= (coding)->max_charset_id     \
2829    && (coding)->safe_charsets[id] != 255)
2830
2831 static void
2832 setup_iso_safe_charsets (Lisp_Object attrs)
2833 {
2834   Lisp_Object charset_list, safe_charsets;
2835   Lisp_Object request;
2836   Lisp_Object reg_usage;
2837   Lisp_Object tail;
2838   EMACS_INT reg94, reg96;
2839   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2840   int max_charset_id;
2841
2842   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2843   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2844       && ! EQ (charset_list, Viso_2022_charset_list))
2845     {
2846       charset_list = Viso_2022_charset_list;
2847       ASET (attrs, coding_attr_charset_list, charset_list);
2848       ASET (attrs, coding_attr_safe_charsets, Qnil);
2849     }
2850
2851   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2852     return;
2853
2854   max_charset_id = 0;
2855   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2856     {
2857       int id = XINT (XCAR (tail));
2858       if (max_charset_id < id)
2859         max_charset_id = id;
2860     }
2861
2862   safe_charsets = make_uninit_string (max_charset_id + 1);
2863   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2864   request = AREF (attrs, coding_attr_iso_request);
2865   reg_usage = AREF (attrs, coding_attr_iso_usage);
2866   reg94 = XINT (XCAR (reg_usage));
2867   reg96 = XINT (XCDR (reg_usage));
2868
2869   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2870     {
2871       Lisp_Object id;
2872       Lisp_Object reg;
2873       struct charset *charset;
2874
2875       id = XCAR (tail);
2876       charset = CHARSET_FROM_ID (XINT (id));
2877       reg = Fcdr (Fassq (id, request));
2878       if (! NILP (reg))
2879         SSET (safe_charsets, XINT (id), XINT (reg));
2880       else if (charset->iso_chars_96)
2881         {
2882           if (reg96 < 4)
2883             SSET (safe_charsets, XINT (id), reg96);
2884         }
2885       else
2886         {
2887           if (reg94 < 4)
2888             SSET (safe_charsets, XINT (id), reg94);
2889         }
2890     }
2891   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2892 }
2893
2894
2895 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2896    Return true if a text is encoded in one of ISO-2022 based coding
2897    systems.  */
2898
2899 static bool
2900 detect_coding_iso_2022 (struct coding_system *coding,
2901                         struct coding_detection_info *detect_info)
2902 {
2903   const unsigned char *src = coding->source, *src_base = src;
2904   const unsigned char *src_end = coding->source + coding->src_bytes;
2905   bool multibytep = coding->src_multibyte;
2906   bool single_shifting = 0;
2907   int id;
2908   int c, c1;
2909   ptrdiff_t consumed_chars = 0;
2910   int i;
2911   int rejected = 0;
2912   int found = 0;
2913   int composition_count = -1;
2914
2915   detect_info->checked |= CATEGORY_MASK_ISO;
2916
2917   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2918     {
2919       struct coding_system *this = &(coding_categories[i]);
2920       Lisp_Object attrs, val;
2921
2922       if (this->id < 0)
2923         continue;
2924       attrs = CODING_ID_ATTRS (this->id);
2925       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2926           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2927         setup_iso_safe_charsets (attrs);
2928       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2929       this->max_charset_id = SCHARS (val) - 1;
2930       this->safe_charsets = SDATA (val);
2931     }
2932
2933   /* A coding system of this category is always ASCII compatible.  */
2934   src += coding->head_ascii;
2935
2936   while (rejected != CATEGORY_MASK_ISO)
2937     {
2938       src_base = src;
2939       ONE_MORE_BYTE (c);
2940       switch (c)
2941         {
2942         case ISO_CODE_ESC:
2943           if (inhibit_iso_escape_detection)
2944             break;
2945           single_shifting = 0;
2946           ONE_MORE_BYTE (c);
2947           if (c == 'N' || c == 'O')
2948             {
2949               /* ESC <Fe> for SS2 or SS3.  */
2950               single_shifting = 1;
2951               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2952             }
2953           else if (c == '1')
2954             {
2955               /* End of composition.  */
2956               if (composition_count < 0
2957                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2958                 /* Invalid */
2959                 break;
2960               composition_count = -1;
2961               found |= CATEGORY_MASK_ISO;
2962             }
2963           else if (c >= '0' && c <= '4')
2964             {
2965               /* ESC <Fp> for start/end composition.  */
2966               composition_count = 0;
2967             }
2968           else
2969             {
2970               if (c >= '(' && c <= '/')
2971                 {
2972                   /* Designation sequence for a charset of dimension 1.  */
2973                   ONE_MORE_BYTE (c1);
2974                   if (c1 < ' ' || c1 >= 0x80
2975                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2976                     /* Invalid designation sequence.  Just ignore.  */
2977                     break;
2978                 }
2979               else if (c == '$')
2980                 {
2981                   /* Designation sequence for a charset of dimension 2.  */
2982                   ONE_MORE_BYTE (c);
2983                   if (c >= '@' && c <= 'B')
2984                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2985                     id = iso_charset_table[1][0][c];
2986                   else if (c >= '(' && c <= '/')
2987                     {
2988                       ONE_MORE_BYTE (c1);
2989                       if (c1 < ' ' || c1 >= 0x80
2990                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2991                         /* Invalid designation sequence.  Just ignore.  */
2992                         break;
2993                     }
2994                   else
2995                     /* Invalid designation sequence.  Just ignore it.  */
2996                     break;
2997                 }
2998               else
2999                 {
3000                   /* Invalid escape sequence.  Just ignore it.  */
3001                   break;
3002                 }
3003
3004               /* We found a valid designation sequence for CHARSET.  */
3005               rejected |= CATEGORY_MASK_ISO_8BIT;
3006               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3007                                   id))
3008                 found |= CATEGORY_MASK_ISO_7;
3009               else
3010                 rejected |= CATEGORY_MASK_ISO_7;
3011               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3012                                   id))
3013                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3014               else
3015                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3016               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3017                                   id))
3018                 found |= CATEGORY_MASK_ISO_7_ELSE;
3019               else
3020                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3021               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3022                                   id))
3023                 found |= CATEGORY_MASK_ISO_8_ELSE;
3024               else
3025                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3026             }
3027           break;
3028
3029         case ISO_CODE_SO:
3030         case ISO_CODE_SI:
3031           /* Locking shift out/in.  */
3032           if (inhibit_iso_escape_detection)
3033             break;
3034           single_shifting = 0;
3035           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3036           break;
3037
3038         case ISO_CODE_CSI:
3039           /* Control sequence introducer.  */
3040           single_shifting = 0;
3041           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3042           found |= CATEGORY_MASK_ISO_8_ELSE;
3043           goto check_extra_latin;
3044
3045         case ISO_CODE_SS2:
3046         case ISO_CODE_SS3:
3047           /* Single shift.   */
3048           if (inhibit_iso_escape_detection)
3049             break;
3050           single_shifting = 0;
3051           rejected |= CATEGORY_MASK_ISO_7BIT;
3052           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3053               & CODING_ISO_FLAG_SINGLE_SHIFT)
3054             {
3055               found |= CATEGORY_MASK_ISO_8_1;
3056               single_shifting = 1;
3057             }
3058           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3059               & CODING_ISO_FLAG_SINGLE_SHIFT)
3060             {
3061               found |= CATEGORY_MASK_ISO_8_2;
3062               single_shifting = 1;
3063             }
3064           if (single_shifting)
3065             break;
3066         check_extra_latin:
3067           if (! VECTORP (Vlatin_extra_code_table)
3068               || NILP (AREF (Vlatin_extra_code_table, c)))
3069             {
3070               rejected = CATEGORY_MASK_ISO;
3071               break;
3072             }
3073           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3074               & CODING_ISO_FLAG_LATIN_EXTRA)
3075             found |= CATEGORY_MASK_ISO_8_1;
3076           else
3077             rejected |= CATEGORY_MASK_ISO_8_1;
3078           rejected |= CATEGORY_MASK_ISO_8_2;
3079           break;
3080
3081         default:
3082           if (c < 0)
3083             continue;
3084           if (c < 0x80)
3085             {
3086               if (composition_count >= 0)
3087                 composition_count++;
3088               single_shifting = 0;
3089               break;
3090             }
3091           if (c >= 0xA0)
3092             {
3093               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3094               found |= CATEGORY_MASK_ISO_8_1;
3095               /* Check the length of succeeding codes of the range
3096                  0xA0..0FF.  If the byte length is even, we include
3097                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3098                  only when we are not single shifting.  */
3099               if (! single_shifting
3100                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3101                 {
3102                   int len = 1;
3103                   while (src < src_end)
3104                     {
3105                       src_base = src;
3106                       ONE_MORE_BYTE (c);
3107                       if (c < 0xA0)
3108                         {
3109                           src = src_base;
3110                           break;
3111                         }
3112                       len++;
3113                     }
3114
3115                   if (len & 1 && src < src_end)
3116                     {
3117                       rejected |= CATEGORY_MASK_ISO_8_2;
3118                       if (composition_count >= 0)
3119                         composition_count += len;
3120                     }
3121                   else
3122                     {
3123                       found |= CATEGORY_MASK_ISO_8_2;
3124                       if (composition_count >= 0)
3125                         composition_count += len / 2;
3126                     }
3127                 }
3128               break;
3129             }
3130         }
3131     }
3132   detect_info->rejected |= CATEGORY_MASK_ISO;
3133   return 0;
3134
3135  no_more_source:
3136   detect_info->rejected |= rejected;
3137   detect_info->found |= (found & ~rejected);
3138   return 1;
3139 }
3140
3141
3142 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3143    escape sequence should be kept.  */
3144 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3145   do {                                                                  \
3146     int id, prev;                                                       \
3147                                                                         \
3148     if (final < '0' || final >= 128                                     \
3149         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3150         || !SAFE_CHARSET_P (coding, id))                                \
3151       {                                                                 \
3152         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3153         chars_96 = -1;                                                  \
3154         break;                                                          \
3155       }                                                                 \
3156     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3157     if (id == charset_jisx0201_roman)                                   \
3158       {                                                                 \
3159         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3160           id = charset_ascii;                                           \
3161       }                                                                 \
3162     else if (id == charset_jisx0208_1978)                               \
3163       {                                                                 \
3164         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3165           id = charset_jisx0208;                                        \
3166       }                                                                 \
3167     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3168     /* If there was an invalid designation to REG previously, and this  \
3169        designation is ASCII to REG, we should keep this designation     \
3170        sequence.  */                                                    \
3171     if (prev == -2 && id == charset_ascii)                              \
3172       chars_96 = -1;                                                    \
3173   } while (0)
3174
3175
3176 /* Handle these composition sequence (ALT: alternate char):
3177
3178    (1) relative composition: ESC 0 CHAR ... ESC 1
3179    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3180    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3181    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3182
3183    When the start sequence (ESC 0/2/3/4) is found, this annotation
3184    header is produced.
3185
3186         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3187
3188    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3189    produced until the end sequence (ESC 1) is found:
3190
3191    (1) CHAR ... CHAR
3192    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3193    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3194    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3195
3196    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3197    annotation header is updated as below:
3198
3199    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3200    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3201    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3202    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3203
3204    If an error is found while composing, the annotation header is
3205    changed to:
3206
3207         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3208
3209    and the sequence [ -2 DECODED-RULE ] is changed to the original
3210    byte sequence as below:
3211         o the original byte sequence is B: [ B -1 ]
3212         o the original byte sequence is B1 B2: [ B1 B2 ]
3213    and the sequence [ -1 -1 ] is changed to the original byte
3214    sequence:
3215         [ ESC '0' ]
3216 */
3217
3218 /* Decode a composition rule C1 and maybe one more byte from the
3219    source, and set RULE to the encoded composition rule.  If the rule
3220    is invalid, goto invalid_code.  */
3221
3222 #define DECODE_COMPOSITION_RULE(rule)                                   \
3223   do {                                                                  \
3224     rule = c1 - 32;                                                     \
3225     if (rule < 0)                                                       \
3226       goto invalid_code;                                                \
3227     if (rule < 81)              /* old format (before ver.21) */        \
3228       {                                                                 \
3229         int gref = (rule) / 9;                                          \
3230         int nref = (rule) % 9;                                          \
3231         if (gref == 4) gref = 10;                                       \
3232         if (nref == 4) nref = 10;                                       \
3233         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3234       }                                                                 \
3235     else                        /* new format (after ver.21) */         \
3236       {                                                                 \
3237         int b;                                                          \
3238                                                                         \
3239         ONE_MORE_BYTE (b);                                              \
3240         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3241           goto invalid_code;                                            \
3242         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3243         rule += 0x100;   /* Distinguish it from the old format.  */     \
3244       }                                                                 \
3245   } while (0)
3246
3247 #define ENCODE_COMPOSITION_RULE(rule)                           \
3248   do {                                                          \
3249     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3250                                                                 \
3251     if (rule < 0x100)           /* old format */                \
3252       {                                                         \
3253         if (gref == 10) gref = 4;                               \
3254         if (nref == 10) nref = 4;                               \
3255         charbuf[idx] = 32 + gref * 9 + nref;                    \
3256         charbuf[idx + 1] = -1;                                  \
3257         new_chars++;                                            \
3258       }                                                         \
3259     else                                /* new format */        \
3260       {                                                         \
3261         charbuf[idx] = 32 + 81 + gref;                          \
3262         charbuf[idx + 1] = 32 + nref;                           \
3263         new_chars += 2;                                         \
3264       }                                                         \
3265   } while (0)
3266
3267 /* Finish the current composition as invalid.  */
3268
3269 static int
3270 finish_composition (int *charbuf, struct composition_status *cmp_status)
3271 {
3272   int idx = - cmp_status->length;
3273   int new_chars;
3274
3275   /* Recover the original ESC sequence */
3276   charbuf[idx++] = ISO_CODE_ESC;
3277   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3278                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3279                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3280                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3281                     : '4');
3282   charbuf[idx++] = -2;
3283   charbuf[idx++] = 0;
3284   charbuf[idx++] = -1;
3285   new_chars = cmp_status->nchars;
3286   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3287     for (; idx < 0; idx++)
3288       {
3289         int elt = charbuf[idx];
3290
3291         if (elt == -2)
3292           {
3293             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3294             idx++;
3295           }
3296         else if (elt == -1)
3297           {
3298             charbuf[idx++] = ISO_CODE_ESC;
3299             charbuf[idx] = '0';
3300             new_chars += 2;
3301           }
3302       }
3303   cmp_status->state = COMPOSING_NO;
3304   return new_chars;
3305 }
3306
3307 /* If characters are under composition, finish the composition.  */
3308 #define MAYBE_FINISH_COMPOSITION()                              \
3309   do {                                                          \
3310     if (cmp_status->state != COMPOSING_NO)                      \
3311       char_offset += finish_composition (charbuf, cmp_status);  \
3312   } while (0)
3313
3314 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3315
3316    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3317    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3318    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3319    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3320
3321    Produce this annotation sequence now:
3322
3323    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3324 */
3325
3326 #define DECODE_COMPOSITION_START(c1)                                       \
3327   do {                                                                     \
3328     if (c1 == '0'                                                          \
3329         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3330              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3331             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3332                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3333       {                                                                    \
3334         *charbuf++ = -1;                                                   \
3335         *charbuf++= -1;                                                    \
3336         cmp_status->state = COMPOSING_CHAR;                                \
3337         cmp_status->length += 2;                                           \
3338       }                                                                    \
3339     else                                                                   \
3340       {                                                                    \
3341         MAYBE_FINISH_COMPOSITION ();                                       \
3342         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3343                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3344                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3345                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3346         cmp_status->state                                                  \
3347           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3348         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3349         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3350         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3351         coding->annotated = 1;                                             \
3352       }                                                                    \
3353   } while (0)
3354
3355
3356 /* Handle composition end sequence ESC 1.  */
3357
3358 #define DECODE_COMPOSITION_END()                                        \
3359   do {                                                                  \
3360     if (cmp_status->nchars == 0                                         \
3361         || ((cmp_status->state == COMPOSING_CHAR)                       \
3362             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3363       {                                                                 \
3364         MAYBE_FINISH_COMPOSITION ();                                    \
3365         goto invalid_code;                                              \
3366       }                                                                 \
3367     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3368       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3369     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3370       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3371     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3372     char_offset += cmp_status->nchars;                                  \
3373     cmp_status->state = COMPOSING_NO;                                   \
3374   } while (0)
3375
3376 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3377
3378 #define STORE_COMPOSITION_RULE(rule)    \
3379   do {                                  \
3380     *charbuf++ = -2;                    \
3381     *charbuf++ = rule;                  \
3382     cmp_status->length += 2;            \
3383     cmp_status->state--;                \
3384   } while (0)
3385
3386 /* Store a composed char or a component char C in charbuf, and update
3387    cmp_status.  */
3388
3389 #define STORE_COMPOSITION_CHAR(c)                                       \
3390   do {                                                                  \
3391     *charbuf++ = (c);                                                   \
3392     cmp_status->length++;                                               \
3393     if (cmp_status->state == COMPOSING_CHAR)                            \
3394       cmp_status->nchars++;                                             \
3395     else                                                                \
3396       cmp_status->ncomps++;                                             \
3397     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3398         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3399             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3400       cmp_status->state++;                                              \
3401   } while (0)
3402
3403
3404 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3405
3406 static void
3407 decode_coding_iso_2022 (struct coding_system *coding)
3408 {
3409   const unsigned char *src = coding->source + coding->consumed;
3410   const unsigned char *src_end = coding->source + coding->src_bytes;
3411   const unsigned char *src_base;
3412   int *charbuf = coding->charbuf + coding->charbuf_used;
3413   /* We may produce two annotations (charset and composition) in one
3414      loop and one more charset annotation at the end.  */
3415   int *charbuf_end
3416     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3417   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3418   bool multibytep = coding->src_multibyte;
3419   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3420   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3421   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3422   int charset_id_2, charset_id_3;
3423   struct charset *charset;
3424   int c;
3425   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3426   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3427   ptrdiff_t char_offset = coding->produced_char;
3428   ptrdiff_t last_offset = char_offset;
3429   int last_id = charset_ascii;
3430   bool eol_dos
3431     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3432   int byte_after_cr = -1;
3433   int i;
3434
3435   setup_iso_safe_charsets (attrs);
3436   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3437
3438   if (cmp_status->state != COMPOSING_NO)
3439     {
3440       if (charbuf_end - charbuf < cmp_status->length)
3441         emacs_abort ();
3442       for (i = 0; i < cmp_status->length; i++)
3443         *charbuf++ = cmp_status->carryover[i];
3444       coding->annotated = 1;
3445     }
3446
3447   while (1)
3448     {
3449       int c1, c2, c3;
3450
3451       src_base = src;
3452       consumed_chars_base = consumed_chars;
3453
3454       if (charbuf >= charbuf_end)
3455         {
3456           if (byte_after_cr >= 0)
3457             src_base--;
3458           break;
3459         }
3460
3461       if (byte_after_cr >= 0)
3462         c1 = byte_after_cr, byte_after_cr = -1;
3463       else
3464         ONE_MORE_BYTE (c1);
3465       if (c1 < 0)
3466         goto invalid_code;
3467
3468       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3469         {
3470           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3471           char_offset++;
3472           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3473           continue;
3474         }
3475
3476       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3477         {
3478           if (c1 == ISO_CODE_ESC)
3479             {
3480               if (src + 1 >= src_end)
3481                 goto no_more_source;
3482               *charbuf++ = ISO_CODE_ESC;
3483               char_offset++;
3484               if (src[0] == '%' && src[1] == '@')
3485                 {
3486                   src += 2;
3487                   consumed_chars += 2;
3488                   char_offset += 2;
3489                   /* We are sure charbuf can contain two more chars. */
3490                   *charbuf++ = '%';
3491                   *charbuf++ = '@';
3492                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3493                 }
3494             }
3495           else
3496             {
3497               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3498               char_offset++;
3499             }
3500           continue;
3501         }
3502
3503       if ((cmp_status->state == COMPOSING_RULE
3504            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3505           && c1 != ISO_CODE_ESC)
3506         {
3507           int rule;
3508
3509           DECODE_COMPOSITION_RULE (rule);
3510           STORE_COMPOSITION_RULE (rule);
3511           continue;
3512         }
3513
3514       /* We produce at most one character.  */
3515       switch (iso_code_class [c1])
3516         {
3517         case ISO_0x20_or_0x7F:
3518           if (charset_id_0 < 0
3519               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3520             /* This is SPACE or DEL.  */
3521             charset = CHARSET_FROM_ID (charset_ascii);
3522           else
3523             charset = CHARSET_FROM_ID (charset_id_0);
3524           break;
3525
3526         case ISO_graphic_plane_0:
3527           if (charset_id_0 < 0)
3528             charset = CHARSET_FROM_ID (charset_ascii);
3529           else
3530             charset = CHARSET_FROM_ID (charset_id_0);
3531           break;
3532
3533         case ISO_0xA0_or_0xFF:
3534           if (charset_id_1 < 0
3535               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3536               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3537             goto invalid_code;
3538           /* This is a graphic character, we fall down ... */
3539
3540         case ISO_graphic_plane_1:
3541           if (charset_id_1 < 0)
3542             goto invalid_code;
3543           charset = CHARSET_FROM_ID (charset_id_1);
3544           break;
3545
3546         case ISO_control_0:
3547           if (eol_dos && c1 == '\r')
3548             ONE_MORE_BYTE (byte_after_cr);
3549           MAYBE_FINISH_COMPOSITION ();
3550           charset = CHARSET_FROM_ID (charset_ascii);
3551           break;
3552
3553         case ISO_control_1:
3554           goto invalid_code;
3555
3556         case ISO_shift_out:
3557           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3558               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3559             goto invalid_code;
3560           CODING_ISO_INVOCATION (coding, 0) = 1;
3561           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3562           continue;
3563
3564         case ISO_shift_in:
3565           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3566             goto invalid_code;
3567           CODING_ISO_INVOCATION (coding, 0) = 0;
3568           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3569           continue;
3570
3571         case ISO_single_shift_2_7:
3572           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3573             goto invalid_code;
3574         case ISO_single_shift_2:
3575           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3576             goto invalid_code;
3577           /* SS2 is handled as an escape sequence of ESC 'N' */
3578           c1 = 'N';
3579           goto label_escape_sequence;
3580
3581         case ISO_single_shift_3:
3582           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3583             goto invalid_code;
3584           /* SS2 is handled as an escape sequence of ESC 'O' */
3585           c1 = 'O';
3586           goto label_escape_sequence;
3587
3588         case ISO_control_sequence_introducer:
3589           /* CSI is handled as an escape sequence of ESC '[' ...  */
3590           c1 = '[';
3591           goto label_escape_sequence;
3592
3593         case ISO_escape:
3594           ONE_MORE_BYTE (c1);
3595         label_escape_sequence:
3596           /* Escape sequences handled here are invocation,
3597              designation, direction specification, and character
3598              composition specification.  */
3599           switch (c1)
3600             {
3601             case '&':           /* revision of following character set */
3602               ONE_MORE_BYTE (c1);
3603               if (!(c1 >= '@' && c1 <= '~'))
3604                 goto invalid_code;
3605               ONE_MORE_BYTE (c1);
3606               if (c1 != ISO_CODE_ESC)
3607                 goto invalid_code;
3608               ONE_MORE_BYTE (c1);
3609               goto label_escape_sequence;
3610
3611             case '$':           /* designation of 2-byte character set */
3612               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3613                 goto invalid_code;
3614               {
3615                 int reg, chars96;
3616
3617                 ONE_MORE_BYTE (c1);
3618                 if (c1 >= '@' && c1 <= 'B')
3619                   {     /* designation of JISX0208.1978, GB2312.1980,
3620                            or JISX0208.1980 */
3621                     reg = 0, chars96 = 0;
3622                   }
3623                 else if (c1 >= 0x28 && c1 <= 0x2B)
3624                   { /* designation of DIMENSION2_CHARS94 character set */
3625                     reg = c1 - 0x28, chars96 = 0;
3626                     ONE_MORE_BYTE (c1);
3627                   }
3628                 else if (c1 >= 0x2C && c1 <= 0x2F)
3629                   { /* designation of DIMENSION2_CHARS96 character set */
3630                     reg = c1 - 0x2C, chars96 = 1;
3631                     ONE_MORE_BYTE (c1);
3632                   }
3633                 else
3634                   goto invalid_code;
3635                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3636                 /* We must update these variables now.  */
3637                 if (reg == 0)
3638                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3639                 else if (reg == 1)
3640                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3641                 if (chars96 < 0)
3642                   goto invalid_code;
3643               }
3644               continue;
3645
3646             case 'n':           /* invocation of locking-shift-2 */
3647               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3648                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3649                 goto invalid_code;
3650               CODING_ISO_INVOCATION (coding, 0) = 2;
3651               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3652               continue;
3653
3654             case 'o':           /* invocation of locking-shift-3 */
3655               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3656                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3657                 goto invalid_code;
3658               CODING_ISO_INVOCATION (coding, 0) = 3;
3659               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3660               continue;
3661
3662             case 'N':           /* invocation of single-shift-2 */
3663               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3664                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3665                 goto invalid_code;
3666               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3667               if (charset_id_2 < 0)
3668                 charset = CHARSET_FROM_ID (charset_ascii);
3669               else
3670                 charset = CHARSET_FROM_ID (charset_id_2);
3671               ONE_MORE_BYTE (c1);
3672               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3673                 goto invalid_code;
3674               break;
3675
3676             case 'O':           /* invocation of single-shift-3 */
3677               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3678                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3679                 goto invalid_code;
3680               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3681               if (charset_id_3 < 0)
3682                 charset = CHARSET_FROM_ID (charset_ascii);
3683               else
3684                 charset = CHARSET_FROM_ID (charset_id_3);
3685               ONE_MORE_BYTE (c1);
3686               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3687                 goto invalid_code;
3688               break;
3689
3690             case '0': case '2': case '3': case '4': /* start composition */
3691               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3692                 goto invalid_code;
3693               if (last_id != charset_ascii)
3694                 {
3695                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3696                   last_id = charset_ascii;
3697                   last_offset = char_offset;
3698                 }
3699               DECODE_COMPOSITION_START (c1);
3700               continue;
3701
3702             case '1':           /* end composition */
3703               if (cmp_status->state == COMPOSING_NO)
3704                 goto invalid_code;
3705               DECODE_COMPOSITION_END ();
3706               continue;
3707
3708             case '[':           /* specification of direction */
3709               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3710                 goto invalid_code;
3711               /* For the moment, nested direction is not supported.
3712                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3713                  left-to-right, and nonzero means right-to-left.  */
3714               ONE_MORE_BYTE (c1);
3715               switch (c1)
3716                 {
3717                 case ']':       /* end of the current direction */
3718                   coding->mode &= ~CODING_MODE_DIRECTION;
3719
3720                 case '0':       /* end of the current direction */
3721                 case '1':       /* start of left-to-right direction */
3722                   ONE_MORE_BYTE (c1);
3723                   if (c1 == ']')
3724                     coding->mode &= ~CODING_MODE_DIRECTION;
3725                   else
3726                     goto invalid_code;
3727                   break;
3728
3729                 case '2':       /* start of right-to-left direction */
3730                   ONE_MORE_BYTE (c1);
3731                   if (c1 == ']')
3732                     coding->mode |= CODING_MODE_DIRECTION;
3733                   else
3734                     goto invalid_code;
3735                   break;
3736
3737                 default:
3738                   goto invalid_code;
3739                 }
3740               continue;
3741
3742             case '%':
3743               ONE_MORE_BYTE (c1);
3744               if (c1 == '/')
3745                 {
3746                   /* CTEXT extended segment:
3747                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3748                      We keep these bytes as is for the moment.
3749                      They may be decoded by post-read-conversion.  */
3750                   int dim, M, L;
3751                   int size;
3752
3753                   ONE_MORE_BYTE (dim);
3754                   if (dim < '0' || dim > '4')
3755                     goto invalid_code;
3756                   ONE_MORE_BYTE (M);
3757                   if (M < 128)
3758                     goto invalid_code;
3759                   ONE_MORE_BYTE (L);
3760                   if (L < 128)
3761                     goto invalid_code;
3762                   size = ((M - 128) * 128) + (L - 128);
3763                   if (charbuf + 6 > charbuf_end)
3764                     goto break_loop;
3765                   *charbuf++ = ISO_CODE_ESC;
3766                   *charbuf++ = '%';
3767                   *charbuf++ = '/';
3768                   *charbuf++ = dim;
3769                   *charbuf++ = BYTE8_TO_CHAR (M);
3770                   *charbuf++ = BYTE8_TO_CHAR (L);
3771                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3772                 }
3773               else if (c1 == 'G')
3774                 {
3775                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3776                      ESC % G --UTF-8-BYTES-- ESC % @
3777                      We keep these bytes as is for the moment.
3778                      They may be decoded by post-read-conversion.  */
3779                   if (charbuf + 3 > charbuf_end)
3780                     goto break_loop;
3781                   *charbuf++ = ISO_CODE_ESC;
3782                   *charbuf++ = '%';
3783                   *charbuf++ = 'G';
3784                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3785                 }
3786               else
3787                 goto invalid_code;
3788               continue;
3789               break;
3790
3791             default:
3792               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3793                 goto invalid_code;
3794               {
3795                 int reg, chars96;
3796
3797                 if (c1 >= 0x28 && c1 <= 0x2B)
3798                   { /* designation of DIMENSION1_CHARS94 character set */
3799                     reg = c1 - 0x28, chars96 = 0;
3800                     ONE_MORE_BYTE (c1);
3801                   }
3802                 else if (c1 >= 0x2C && c1 <= 0x2F)
3803                   { /* designation of DIMENSION1_CHARS96 character set */
3804                     reg = c1 - 0x2C, chars96 = 1;
3805                     ONE_MORE_BYTE (c1);
3806                   }
3807                 else
3808                   goto invalid_code;
3809                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3810                 /* We must update these variables now.  */
3811                 if (reg == 0)
3812                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3813                 else if (reg == 1)
3814                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3815                 if (chars96 < 0)
3816                   goto invalid_code;
3817               }
3818               continue;
3819             }
3820           break;
3821
3822         default:
3823           emacs_abort ();
3824         }
3825
3826       if (cmp_status->state == COMPOSING_NO
3827           && charset->id != charset_ascii
3828           && last_id != charset->id)
3829         {
3830           if (last_id != charset_ascii)
3831             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3832           last_id = charset->id;
3833           last_offset = char_offset;
3834         }
3835
3836       /* Now we know CHARSET and 1st position code C1 of a character.
3837          Produce a decoded character while getting 2nd and 3rd
3838          position codes C2, C3 if necessary.  */
3839       if (CHARSET_DIMENSION (charset) > 1)
3840         {
3841           ONE_MORE_BYTE (c2);
3842           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3843               || ((c1 & 0x80) != (c2 & 0x80)))
3844             /* C2 is not in a valid range.  */
3845             goto invalid_code;
3846           if (CHARSET_DIMENSION (charset) == 2)
3847             c1 = (c1 << 8) | c2;
3848           else
3849             {
3850               ONE_MORE_BYTE (c3);
3851               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3852                   || ((c1 & 0x80) != (c3 & 0x80)))
3853                 /* C3 is not in a valid range.  */
3854                 goto invalid_code;
3855               c1 = (c1 << 16) | (c2 << 8) | c2;
3856             }
3857         }
3858       c1 &= 0x7F7F7F;
3859       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3860       if (c < 0)
3861         {
3862           MAYBE_FINISH_COMPOSITION ();
3863           for (; src_base < src; src_base++, char_offset++)
3864             {
3865               if (ASCII_BYTE_P (*src_base))
3866                 *charbuf++ = *src_base;
3867               else
3868                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3869             }
3870         }
3871       else if (cmp_status->state == COMPOSING_NO)
3872         {
3873           *charbuf++ = c;
3874           char_offset++;
3875         }
3876       else if ((cmp_status->state == COMPOSING_CHAR
3877                 ? cmp_status->nchars
3878                 : cmp_status->ncomps)
3879                >= MAX_COMPOSITION_COMPONENTS)
3880         {
3881           /* Too long composition.  */
3882           MAYBE_FINISH_COMPOSITION ();
3883           *charbuf++ = c;
3884           char_offset++;
3885         }
3886       else
3887         STORE_COMPOSITION_CHAR (c);
3888       continue;
3889
3890     invalid_code:
3891       MAYBE_FINISH_COMPOSITION ();
3892       src = src_base;
3893       consumed_chars = consumed_chars_base;
3894       ONE_MORE_BYTE (c);
3895       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3896       char_offset++;
3897       coding->errors++;
3898       continue;
3899
3900     break_loop:
3901       break;
3902     }
3903
3904  no_more_source:
3905   if (cmp_status->state != COMPOSING_NO)
3906     {
3907       if (coding->mode & CODING_MODE_LAST_BLOCK)
3908         MAYBE_FINISH_COMPOSITION ();
3909       else
3910         {
3911           charbuf -= cmp_status->length;
3912           for (i = 0; i < cmp_status->length; i++)
3913             cmp_status->carryover[i] = charbuf[i];
3914         }
3915     }
3916   else if (last_id != charset_ascii)
3917     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3918   coding->consumed_char += consumed_chars_base;
3919   coding->consumed = src_base - coding->source;
3920   coding->charbuf_used = charbuf - coding->charbuf;
3921 }
3922
3923
3924 /* ISO2022 encoding stuff.  */
3925
3926 /*
3927    It is not enough to say just "ISO2022" on encoding, we have to
3928    specify more details.  In Emacs, each coding system of ISO2022
3929    variant has the following specifications:
3930         1. Initial designation to G0 thru G3.
3931         2. Allows short-form designation?
3932         3. ASCII should be designated to G0 before control characters?
3933         4. ASCII should be designated to G0 at end of line?
3934         5. 7-bit environment or 8-bit environment?
3935         6. Use locking-shift?
3936         7. Use Single-shift?
3937    And the following two are only for Japanese:
3938         8. Use ASCII in place of JIS0201-1976-Roman?
3939         9. Use JISX0208-1983 in place of JISX0208-1978?
3940    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3941    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3942    details.
3943 */
3944
3945 /* Produce codes (escape sequence) for designating CHARSET to graphic
3946    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3947    '@', 'A', or 'B' and the coding system CODING allows, produce
3948    designation sequence of short-form.  */
3949
3950 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3951   do {                                                                  \
3952     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3953     const char *intermediate_char_94 = "()*+";                          \
3954     const char *intermediate_char_96 = ",-./";                          \
3955     int revision = -1;                                                  \
3956                                                                         \
3957     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3958       revision = CHARSET_ISO_REVISION (charset);                        \
3959                                                                         \
3960     if (revision >= 0)                                                  \
3961       {                                                                 \
3962         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3963         EMIT_ONE_BYTE ('@' + revision);                                 \
3964       }                                                                 \
3965     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3966     if (CHARSET_DIMENSION (charset) == 1)                               \
3967       {                                                                 \
3968         int b;                                                          \
3969         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3970           b = intermediate_char_94[reg];                                \
3971         else                                                            \
3972           b = intermediate_char_96[reg];                                \
3973         EMIT_ONE_ASCII_BYTE (b);                                        \
3974       }                                                                 \
3975     else                                                                \
3976       {                                                                 \
3977         EMIT_ONE_ASCII_BYTE ('$');                                      \
3978         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3979           {                                                             \
3980             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3981                 || reg != 0                                             \
3982                 || final_char < '@' || final_char > 'B')                \
3983               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3984           }                                                             \
3985         else                                                            \
3986           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3987       }                                                                 \
3988     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3989                                                                         \
3990     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3991   } while (0)
3992
3993
3994 /* The following two macros produce codes (control character or escape
3995    sequence) for ISO2022 single-shift functions (single-shift-2 and
3996    single-shift-3).  */
3997
3998 #define ENCODE_SINGLE_SHIFT_2                                           \
3999   do {                                                                  \
4000     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4001       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4002     else                                                                \
4003       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4004     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4005   } while (0)
4006
4007
4008 #define ENCODE_SINGLE_SHIFT_3                                           \
4009   do {                                                                  \
4010     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4011       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4012     else                                                                \
4013       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4014     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4015   } while (0)
4016
4017
4018 /* The following four macros produce codes (control character or
4019    escape sequence) for ISO2022 locking-shift functions (shift-in,
4020    shift-out, locking-shift-2, and locking-shift-3).  */
4021
4022 #define ENCODE_SHIFT_IN                                 \
4023   do {                                                  \
4024     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4025     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4026   } while (0)
4027
4028
4029 #define ENCODE_SHIFT_OUT                                \
4030   do {                                                  \
4031     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4032     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4033   } while (0)
4034
4035
4036 #define ENCODE_LOCKING_SHIFT_2                          \
4037   do {                                                  \
4038     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4039     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4040   } while (0)
4041
4042
4043 #define ENCODE_LOCKING_SHIFT_3                          \
4044   do {                                                  \
4045     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4046     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4047   } while (0)
4048
4049
4050 /* Produce codes for a DIMENSION1 character whose character set is
4051    CHARSET and whose position-code is C1.  Designation and invocation
4052    sequences are also produced in advance if necessary.  */
4053
4054 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4055   do {                                                                  \
4056     int id = CHARSET_ID (charset);                                      \
4057                                                                         \
4058     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4059         && id == charset_ascii)                                         \
4060       {                                                                 \
4061         id = charset_jisx0201_roman;                                    \
4062         charset = CHARSET_FROM_ID (id);                                 \
4063       }                                                                 \
4064                                                                         \
4065     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4066       {                                                                 \
4067         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4068           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4069         else                                                            \
4070           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4071         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4072         break;                                                          \
4073       }                                                                 \
4074     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4075       {                                                                 \
4076         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4077         break;                                                          \
4078       }                                                                 \
4079     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4080       {                                                                 \
4081         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4082         break;                                                          \
4083       }                                                                 \
4084     else                                                                \
4085       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4086          must invoke it, or, at first, designate it to some graphic     \
4087          register.  Then repeat the loop to actually produce the        \
4088          character.  */                                                 \
4089       dst = encode_invocation_designation (charset, coding, dst,        \
4090                                            &produced_chars);            \
4091   } while (1)
4092
4093
4094 /* Produce codes for a DIMENSION2 character whose character set is
4095    CHARSET and whose position-codes are C1 and C2.  Designation and
4096    invocation codes are also produced in advance if necessary.  */
4097
4098 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4099   do {                                                                  \
4100     int id = CHARSET_ID (charset);                                      \
4101                                                                         \
4102     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4103         && id == charset_jisx0208)                                      \
4104       {                                                                 \
4105         id = charset_jisx0208_1978;                                     \
4106         charset = CHARSET_FROM_ID (id);                                 \
4107       }                                                                 \
4108                                                                         \
4109     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4110       {                                                                 \
4111         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4112           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4113         else                                                            \
4114           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4115         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4116         break;                                                          \
4117       }                                                                 \
4118     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4119       {                                                                 \
4120         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4121         break;                                                          \
4122       }                                                                 \
4123     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4124       {                                                                 \
4125         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4126         break;                                                          \
4127       }                                                                 \
4128     else                                                                \
4129       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4130          must invoke it, or, at first, designate it to some graphic     \
4131          register.  Then repeat the loop to actually produce the        \
4132          character.  */                                                 \
4133       dst = encode_invocation_designation (charset, coding, dst,        \
4134                                            &produced_chars);            \
4135   } while (1)
4136
4137
4138 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4139   do {                                                                     \
4140     unsigned code;                                                         \
4141     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4142                                                                            \
4143     if (CHARSET_DIMENSION (charset) == 1)                                  \
4144       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4145     else                                                                   \
4146       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4147   } while (0)
4148
4149
4150 /* Produce designation and invocation codes at a place pointed by DST
4151    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4152    Return new DST.  */
4153
4154 static unsigned char *
4155 encode_invocation_designation (struct charset *charset,
4156                                struct coding_system *coding,
4157                                unsigned char *dst, ptrdiff_t *p_nchars)
4158 {
4159   bool multibytep = coding->dst_multibyte;
4160   ptrdiff_t produced_chars = *p_nchars;
4161   int reg;                      /* graphic register number */
4162   int id = CHARSET_ID (charset);
4163
4164   /* At first, check designations.  */
4165   for (reg = 0; reg < 4; reg++)
4166     if (id == CODING_ISO_DESIGNATION (coding, reg))
4167       break;
4168
4169   if (reg >= 4)
4170     {
4171       /* CHARSET is not yet designated to any graphic registers.  */
4172       /* At first check the requested designation.  */
4173       reg = CODING_ISO_REQUEST (coding, id);
4174       if (reg < 0)
4175         /* Since CHARSET requests no special designation, designate it
4176            to graphic register 0.  */
4177         reg = 0;
4178
4179       ENCODE_DESIGNATION (charset, reg, coding);
4180     }
4181
4182   if (CODING_ISO_INVOCATION (coding, 0) != reg
4183       && CODING_ISO_INVOCATION (coding, 1) != reg)
4184     {
4185       /* Since the graphic register REG is not invoked to any graphic
4186          planes, invoke it to graphic plane 0.  */
4187       switch (reg)
4188         {
4189         case 0:                 /* graphic register 0 */
4190           ENCODE_SHIFT_IN;
4191           break;
4192
4193         case 1:                 /* graphic register 1 */
4194           ENCODE_SHIFT_OUT;
4195           break;
4196
4197         case 2:                 /* graphic register 2 */
4198           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4199             ENCODE_SINGLE_SHIFT_2;
4200           else
4201             ENCODE_LOCKING_SHIFT_2;
4202           break;
4203
4204         case 3:                 /* graphic register 3 */
4205           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4206             ENCODE_SINGLE_SHIFT_3;
4207           else
4208             ENCODE_LOCKING_SHIFT_3;
4209           break;
4210         }
4211     }
4212
4213   *p_nchars = produced_chars;
4214   return dst;
4215 }
4216
4217
4218 /* Produce codes for designation and invocation to reset the graphic
4219    planes and registers to initial state.  */
4220 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4221   do {                                                                  \
4222     int reg;                                                            \
4223     struct charset *charset;                                            \
4224                                                                         \
4225     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4226       ENCODE_SHIFT_IN;                                                  \
4227     for (reg = 0; reg < 4; reg++)                                       \
4228       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4229           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4230               != CODING_ISO_INITIAL (coding, reg)))                     \
4231         {                                                               \
4232           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4233           ENCODE_DESIGNATION (charset, reg, coding);                    \
4234         }                                                               \
4235   } while (0)
4236
4237
4238 /* Produce designation sequences of charsets in the line started from
4239    CHARBUF to a place pointed by DST, and return the number of
4240    produced bytes.  DST should not directly point a buffer text area
4241    which may be relocated by char_charset call.
4242
4243    If the current block ends before any end-of-line, we may fail to
4244    find all the necessary designations.  */
4245
4246 static ptrdiff_t
4247 encode_designation_at_bol (struct coding_system *coding,
4248                            int *charbuf, int *charbuf_end,
4249                            unsigned char *dst)
4250 {
4251   unsigned char *orig = dst;
4252   struct charset *charset;
4253   /* Table of charsets to be designated to each graphic register.  */
4254   int r[4];
4255   int c, found = 0, reg;
4256   ptrdiff_t produced_chars = 0;
4257   bool multibytep = coding->dst_multibyte;
4258   Lisp_Object attrs;
4259   Lisp_Object charset_list;
4260
4261   attrs = CODING_ID_ATTRS (coding->id);
4262   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4263   if (EQ (charset_list, Qiso_2022))
4264     charset_list = Viso_2022_charset_list;
4265
4266   for (reg = 0; reg < 4; reg++)
4267     r[reg] = -1;
4268
4269   while (charbuf < charbuf_end && found < 4)
4270     {
4271       int id;
4272
4273       c = *charbuf++;
4274       if (c == '\n')
4275         break;
4276       charset = char_charset (c, charset_list, NULL);
4277       id = CHARSET_ID (charset);
4278       reg = CODING_ISO_REQUEST (coding, id);
4279       if (reg >= 0 && r[reg] < 0)
4280         {
4281           found++;
4282           r[reg] = id;
4283         }
4284     }
4285
4286   if (found)
4287     {
4288       for (reg = 0; reg < 4; reg++)
4289         if (r[reg] >= 0
4290             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4291           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4292     }
4293
4294   return dst - orig;
4295 }
4296
4297 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4298
4299 static bool
4300 encode_coding_iso_2022 (struct coding_system *coding)
4301 {
4302   bool multibytep = coding->dst_multibyte;
4303   int *charbuf = coding->charbuf;
4304   int *charbuf_end = charbuf + coding->charbuf_used;
4305   unsigned char *dst = coding->destination + coding->produced;
4306   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4307   int safe_room = 16;
4308   bool bol_designation
4309     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4310        && CODING_ISO_BOL (coding));
4311   ptrdiff_t produced_chars = 0;
4312   Lisp_Object attrs, eol_type, charset_list;
4313   bool ascii_compatible;
4314   int c;
4315   int preferred_charset_id = -1;
4316
4317   CODING_GET_INFO (coding, attrs, charset_list);
4318   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4319   if (VECTORP (eol_type))
4320     eol_type = Qunix;
4321
4322   setup_iso_safe_charsets (attrs);
4323   /* Charset list may have been changed.  */
4324   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4325   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4326
4327   ascii_compatible
4328     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4329        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4330                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4331
4332   while (charbuf < charbuf_end)
4333     {
4334       ASSURE_DESTINATION (safe_room);
4335
4336       if (bol_designation)
4337         {
4338           /* We have to produce designation sequences if any now.  */
4339           unsigned char desig_buf[16];
4340           int nbytes;
4341           ptrdiff_t offset;
4342
4343           charset_map_loaded = 0;
4344           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4345                                               desig_buf);
4346           if (charset_map_loaded
4347               && (offset = coding_change_destination (coding)))
4348             {
4349               dst += offset;
4350               dst_end += offset;
4351             }
4352           memcpy (dst, desig_buf, nbytes);
4353           dst += nbytes;
4354           /* We are sure that designation sequences are all ASCII bytes.  */
4355           produced_chars += nbytes;
4356           bol_designation = 0;
4357           ASSURE_DESTINATION (safe_room);
4358         }
4359
4360       c = *charbuf++;
4361
4362       if (c < 0)
4363         {
4364           /* Handle an annotation.  */
4365           switch (*charbuf)
4366             {
4367             case CODING_ANNOTATE_COMPOSITION_MASK:
4368               /* Not yet implemented.  */
4369               break;
4370             case CODING_ANNOTATE_CHARSET_MASK:
4371               preferred_charset_id = charbuf[2];
4372               if (preferred_charset_id >= 0
4373                   && NILP (Fmemq (make_number (preferred_charset_id),
4374                                   charset_list)))
4375                 preferred_charset_id = -1;
4376               break;
4377             default:
4378               emacs_abort ();
4379             }
4380           charbuf += -c - 1;
4381           continue;
4382         }
4383
4384       /* Now encode the character C.  */
4385       if (c < 0x20 || c == 0x7F)
4386         {
4387           if (c == '\n'
4388               || (c == '\r' && EQ (eol_type, Qmac)))
4389             {
4390               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4391                 ENCODE_RESET_PLANE_AND_REGISTER ();
4392               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4393                 {
4394                   int i;
4395
4396                   for (i = 0; i < 4; i++)
4397                     CODING_ISO_DESIGNATION (coding, i)
4398                       = CODING_ISO_INITIAL (coding, i);
4399                 }
4400               bol_designation = ((CODING_ISO_FLAGS (coding)
4401                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4402                                  != 0);
4403             }
4404           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4405             ENCODE_RESET_PLANE_AND_REGISTER ();
4406           EMIT_ONE_ASCII_BYTE (c);
4407         }
4408       else if (ASCII_CHAR_P (c))
4409         {
4410           if (ascii_compatible)
4411             EMIT_ONE_ASCII_BYTE (c);
4412           else
4413             {
4414               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4415               ENCODE_ISO_CHARACTER (charset, c);
4416             }
4417         }
4418       else if (CHAR_BYTE8_P (c))
4419         {
4420           c = CHAR_TO_BYTE8 (c);
4421           EMIT_ONE_BYTE (c);
4422         }
4423       else
4424         {
4425           struct charset *charset;
4426
4427           if (preferred_charset_id >= 0)
4428             {
4429               bool result;
4430
4431               charset = CHARSET_FROM_ID (preferred_charset_id);
4432               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4433               if (! result)
4434                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4435                                      NULL, charset);
4436             }
4437           else
4438             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4439                                  NULL, charset);
4440           if (!charset)
4441             {
4442               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4443                 {
4444                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4445                   charset = CHARSET_FROM_ID (charset_ascii);
4446                 }
4447               else
4448                 {
4449                   c = coding->default_char;
4450                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4451                                        charset_list, NULL, charset);
4452                 }
4453             }
4454           ENCODE_ISO_CHARACTER (charset, c);
4455         }
4456     }
4457
4458   if (coding->mode & CODING_MODE_LAST_BLOCK
4459       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4460     {
4461       ASSURE_DESTINATION (safe_room);
4462       ENCODE_RESET_PLANE_AND_REGISTER ();
4463     }
4464   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4465   CODING_ISO_BOL (coding) = bol_designation;
4466   coding->produced_char += produced_chars;
4467   coding->produced = dst - coding->destination;
4468   return 0;
4469 }
4470
4471 \f
4472 /*** 8,9. SJIS and BIG5 handlers ***/
4473
4474 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4475    quite widely.  So, for the moment, Emacs supports them in the bare
4476    C code.  But, in the future, they may be supported only by CCL.  */
4477
4478 /* SJIS is a coding system encoding three character sets: ASCII, right
4479    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4480    as is.  A character of charset katakana-jisx0201 is encoded by
4481    "position-code + 0x80".  A character of charset japanese-jisx0208
4482    is encoded in 2-byte but two position-codes are divided and shifted
4483    so that it fit in the range below.
4484
4485    --- CODE RANGE of SJIS ---
4486    (character set)      (range)
4487    ASCII                0x00 .. 0x7F
4488    KATAKANA-JISX0201    0xA0 .. 0xDF
4489    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4490             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4491    -------------------------------
4492
4493 */
4494
4495 /* BIG5 is a coding system encoding two character sets: ASCII and
4496    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4497    character set and is encoded in two-byte.
4498
4499    --- CODE RANGE of BIG5 ---
4500    (character set)      (range)
4501    ASCII                0x00 .. 0x7F
4502    Big5 (1st byte)      0xA1 .. 0xFE
4503         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4504    --------------------------
4505
4506   */
4507
4508 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4509    Return true if a text is encoded in SJIS.  */
4510
4511 static bool
4512 detect_coding_sjis (struct coding_system *coding,
4513                     struct coding_detection_info *detect_info)
4514 {
4515   const unsigned char *src = coding->source, *src_base;
4516   const unsigned char *src_end = coding->source + coding->src_bytes;
4517   bool multibytep = coding->src_multibyte;
4518   ptrdiff_t consumed_chars = 0;
4519   int found = 0;
4520   int c;
4521   Lisp_Object attrs, charset_list;
4522   int max_first_byte_of_2_byte_code;
4523
4524   CODING_GET_INFO (coding, attrs, charset_list);
4525   max_first_byte_of_2_byte_code
4526     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4527
4528   detect_info->checked |= CATEGORY_MASK_SJIS;
4529   /* A coding system of this category is always ASCII compatible.  */
4530   src += coding->head_ascii;
4531
4532   while (1)
4533     {
4534       src_base = src;
4535       ONE_MORE_BYTE (c);
4536       if (c < 0x80)
4537         continue;
4538       if ((c >= 0x81 && c <= 0x9F)
4539           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4540         {
4541           ONE_MORE_BYTE (c);
4542           if (c < 0x40 || c == 0x7F || c > 0xFC)
4543             break;
4544           found = CATEGORY_MASK_SJIS;
4545         }
4546       else if (c >= 0xA0 && c < 0xE0)
4547         found = CATEGORY_MASK_SJIS;
4548       else
4549         break;
4550     }
4551   detect_info->rejected |= CATEGORY_MASK_SJIS;
4552   return 0;
4553
4554  no_more_source:
4555   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4556     {
4557       detect_info->rejected |= CATEGORY_MASK_SJIS;
4558       return 0;
4559     }
4560   detect_info->found |= found;
4561   return 1;
4562 }
4563
4564 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4565    Return true if a text is encoded in BIG5.  */
4566
4567 static bool
4568 detect_coding_big5 (struct coding_system *coding,
4569                     struct coding_detection_info *detect_info)
4570 {
4571   const unsigned char *src = coding->source, *src_base;
4572   const unsigned char *src_end = coding->source + coding->src_bytes;
4573   bool multibytep = coding->src_multibyte;
4574   ptrdiff_t consumed_chars = 0;
4575   int found = 0;
4576   int c;
4577
4578   detect_info->checked |= CATEGORY_MASK_BIG5;
4579   /* A coding system of this category is always ASCII compatible.  */
4580   src += coding->head_ascii;
4581
4582   while (1)
4583     {
4584       src_base = src;
4585       ONE_MORE_BYTE (c);
4586       if (c < 0x80)
4587         continue;
4588       if (c >= 0xA1)
4589         {
4590           ONE_MORE_BYTE (c);
4591           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4592             return 0;
4593           found = CATEGORY_MASK_BIG5;
4594         }
4595       else
4596         break;
4597     }
4598   detect_info->rejected |= CATEGORY_MASK_BIG5;
4599   return 0;
4600
4601  no_more_source:
4602   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4603     {
4604       detect_info->rejected |= CATEGORY_MASK_BIG5;
4605       return 0;
4606     }
4607   detect_info->found |= found;
4608   return 1;
4609 }
4610
4611 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4612
4613 static void
4614 decode_coding_sjis (struct coding_system *coding)
4615 {
4616   const unsigned char *src = coding->source + coding->consumed;
4617   const unsigned char *src_end = coding->source + coding->src_bytes;
4618   const unsigned char *src_base;
4619   int *charbuf = coding->charbuf + coding->charbuf_used;
4620   /* We may produce one charset annotation in one loop and one more at
4621      the end.  */
4622   int *charbuf_end
4623     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4624   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4625   bool multibytep = coding->src_multibyte;
4626   struct charset *charset_roman, *charset_kanji, *charset_kana;
4627   struct charset *charset_kanji2;
4628   Lisp_Object attrs, charset_list, val;
4629   ptrdiff_t char_offset = coding->produced_char;
4630   ptrdiff_t last_offset = char_offset;
4631   int last_id = charset_ascii;
4632   bool eol_dos
4633     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4634   int byte_after_cr = -1;
4635
4636   CODING_GET_INFO (coding, attrs, charset_list);
4637
4638   val = charset_list;
4639   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4640   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4641   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4642   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4643
4644   while (1)
4645     {
4646       int c, c1;
4647       struct charset *charset;
4648
4649       src_base = src;
4650       consumed_chars_base = consumed_chars;
4651
4652       if (charbuf >= charbuf_end)
4653         {
4654           if (byte_after_cr >= 0)
4655             src_base--;
4656           break;
4657         }
4658
4659       if (byte_after_cr >= 0)
4660         c = byte_after_cr, byte_after_cr = -1;
4661       else
4662         ONE_MORE_BYTE (c);
4663       if (c < 0)
4664         goto invalid_code;
4665       if (c < 0x80)
4666         {
4667           if (eol_dos && c == '\r')
4668             ONE_MORE_BYTE (byte_after_cr);
4669           charset = charset_roman;
4670         }
4671       else if (c == 0x80 || c == 0xA0)
4672         goto invalid_code;
4673       else if (c >= 0xA1 && c <= 0xDF)
4674         {
4675           /* SJIS -> JISX0201-Kana */
4676           c &= 0x7F;
4677           charset = charset_kana;
4678         }
4679       else if (c <= 0xEF)
4680         {
4681           /* SJIS -> JISX0208 */
4682           ONE_MORE_BYTE (c1);
4683           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4684             goto invalid_code;
4685           c = (c << 8) | c1;
4686           SJIS_TO_JIS (c);
4687           charset = charset_kanji;
4688         }
4689       else if (c <= 0xFC && charset_kanji2)
4690         {
4691           /* SJIS -> JISX0213-2 */
4692           ONE_MORE_BYTE (c1);
4693           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4694             goto invalid_code;
4695           c = (c << 8) | c1;
4696           SJIS_TO_JIS2 (c);
4697           charset = charset_kanji2;
4698         }
4699       else
4700         goto invalid_code;
4701       if (charset->id != charset_ascii
4702           && last_id != charset->id)
4703         {
4704           if (last_id != charset_ascii)
4705             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4706           last_id = charset->id;
4707           last_offset = char_offset;
4708         }
4709       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4710       *charbuf++ = c;
4711       char_offset++;
4712       continue;
4713
4714     invalid_code:
4715       src = src_base;
4716       consumed_chars = consumed_chars_base;
4717       ONE_MORE_BYTE (c);
4718       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4719       char_offset++;
4720       coding->errors++;
4721     }
4722
4723  no_more_source:
4724   if (last_id != charset_ascii)
4725     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4726   coding->consumed_char += consumed_chars_base;
4727   coding->consumed = src_base - coding->source;
4728   coding->charbuf_used = charbuf - coding->charbuf;
4729 }
4730
4731 static void
4732 decode_coding_big5 (struct coding_system *coding)
4733 {
4734   const unsigned char *src = coding->source + coding->consumed;
4735   const unsigned char *src_end = coding->source + coding->src_bytes;
4736   const unsigned char *src_base;
4737   int *charbuf = coding->charbuf + coding->charbuf_used;
4738   /* We may produce one charset annotation in one loop and one more at
4739      the end.  */
4740   int *charbuf_end
4741     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4742   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4743   bool multibytep = coding->src_multibyte;
4744   struct charset *charset_roman, *charset_big5;
4745   Lisp_Object attrs, charset_list, val;
4746   ptrdiff_t char_offset = coding->produced_char;
4747   ptrdiff_t last_offset = char_offset;
4748   int last_id = charset_ascii;
4749   bool eol_dos
4750     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4751   int byte_after_cr = -1;
4752
4753   CODING_GET_INFO (coding, attrs, charset_list);
4754   val = charset_list;
4755   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4756   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4757
4758   while (1)
4759     {
4760       int c, c1;
4761       struct charset *charset;
4762
4763       src_base = src;
4764       consumed_chars_base = consumed_chars;
4765
4766       if (charbuf >= charbuf_end)
4767         {
4768           if (byte_after_cr >= 0)
4769             src_base--;
4770           break;
4771         }
4772
4773       if (byte_after_cr >= 0)
4774         c = byte_after_cr, byte_after_cr = -1;
4775       else
4776         ONE_MORE_BYTE (c);
4777
4778       if (c < 0)
4779         goto invalid_code;
4780       if (c < 0x80)
4781         {
4782           if (eol_dos && c == '\r')
4783             ONE_MORE_BYTE (byte_after_cr);
4784           charset = charset_roman;
4785         }
4786       else
4787         {
4788           /* BIG5 -> Big5 */
4789           if (c < 0xA1 || c > 0xFE)
4790             goto invalid_code;
4791           ONE_MORE_BYTE (c1);
4792           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4793             goto invalid_code;
4794           c = c << 8 | c1;
4795           charset = charset_big5;
4796         }
4797       if (charset->id != charset_ascii
4798           && last_id != charset->id)
4799         {
4800           if (last_id != charset_ascii)
4801             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4802           last_id = charset->id;
4803           last_offset = char_offset;
4804         }
4805       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4806       *charbuf++ = c;
4807       char_offset++;
4808       continue;
4809
4810     invalid_code:
4811       src = src_base;
4812       consumed_chars = consumed_chars_base;
4813       ONE_MORE_BYTE (c);
4814       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4815       char_offset++;
4816       coding->errors++;
4817     }
4818
4819  no_more_source:
4820   if (last_id != charset_ascii)
4821     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4822   coding->consumed_char += consumed_chars_base;
4823   coding->consumed = src_base - coding->source;
4824   coding->charbuf_used = charbuf - coding->charbuf;
4825 }
4826
4827 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4828    This function can encode charsets `ascii', `katakana-jisx0201',
4829    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4830    are sure that all these charsets are registered as official charset
4831    (i.e. do not have extended leading-codes).  Characters of other
4832    charsets are produced without any encoding.  */
4833
4834 static bool
4835 encode_coding_sjis (struct coding_system *coding)
4836 {
4837   bool multibytep = coding->dst_multibyte;
4838   int *charbuf = coding->charbuf;
4839   int *charbuf_end = charbuf + coding->charbuf_used;
4840   unsigned char *dst = coding->destination + coding->produced;
4841   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4842   int safe_room = 4;
4843   ptrdiff_t produced_chars = 0;
4844   Lisp_Object attrs, charset_list, val;
4845   bool ascii_compatible;
4846   struct charset *charset_kanji, *charset_kana;
4847   struct charset *charset_kanji2;
4848   int c;
4849
4850   CODING_GET_INFO (coding, attrs, charset_list);
4851   val = XCDR (charset_list);
4852   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4853   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4854   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4855
4856   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4857
4858   while (charbuf < charbuf_end)
4859     {
4860       ASSURE_DESTINATION (safe_room);
4861       c = *charbuf++;
4862       /* Now encode the character C.  */
4863       if (ASCII_CHAR_P (c) && ascii_compatible)
4864         EMIT_ONE_ASCII_BYTE (c);
4865       else if (CHAR_BYTE8_P (c))
4866         {
4867           c = CHAR_TO_BYTE8 (c);
4868           EMIT_ONE_BYTE (c);
4869         }
4870       else
4871         {
4872           unsigned code;
4873           struct charset *charset;
4874           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4875                                &code, charset);
4876
4877           if (!charset)
4878             {
4879               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4880                 {
4881                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4882                   charset = CHARSET_FROM_ID (charset_ascii);
4883                 }
4884               else
4885                 {
4886                   c = coding->default_char;
4887                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4888                                        charset_list, &code, charset);
4889                 }
4890             }
4891           if (code == CHARSET_INVALID_CODE (charset))
4892             emacs_abort ();
4893           if (charset == charset_kanji)
4894             {
4895               int c1, c2;
4896               JIS_TO_SJIS (code);
4897               c1 = code >> 8, c2 = code & 0xFF;
4898               EMIT_TWO_BYTES (c1, c2);
4899             }
4900           else if (charset == charset_kana)
4901             EMIT_ONE_BYTE (code | 0x80);
4902           else if (charset_kanji2 && charset == charset_kanji2)
4903             {
4904               int c1, c2;
4905
4906               c1 = code >> 8;
4907               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4908                   || c1 == 0x28
4909                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4910                 {
4911                   JIS_TO_SJIS2 (code);
4912                   c1 = code >> 8, c2 = code & 0xFF;
4913                   EMIT_TWO_BYTES (c1, c2);
4914                 }
4915               else
4916                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4917             }
4918           else
4919             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4920         }
4921     }
4922   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4923   coding->produced_char += produced_chars;
4924   coding->produced = dst - coding->destination;
4925   return 0;
4926 }
4927
4928 static bool
4929 encode_coding_big5 (struct coding_system *coding)
4930 {
4931   bool multibytep = coding->dst_multibyte;
4932   int *charbuf = coding->charbuf;
4933   int *charbuf_end = charbuf + coding->charbuf_used;
4934   unsigned char *dst = coding->destination + coding->produced;
4935   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4936   int safe_room = 4;
4937   ptrdiff_t produced_chars = 0;
4938   Lisp_Object attrs, charset_list, val;
4939   bool ascii_compatible;
4940   struct charset *charset_big5;
4941   int c;
4942
4943   CODING_GET_INFO (coding, attrs, charset_list);
4944   val = XCDR (charset_list);
4945   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4946   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4947
4948   while (charbuf < charbuf_end)
4949     {
4950       ASSURE_DESTINATION (safe_room);
4951       c = *charbuf++;
4952       /* Now encode the character C.  */
4953       if (ASCII_CHAR_P (c) && ascii_compatible)
4954         EMIT_ONE_ASCII_BYTE (c);
4955       else if (CHAR_BYTE8_P (c))
4956         {
4957           c = CHAR_TO_BYTE8 (c);
4958           EMIT_ONE_BYTE (c);
4959         }
4960       else
4961         {
4962           unsigned code;
4963           struct charset *charset;
4964           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4965                                &code, charset);
4966
4967           if (! charset)
4968             {
4969               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4970                 {
4971                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4972                   charset = CHARSET_FROM_ID (charset_ascii);
4973                 }
4974               else
4975                 {
4976                   c = coding->default_char;
4977                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4978                                        charset_list, &code, charset);
4979                 }
4980             }
4981           if (code == CHARSET_INVALID_CODE (charset))
4982             emacs_abort ();
4983           if (charset == charset_big5)
4984             {
4985               int c1, c2;
4986
4987               c1 = code >> 8, c2 = code & 0xFF;
4988               EMIT_TWO_BYTES (c1, c2);
4989             }
4990           else
4991             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4992         }
4993     }
4994   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4995   coding->produced_char += produced_chars;
4996   coding->produced = dst - coding->destination;
4997   return 0;
4998 }
4999
5000 \f
5001 /*** 10. CCL handlers ***/
5002
5003 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5004    Return true if a text is encoded in a coding system of which
5005    encoder/decoder are written in CCL program.  */
5006
5007 static bool
5008 detect_coding_ccl (struct coding_system *coding,
5009                    struct coding_detection_info *detect_info)
5010 {
5011   const unsigned char *src = coding->source, *src_base;
5012   const unsigned char *src_end = coding->source + coding->src_bytes;
5013   bool multibytep = coding->src_multibyte;
5014   ptrdiff_t consumed_chars = 0;
5015   int found = 0;
5016   unsigned char *valids;
5017   ptrdiff_t head_ascii = coding->head_ascii;
5018   Lisp_Object attrs;
5019
5020   detect_info->checked |= CATEGORY_MASK_CCL;
5021
5022   coding = &coding_categories[coding_category_ccl];
5023   valids = CODING_CCL_VALIDS (coding);
5024   attrs = CODING_ID_ATTRS (coding->id);
5025   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5026     src += head_ascii;
5027
5028   while (1)
5029     {
5030       int c;
5031
5032       src_base = src;
5033       ONE_MORE_BYTE (c);
5034       if (c < 0 || ! valids[c])
5035         break;
5036       if ((valids[c] > 1))
5037         found = CATEGORY_MASK_CCL;
5038     }
5039   detect_info->rejected |= CATEGORY_MASK_CCL;
5040   return 0;
5041
5042  no_more_source:
5043   detect_info->found |= found;
5044   return 1;
5045 }
5046
5047 static void
5048 decode_coding_ccl (struct coding_system *coding)
5049 {
5050   const unsigned char *src = coding->source + coding->consumed;
5051   const unsigned char *src_end = coding->source + coding->src_bytes;
5052   int *charbuf = coding->charbuf + coding->charbuf_used;
5053   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5054   ptrdiff_t consumed_chars = 0;
5055   bool multibytep = coding->src_multibyte;
5056   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5057   int source_charbuf[1024];
5058   int source_byteidx[1025];
5059   Lisp_Object attrs, charset_list;
5060
5061   CODING_GET_INFO (coding, attrs, charset_list);
5062
5063   while (1)
5064     {
5065       const unsigned char *p = src;
5066       ptrdiff_t offset;
5067       int i = 0;
5068
5069       if (multibytep)
5070         {
5071           while (i < 1024 && p < src_end)
5072             {
5073               source_byteidx[i] = p - src;
5074               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5075             }
5076           source_byteidx[i] = p - src;
5077         }
5078       else
5079         while (i < 1024 && p < src_end)
5080           source_charbuf[i++] = *p++;
5081
5082       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5083         ccl->last_block = 1;
5084       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5085       charset_map_loaded = 0;
5086       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5087                   charset_list);
5088       if (charset_map_loaded
5089           && (offset = coding_change_source (coding)))
5090         {
5091           p += offset;
5092           src += offset;
5093           src_end += offset;
5094         }
5095       charbuf += ccl->produced;
5096       if (multibytep)
5097         src += source_byteidx[ccl->consumed];
5098       else
5099         src += ccl->consumed;
5100       consumed_chars += ccl->consumed;
5101       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5102         break;
5103     }
5104
5105   switch (ccl->status)
5106     {
5107     case CCL_STAT_SUSPEND_BY_SRC:
5108       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5109       break;
5110     case CCL_STAT_SUSPEND_BY_DST:
5111       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5112       break;
5113     case CCL_STAT_QUIT:
5114     case CCL_STAT_INVALID_CMD:
5115       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5116       break;
5117     default:
5118       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5119       break;
5120     }
5121   coding->consumed_char += consumed_chars;
5122   coding->consumed = src - coding->source;
5123   coding->charbuf_used = charbuf - coding->charbuf;
5124 }
5125
5126 static bool
5127 encode_coding_ccl (struct coding_system *coding)
5128 {
5129   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5130   bool multibytep = coding->dst_multibyte;
5131   int *charbuf = coding->charbuf;
5132   int *charbuf_end = charbuf + coding->charbuf_used;
5133   unsigned char *dst = coding->destination + coding->produced;
5134   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5135   int destination_charbuf[1024];
5136   ptrdiff_t produced_chars = 0;
5137   int i;
5138   Lisp_Object attrs, charset_list;
5139
5140   CODING_GET_INFO (coding, attrs, charset_list);
5141   if (coding->consumed_char == coding->src_chars
5142       && coding->mode & CODING_MODE_LAST_BLOCK)
5143     ccl->last_block = 1;
5144
5145   do
5146     {
5147       ptrdiff_t offset;
5148
5149       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5150       charset_map_loaded = 0;
5151       ccl_driver (ccl, charbuf, destination_charbuf,
5152                   charbuf_end - charbuf, 1024, charset_list);
5153       if (charset_map_loaded
5154           && (offset = coding_change_destination (coding)))
5155         dst += offset;
5156       if (multibytep)
5157         {
5158           ASSURE_DESTINATION (ccl->produced * 2);
5159           for (i = 0; i < ccl->produced; i++)
5160             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5161         }
5162       else
5163         {
5164           ASSURE_DESTINATION (ccl->produced);
5165           for (i = 0; i < ccl->produced; i++)
5166             *dst++ = destination_charbuf[i] & 0xFF;
5167           produced_chars += ccl->produced;
5168         }
5169       charbuf += ccl->consumed;
5170       if (ccl->status == CCL_STAT_QUIT
5171           || ccl->status == CCL_STAT_INVALID_CMD)
5172         break;
5173     }
5174   while (charbuf < charbuf_end);
5175
5176   switch (ccl->status)
5177     {
5178     case CCL_STAT_SUSPEND_BY_SRC:
5179       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5180       break;
5181     case CCL_STAT_SUSPEND_BY_DST:
5182       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5183       break;
5184     case CCL_STAT_QUIT:
5185     case CCL_STAT_INVALID_CMD:
5186       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5187       break;
5188     default:
5189       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5190       break;
5191     }
5192
5193   coding->produced_char += produced_chars;
5194   coding->produced = dst - coding->destination;
5195   return 0;
5196 }
5197
5198 \f
5199 /*** 10, 11. no-conversion handlers ***/
5200
5201 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5202
5203 static void
5204 decode_coding_raw_text (struct coding_system *coding)
5205 {
5206   bool eol_dos
5207     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5208
5209   coding->chars_at_source = 1;
5210   coding->consumed_char = coding->src_chars;
5211   coding->consumed = coding->src_bytes;
5212   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5213     {
5214       coding->consumed_char--;
5215       coding->consumed--;
5216       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5217     }
5218   else
5219     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5220 }
5221
5222 static bool
5223 encode_coding_raw_text (struct coding_system *coding)
5224 {
5225   bool multibytep = coding->dst_multibyte;
5226   int *charbuf = coding->charbuf;
5227   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5228   unsigned char *dst = coding->destination + coding->produced;
5229   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5230   ptrdiff_t produced_chars = 0;
5231   int c;
5232
5233   if (multibytep)
5234     {
5235       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5236
5237       if (coding->src_multibyte)
5238         while (charbuf < charbuf_end)
5239           {
5240             ASSURE_DESTINATION (safe_room);
5241             c = *charbuf++;
5242             if (ASCII_CHAR_P (c))
5243               EMIT_ONE_ASCII_BYTE (c);
5244             else if (CHAR_BYTE8_P (c))
5245               {
5246                 c = CHAR_TO_BYTE8 (c);
5247                 EMIT_ONE_BYTE (c);
5248               }
5249             else
5250               {
5251                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5252
5253                 CHAR_STRING_ADVANCE (c, p1);
5254                 do
5255                   {
5256                     EMIT_ONE_BYTE (*p0);
5257                     p0++;
5258                   }
5259                 while (p0 < p1);
5260               }
5261           }
5262       else
5263         while (charbuf < charbuf_end)
5264           {
5265             ASSURE_DESTINATION (safe_room);
5266             c = *charbuf++;
5267             EMIT_ONE_BYTE (c);
5268           }
5269     }
5270   else
5271     {
5272       if (coding->src_multibyte)
5273         {
5274           int safe_room = MAX_MULTIBYTE_LENGTH;
5275
5276           while (charbuf < charbuf_end)
5277             {
5278               ASSURE_DESTINATION (safe_room);
5279               c = *charbuf++;
5280               if (ASCII_CHAR_P (c))
5281                 *dst++ = c;
5282               else if (CHAR_BYTE8_P (c))
5283                 *dst++ = CHAR_TO_BYTE8 (c);
5284               else
5285                 CHAR_STRING_ADVANCE (c, dst);
5286             }
5287         }
5288       else
5289         {
5290           ASSURE_DESTINATION (charbuf_end - charbuf);
5291           while (charbuf < charbuf_end && dst < dst_end)
5292             *dst++ = *charbuf++;
5293         }
5294       produced_chars = dst - (coding->destination + coding->produced);
5295     }
5296   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5297   coding->produced_char += produced_chars;
5298   coding->produced = dst - coding->destination;
5299   return 0;
5300 }
5301
5302 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5303    Return true if a text is encoded in a charset-based coding system.  */
5304
5305 static bool
5306 detect_coding_charset (struct coding_system *coding,
5307                        struct coding_detection_info *detect_info)
5308 {
5309   const unsigned char *src = coding->source, *src_base;
5310   const unsigned char *src_end = coding->source + coding->src_bytes;
5311   bool multibytep = coding->src_multibyte;
5312   ptrdiff_t consumed_chars = 0;
5313   Lisp_Object attrs, valids, name;
5314   int found = 0;
5315   ptrdiff_t head_ascii = coding->head_ascii;
5316   bool check_latin_extra = 0;
5317
5318   detect_info->checked |= CATEGORY_MASK_CHARSET;
5319
5320   coding = &coding_categories[coding_category_charset];
5321   attrs = CODING_ID_ATTRS (coding->id);
5322   valids = AREF (attrs, coding_attr_charset_valids);
5323   name = CODING_ID_NAME (coding->id);
5324   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5325                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5326       || strncmp (SSDATA (SYMBOL_NAME (name)),
5327                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5328     check_latin_extra = 1;
5329
5330   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5331     src += head_ascii;
5332
5333   while (1)
5334     {
5335       int c;
5336       Lisp_Object val;
5337       struct charset *charset;
5338       int dim, idx;
5339
5340       src_base = src;
5341       ONE_MORE_BYTE (c);
5342       if (c < 0)
5343         continue;
5344       val = AREF (valids, c);
5345       if (NILP (val))
5346         break;
5347       if (c >= 0x80)
5348         {
5349           if (c < 0xA0
5350               && check_latin_extra
5351               && (!VECTORP (Vlatin_extra_code_table)
5352                   || NILP (AREF (Vlatin_extra_code_table, c))))
5353             break;
5354           found = CATEGORY_MASK_CHARSET;
5355         }
5356       if (INTEGERP (val))
5357         {
5358           charset = CHARSET_FROM_ID (XFASTINT (val));
5359           dim = CHARSET_DIMENSION (charset);
5360           for (idx = 1; idx < dim; idx++)
5361             {
5362               if (src == src_end)
5363                 goto too_short;
5364               ONE_MORE_BYTE (c);
5365               if (c < charset->code_space[(dim - 1 - idx) * 4]
5366                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5367                 break;
5368             }
5369           if (idx < dim)
5370             break;
5371         }
5372       else
5373         {
5374           idx = 1;
5375           for (; CONSP (val); val = XCDR (val))
5376             {
5377               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5378               dim = CHARSET_DIMENSION (charset);
5379               while (idx < dim)
5380                 {
5381                   if (src == src_end)
5382                     goto too_short;
5383                   ONE_MORE_BYTE (c);
5384                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5385                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5386                     break;
5387                   idx++;
5388                 }
5389               if (idx == dim)
5390                 {
5391                   val = Qnil;
5392                   break;
5393                 }
5394             }
5395           if (CONSP (val))
5396             break;
5397         }
5398     }
5399  too_short:
5400   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5401   return 0;
5402
5403  no_more_source:
5404   detect_info->found |= found;
5405   return 1;
5406 }
5407
5408 static void
5409 decode_coding_charset (struct coding_system *coding)
5410 {
5411   const unsigned char *src = coding->source + coding->consumed;
5412   const unsigned char *src_end = coding->source + coding->src_bytes;
5413   const unsigned char *src_base;
5414   int *charbuf = coding->charbuf + coding->charbuf_used;
5415   /* We may produce one charset annotation in one loop and one more at
5416      the end.  */
5417   int *charbuf_end
5418     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5419   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5420   bool multibytep = coding->src_multibyte;
5421   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5422   Lisp_Object valids;
5423   ptrdiff_t char_offset = coding->produced_char;
5424   ptrdiff_t last_offset = char_offset;
5425   int last_id = charset_ascii;
5426   bool eol_dos
5427     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5428   int byte_after_cr = -1;
5429
5430   valids = AREF (attrs, coding_attr_charset_valids);
5431
5432   while (1)
5433     {
5434       int c;
5435       Lisp_Object val;
5436       struct charset *charset;
5437       int dim;
5438       int len = 1;
5439       unsigned code;
5440
5441       src_base = src;
5442       consumed_chars_base = consumed_chars;
5443
5444       if (charbuf >= charbuf_end)
5445         {
5446           if (byte_after_cr >= 0)
5447             src_base--;
5448           break;
5449         }
5450
5451       if (byte_after_cr >= 0)
5452         {
5453           c = byte_after_cr;
5454           byte_after_cr = -1;
5455         }
5456       else
5457         {
5458           ONE_MORE_BYTE (c);
5459           if (eol_dos && c == '\r')
5460             ONE_MORE_BYTE (byte_after_cr);
5461         }
5462       if (c < 0)
5463         goto invalid_code;
5464       code = c;
5465
5466       val = AREF (valids, c);
5467       if (! INTEGERP (val) && ! CONSP (val))
5468         goto invalid_code;
5469       if (INTEGERP (val))
5470         {
5471           charset = CHARSET_FROM_ID (XFASTINT (val));
5472           dim = CHARSET_DIMENSION (charset);
5473           while (len < dim)
5474             {
5475               ONE_MORE_BYTE (c);
5476               code = (code << 8) | c;
5477               len++;
5478             }
5479           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5480                               charset, code, c);
5481         }
5482       else
5483         {
5484           /* VAL is a list of charset IDs.  It is assured that the
5485              list is sorted by charset dimensions (smaller one
5486              comes first).  */
5487           while (CONSP (val))
5488             {
5489               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5490               dim = CHARSET_DIMENSION (charset);
5491               while (len < dim)
5492                 {
5493                   ONE_MORE_BYTE (c);
5494                   code = (code << 8) | c;
5495                   len++;
5496                 }
5497               CODING_DECODE_CHAR (coding, src, src_base,
5498                                   src_end, charset, code, c);
5499               if (c >= 0)
5500                 break;
5501               val = XCDR (val);
5502             }
5503         }
5504       if (c < 0)
5505         goto invalid_code;
5506       if (charset->id != charset_ascii
5507           && last_id != charset->id)
5508         {
5509           if (last_id != charset_ascii)
5510             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5511           last_id = charset->id;
5512           last_offset = char_offset;
5513         }
5514
5515       *charbuf++ = c;
5516       char_offset++;
5517       continue;
5518
5519     invalid_code:
5520       src = src_base;
5521       consumed_chars = consumed_chars_base;
5522       ONE_MORE_BYTE (c);
5523       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5524       char_offset++;
5525       coding->errors++;
5526     }
5527
5528  no_more_source:
5529   if (last_id != charset_ascii)
5530     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5531   coding->consumed_char += consumed_chars_base;
5532   coding->consumed = src_base - coding->source;
5533   coding->charbuf_used = charbuf - coding->charbuf;
5534 }
5535
5536 static bool
5537 encode_coding_charset (struct coding_system *coding)
5538 {
5539   bool multibytep = coding->dst_multibyte;
5540   int *charbuf = coding->charbuf;
5541   int *charbuf_end = charbuf + coding->charbuf_used;
5542   unsigned char *dst = coding->destination + coding->produced;
5543   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5544   int safe_room = MAX_MULTIBYTE_LENGTH;
5545   ptrdiff_t produced_chars = 0;
5546   Lisp_Object attrs, charset_list;
5547   bool ascii_compatible;
5548   int c;
5549
5550   CODING_GET_INFO (coding, attrs, charset_list);
5551   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5552
5553   while (charbuf < charbuf_end)
5554     {
5555       struct charset *charset;
5556       unsigned code;
5557
5558       ASSURE_DESTINATION (safe_room);
5559       c = *charbuf++;
5560       if (ascii_compatible && ASCII_CHAR_P (c))
5561         EMIT_ONE_ASCII_BYTE (c);
5562       else if (CHAR_BYTE8_P (c))
5563         {
5564           c = CHAR_TO_BYTE8 (c);
5565           EMIT_ONE_BYTE (c);
5566         }
5567       else
5568         {
5569           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5570                                &code, charset);
5571
5572           if (charset)
5573             {
5574               if (CHARSET_DIMENSION (charset) == 1)
5575                 EMIT_ONE_BYTE (code);
5576               else if (CHARSET_DIMENSION (charset) == 2)
5577                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5578               else if (CHARSET_DIMENSION (charset) == 3)
5579                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5580               else
5581                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5582                                  (code >> 8) & 0xFF, code & 0xFF);
5583             }
5584           else
5585             {
5586               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5587                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5588               else
5589                 c = coding->default_char;
5590               EMIT_ONE_BYTE (c);
5591             }
5592         }
5593     }
5594
5595   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5596   coding->produced_char += produced_chars;
5597   coding->produced = dst - coding->destination;
5598   return 0;
5599 }
5600
5601 \f
5602 /*** 7. C library functions ***/
5603
5604 /* Setup coding context CODING from information about CODING_SYSTEM.
5605    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5606    CODING_SYSTEM is invalid, signal an error.  */
5607
5608 void
5609 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5610 {
5611   Lisp_Object attrs;
5612   Lisp_Object eol_type;
5613   Lisp_Object coding_type;
5614   Lisp_Object val;
5615
5616   if (NILP (coding_system))
5617     coding_system = Qundecided;
5618
5619   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5620
5621   attrs = CODING_ID_ATTRS (coding->id);
5622   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5623
5624   coding->mode = 0;
5625   coding->head_ascii = -1;
5626   if (VECTORP (eol_type))
5627     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5628                             | CODING_REQUIRE_DETECTION_MASK);
5629   else if (! EQ (eol_type, Qunix))
5630     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5631                             | CODING_REQUIRE_ENCODING_MASK);
5632   else
5633     coding->common_flags = 0;
5634   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5635     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5636   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5637     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5638   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5639     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5640
5641   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5642   coding->max_charset_id = SCHARS (val) - 1;
5643   coding->safe_charsets = SDATA (val);
5644   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5645   coding->carryover_bytes = 0;
5646
5647   coding_type = CODING_ATTR_TYPE (attrs);
5648   if (EQ (coding_type, Qundecided))
5649     {
5650       coding->detector = NULL;
5651       coding->decoder = decode_coding_raw_text;
5652       coding->encoder = encode_coding_raw_text;
5653       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5654     }
5655   else if (EQ (coding_type, Qiso_2022))
5656     {
5657       int i;
5658       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5659
5660       /* Invoke graphic register 0 to plane 0.  */
5661       CODING_ISO_INVOCATION (coding, 0) = 0;
5662       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5663       CODING_ISO_INVOCATION (coding, 1)
5664         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5665       /* Setup the initial status of designation.  */
5666       for (i = 0; i < 4; i++)
5667         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5668       /* Not single shifting initially.  */
5669       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5670       /* Beginning of buffer should also be regarded as bol. */
5671       CODING_ISO_BOL (coding) = 1;
5672       coding->detector = detect_coding_iso_2022;
5673       coding->decoder = decode_coding_iso_2022;
5674       coding->encoder = encode_coding_iso_2022;
5675       if (flags & CODING_ISO_FLAG_SAFE)
5676         coding->mode |= CODING_MODE_SAFE_ENCODING;
5677       coding->common_flags
5678         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5679             | CODING_REQUIRE_FLUSHING_MASK);
5680       if (flags & CODING_ISO_FLAG_COMPOSITION)
5681         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5682       if (flags & CODING_ISO_FLAG_DESIGNATION)
5683         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5684       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5685         {
5686           setup_iso_safe_charsets (attrs);
5687           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5688           coding->max_charset_id = SCHARS (val) - 1;
5689           coding->safe_charsets = SDATA (val);
5690         }
5691       CODING_ISO_FLAGS (coding) = flags;
5692       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5693       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5694       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5695       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5696     }
5697   else if (EQ (coding_type, Qcharset))
5698     {
5699       coding->detector = detect_coding_charset;
5700       coding->decoder = decode_coding_charset;
5701       coding->encoder = encode_coding_charset;
5702       coding->common_flags
5703         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5704     }
5705   else if (EQ (coding_type, Qutf_8))
5706     {
5707       val = AREF (attrs, coding_attr_utf_bom);
5708       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5709                                    : EQ (val, Qt) ? utf_with_bom
5710                                    : utf_without_bom);
5711       coding->detector = detect_coding_utf_8;
5712       coding->decoder = decode_coding_utf_8;
5713       coding->encoder = encode_coding_utf_8;
5714       coding->common_flags
5715         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5716       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5717         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5718     }
5719   else if (EQ (coding_type, Qutf_16))
5720     {
5721       val = AREF (attrs, coding_attr_utf_bom);
5722       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5723                                     : EQ (val, Qt) ? utf_with_bom
5724                                     : utf_without_bom);
5725       val = AREF (attrs, coding_attr_utf_16_endian);
5726       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5727                                        : utf_16_little_endian);
5728       CODING_UTF_16_SURROGATE (coding) = 0;
5729       coding->detector = detect_coding_utf_16;
5730       coding->decoder = decode_coding_utf_16;
5731       coding->encoder = encode_coding_utf_16;
5732       coding->common_flags
5733         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5734       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5735         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5736     }
5737   else if (EQ (coding_type, Qccl))
5738     {
5739       coding->detector = detect_coding_ccl;
5740       coding->decoder = decode_coding_ccl;
5741       coding->encoder = encode_coding_ccl;
5742       coding->common_flags
5743         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5744             | CODING_REQUIRE_FLUSHING_MASK);
5745     }
5746   else if (EQ (coding_type, Qemacs_mule))
5747     {
5748       coding->detector = detect_coding_emacs_mule;
5749       coding->decoder = decode_coding_emacs_mule;
5750       coding->encoder = encode_coding_emacs_mule;
5751       coding->common_flags
5752         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5753       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5754           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5755         {
5756           Lisp_Object tail, safe_charsets;
5757           int max_charset_id = 0;
5758
5759           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5760                tail = XCDR (tail))
5761             if (max_charset_id < XFASTINT (XCAR (tail)))
5762               max_charset_id = XFASTINT (XCAR (tail));
5763           safe_charsets = make_uninit_string (max_charset_id + 1);
5764           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5765           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5766                tail = XCDR (tail))
5767             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5768           coding->max_charset_id = max_charset_id;
5769           coding->safe_charsets = SDATA (safe_charsets);
5770         }
5771       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5772       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5773     }
5774   else if (EQ (coding_type, Qshift_jis))
5775     {
5776       coding->detector = detect_coding_sjis;
5777       coding->decoder = decode_coding_sjis;
5778       coding->encoder = encode_coding_sjis;
5779       coding->common_flags
5780         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5781     }
5782   else if (EQ (coding_type, Qbig5))
5783     {
5784       coding->detector = detect_coding_big5;
5785       coding->decoder = decode_coding_big5;
5786       coding->encoder = encode_coding_big5;
5787       coding->common_flags
5788         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5789     }
5790   else                          /* EQ (coding_type, Qraw_text) */
5791     {
5792       coding->detector = NULL;
5793       coding->decoder = decode_coding_raw_text;
5794       coding->encoder = encode_coding_raw_text;
5795       if (! EQ (eol_type, Qunix))
5796         {
5797           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5798           if (! VECTORP (eol_type))
5799             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5800         }
5801
5802     }
5803
5804   return;
5805 }
5806
5807 /* Return a list of charsets supported by CODING.  */
5808
5809 Lisp_Object
5810 coding_charset_list (struct coding_system *coding)
5811 {
5812   Lisp_Object attrs, charset_list;
5813
5814   CODING_GET_INFO (coding, attrs, charset_list);
5815   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5816     {
5817       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5818
5819       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5820         charset_list = Viso_2022_charset_list;
5821     }
5822   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5823     {
5824       charset_list = Vemacs_mule_charset_list;
5825     }
5826   return charset_list;
5827 }
5828
5829
5830 /* Return a list of charsets supported by CODING-SYSTEM.  */
5831
5832 Lisp_Object
5833 coding_system_charset_list (Lisp_Object coding_system)
5834 {
5835   ptrdiff_t id;
5836   Lisp_Object attrs, charset_list;
5837
5838   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5839   attrs = CODING_ID_ATTRS (id);
5840
5841   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5842     {
5843       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5844
5845       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5846         charset_list = Viso_2022_charset_list;
5847       else
5848         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5849     }
5850   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5851     {
5852       charset_list = Vemacs_mule_charset_list;
5853     }
5854   else
5855     {
5856       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5857     }
5858   return charset_list;
5859 }
5860
5861
5862 /* Return raw-text or one of its subsidiaries that has the same
5863    eol_type as CODING-SYSTEM.  */
5864
5865 Lisp_Object
5866 raw_text_coding_system (Lisp_Object coding_system)
5867 {
5868   Lisp_Object spec, attrs;
5869   Lisp_Object eol_type, raw_text_eol_type;
5870
5871   if (NILP (coding_system))
5872     return Qraw_text;
5873   spec = CODING_SYSTEM_SPEC (coding_system);
5874   attrs = AREF (spec, 0);
5875
5876   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5877     return coding_system;
5878
5879   eol_type = AREF (spec, 2);
5880   if (VECTORP (eol_type))
5881     return Qraw_text;
5882   spec = CODING_SYSTEM_SPEC (Qraw_text);
5883   raw_text_eol_type = AREF (spec, 2);
5884   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5885           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5886           : AREF (raw_text_eol_type, 2));
5887 }
5888
5889
5890 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5891    the subsidiary that has the same eol-spec as PARENT (if it is not
5892    nil and specifies end-of-line format) or the system's setting
5893    (system_eol_type).  */
5894
5895 Lisp_Object
5896 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5897 {
5898   Lisp_Object spec, eol_type;
5899
5900   if (NILP (coding_system))
5901     coding_system = Qraw_text;
5902   spec = CODING_SYSTEM_SPEC (coding_system);
5903   eol_type = AREF (spec, 2);
5904   if (VECTORP (eol_type))
5905     {
5906       Lisp_Object parent_eol_type;
5907
5908       if (! NILP (parent))
5909         {
5910           Lisp_Object parent_spec;
5911
5912           parent_spec = CODING_SYSTEM_SPEC (parent);
5913           parent_eol_type = AREF (parent_spec, 2);
5914           if (VECTORP (parent_eol_type))
5915             parent_eol_type = system_eol_type;
5916         }
5917       else
5918         parent_eol_type = system_eol_type;
5919       if (EQ (parent_eol_type, Qunix))
5920         coding_system = AREF (eol_type, 0);
5921       else if (EQ (parent_eol_type, Qdos))
5922         coding_system = AREF (eol_type, 1);
5923       else if (EQ (parent_eol_type, Qmac))
5924         coding_system = AREF (eol_type, 2);
5925     }
5926   return coding_system;
5927 }
5928
5929
5930 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5931    decided for writing to a process.  If not, complement them, and
5932    return a new coding system.  */
5933
5934 Lisp_Object
5935 complement_process_encoding_system (Lisp_Object coding_system)
5936 {
5937   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5938   Lisp_Object spec, attrs;
5939   int i;
5940
5941   for (i = 0; i < 3; i++)
5942     {
5943       if (i == 1)
5944         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5945       else if (i == 2)
5946         coding_system = preferred_coding_system ();
5947       spec = CODING_SYSTEM_SPEC (coding_system);
5948       if (NILP (spec))
5949         continue;
5950       attrs = AREF (spec, 0);
5951       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5952         coding_base = CODING_ATTR_BASE_NAME (attrs);
5953       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5954         eol_base = coding_system;
5955       if (! NILP (coding_base) && ! NILP (eol_base))
5956         break;
5957     }
5958
5959   if (i > 0)
5960     /* The original CODING_SYSTEM didn't specify text-conversion or
5961        eol-conversion.  Be sure that we return a fully complemented
5962        coding system.  */
5963     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5964   return coding_system;
5965 }
5966
5967
5968 /* Emacs has a mechanism to automatically detect a coding system if it
5969    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5970    it's impossible to distinguish some coding systems accurately
5971    because they use the same range of codes.  So, at first, coding
5972    systems are categorized into 7, those are:
5973
5974    o coding-category-emacs-mule
5975
5976         The category for a coding system which has the same code range
5977         as Emacs' internal format.  Assigned the coding-system (Lisp
5978         symbol) `emacs-mule' by default.
5979
5980    o coding-category-sjis
5981
5982         The category for a coding system which has the same code range
5983         as SJIS.  Assigned the coding-system (Lisp
5984         symbol) `japanese-shift-jis' by default.
5985
5986    o coding-category-iso-7
5987
5988         The category for a coding system which has the same code range
5989         as ISO2022 of 7-bit environment.  This doesn't use any locking
5990         shift and single shift functions.  This can encode/decode all
5991         charsets.  Assigned the coding-system (Lisp symbol)
5992         `iso-2022-7bit' by default.
5993
5994    o coding-category-iso-7-tight
5995
5996         Same as coding-category-iso-7 except that this can
5997         encode/decode only the specified charsets.
5998
5999    o coding-category-iso-8-1
6000
6001         The category for a coding system which has the same code range
6002         as ISO2022 of 8-bit environment and graphic plane 1 used only
6003         for DIMENSION1 charset.  This doesn't use any locking shift
6004         and single shift functions.  Assigned the coding-system (Lisp
6005         symbol) `iso-latin-1' by default.
6006
6007    o coding-category-iso-8-2
6008
6009         The category for a coding system which has the same code range
6010         as ISO2022 of 8-bit environment and graphic plane 1 used only
6011         for DIMENSION2 charset.  This doesn't use any locking shift
6012         and single shift functions.  Assigned the coding-system (Lisp
6013         symbol) `japanese-iso-8bit' by default.
6014
6015    o coding-category-iso-7-else
6016
6017         The category for a coding system which has the same code range
6018         as ISO2022 of 7-bit environment but uses locking shift or
6019         single shift functions.  Assigned the coding-system (Lisp
6020         symbol) `iso-2022-7bit-lock' by default.
6021
6022    o coding-category-iso-8-else
6023
6024         The category for a coding system which has the same code range
6025         as ISO2022 of 8-bit environment but uses locking shift or
6026         single shift functions.  Assigned the coding-system (Lisp
6027         symbol) `iso-2022-8bit-ss2' by default.
6028
6029    o coding-category-big5
6030
6031         The category for a coding system which has the same code range
6032         as BIG5.  Assigned the coding-system (Lisp symbol)
6033         `cn-big5' by default.
6034
6035    o coding-category-utf-8
6036
6037         The category for a coding system which has the same code range
6038         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6039         symbol) `utf-8' by default.
6040
6041    o coding-category-utf-16-be
6042
6043         The category for a coding system in which a text has an
6044         Unicode signature (cf. Unicode Standard) in the order of BIG
6045         endian at the head.  Assigned the coding-system (Lisp symbol)
6046         `utf-16-be' by default.
6047
6048    o coding-category-utf-16-le
6049
6050         The category for a coding system in which a text has an
6051         Unicode signature (cf. Unicode Standard) in the order of
6052         LITTLE endian at the head.  Assigned the coding-system (Lisp
6053         symbol) `utf-16-le' by default.
6054
6055    o coding-category-ccl
6056
6057         The category for a coding system of which encoder/decoder is
6058         written in CCL programs.  The default value is nil, i.e., no
6059         coding system is assigned.
6060
6061    o coding-category-binary
6062
6063         The category for a coding system not categorized in any of the
6064         above.  Assigned the coding-system (Lisp symbol)
6065         `no-conversion' by default.
6066
6067    Each of them is a Lisp symbol and the value is an actual
6068    `coding-system's (this is also a Lisp symbol) assigned by a user.
6069    What Emacs does actually is to detect a category of coding system.
6070    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6071    decide only one possible category, it selects a category of the
6072    highest priority.  Priorities of categories are also specified by a
6073    user in a Lisp variable `coding-category-list'.
6074
6075 */
6076
6077 #define EOL_SEEN_NONE   0
6078 #define EOL_SEEN_LF     1
6079 #define EOL_SEEN_CR     2
6080 #define EOL_SEEN_CRLF   4
6081
6082 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6083    SOURCE is encoded.  If CATEGORY is one of
6084    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6085    two-byte, else they are encoded by one-byte.
6086
6087    Return one of EOL_SEEN_XXX.  */
6088
6089 #define MAX_EOL_CHECK_COUNT 3
6090
6091 static int
6092 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6093             enum coding_category category)
6094 {
6095   const unsigned char *src = source, *src_end = src + src_bytes;
6096   unsigned char c;
6097   int total  = 0;
6098   int eol_seen = EOL_SEEN_NONE;
6099
6100   if ((1 << category) & CATEGORY_MASK_UTF_16)
6101     {
6102       bool msb = category == (coding_category_utf_16_le
6103                               | coding_category_utf_16_le_nosig);
6104       bool lsb = !msb;
6105
6106       while (src + 1 < src_end)
6107         {
6108           c = src[lsb];
6109           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6110             {
6111               int this_eol;
6112
6113               if (c == '\n')
6114                 this_eol = EOL_SEEN_LF;
6115               else if (src + 3 >= src_end
6116                        || src[msb + 2] != 0
6117                        || src[lsb + 2] != '\n')
6118                 this_eol = EOL_SEEN_CR;
6119               else
6120                 {
6121                   this_eol = EOL_SEEN_CRLF;
6122                   src += 2;
6123                 }
6124
6125               if (eol_seen == EOL_SEEN_NONE)
6126                 /* This is the first end-of-line.  */
6127                 eol_seen = this_eol;
6128               else if (eol_seen != this_eol)
6129                 {
6130                   /* The found type is different from what found before.
6131                      Allow for stray ^M characters in DOS EOL files.  */
6132                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6133                       || (eol_seen == EOL_SEEN_CRLF
6134                           && this_eol == EOL_SEEN_CR))
6135                     eol_seen = EOL_SEEN_CRLF;
6136                   else
6137                     {
6138                       eol_seen = EOL_SEEN_LF;
6139                       break;
6140                     }
6141                 }
6142               if (++total == MAX_EOL_CHECK_COUNT)
6143                 break;
6144             }
6145           src += 2;
6146         }
6147     }
6148   else
6149     while (src < src_end)
6150       {
6151         c = *src++;
6152         if (c == '\n' || c == '\r')
6153           {
6154             int this_eol;
6155
6156             if (c == '\n')
6157               this_eol = EOL_SEEN_LF;
6158             else if (src >= src_end || *src != '\n')
6159               this_eol = EOL_SEEN_CR;
6160             else
6161               this_eol = EOL_SEEN_CRLF, src++;
6162
6163             if (eol_seen == EOL_SEEN_NONE)
6164               /* This is the first end-of-line.  */
6165               eol_seen = this_eol;
6166             else if (eol_seen != this_eol)
6167               {
6168                 /* The found type is different from what found before.
6169                    Allow for stray ^M characters in DOS EOL files.  */
6170                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6171                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6172                   eol_seen = EOL_SEEN_CRLF;
6173                 else
6174                   {
6175                     eol_seen = EOL_SEEN_LF;
6176                     break;
6177                   }
6178               }
6179             if (++total == MAX_EOL_CHECK_COUNT)
6180               break;
6181           }
6182       }
6183   return eol_seen;
6184 }
6185
6186
6187 static Lisp_Object
6188 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6189 {
6190   Lisp_Object eol_type;
6191
6192   eol_type = CODING_ID_EOL_TYPE (coding->id);
6193   if (eol_seen & EOL_SEEN_LF)
6194     {
6195       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6196       eol_type = Qunix;
6197     }
6198   else if (eol_seen & EOL_SEEN_CRLF)
6199     {
6200       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6201       eol_type = Qdos;
6202     }
6203   else if (eol_seen & EOL_SEEN_CR)
6204     {
6205       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6206       eol_type = Qmac;
6207     }
6208   return eol_type;
6209 }
6210
6211 /* Detect how a text specified in CODING is encoded.  If a coding
6212    system is detected, update fields of CODING by the detected coding
6213    system.  */
6214
6215 static void
6216 detect_coding (struct coding_system *coding)
6217 {
6218   const unsigned char *src, *src_end;
6219   unsigned int saved_mode = coding->mode;
6220
6221   coding->consumed = coding->consumed_char = 0;
6222   coding->produced = coding->produced_char = 0;
6223   coding_set_source (coding);
6224
6225   src_end = coding->source + coding->src_bytes;
6226   coding->head_ascii = 0;
6227
6228   /* If we have not yet decided the text encoding type, detect it
6229      now.  */
6230   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6231     {
6232       int c, i;
6233       struct coding_detection_info detect_info;
6234       bool null_byte_found = 0, eight_bit_found = 0;
6235
6236       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6237       for (src = coding->source; src < src_end; src++)
6238         {
6239           c = *src;
6240           if (c & 0x80)
6241             {
6242               eight_bit_found = 1;
6243               if (null_byte_found)
6244                 break;
6245             }
6246           else if (c < 0x20)
6247             {
6248               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6249                   && ! inhibit_iso_escape_detection
6250                   && ! detect_info.checked)
6251                 {
6252                   if (detect_coding_iso_2022 (coding, &detect_info))
6253                     {
6254                       /* We have scanned the whole data.  */
6255                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6256                         {
6257                           /* We didn't find an 8-bit code.  We may
6258                              have found a null-byte, but it's very
6259                              rare that a binary file conforms to
6260                              ISO-2022.  */
6261                           src = src_end;
6262                           coding->head_ascii = src - coding->source;
6263                         }
6264                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6265                       break;
6266                     }
6267                 }
6268               else if (! c && !inhibit_null_byte_detection)
6269                 {
6270                   null_byte_found = 1;
6271                   if (eight_bit_found)
6272                     break;
6273                 }
6274               if (! eight_bit_found)
6275                 coding->head_ascii++;
6276             }
6277           else if (! eight_bit_found)
6278             coding->head_ascii++;
6279         }
6280
6281       if (null_byte_found || eight_bit_found
6282           || coding->head_ascii < coding->src_bytes
6283           || detect_info.found)
6284         {
6285           enum coding_category category;
6286           struct coding_system *this;
6287
6288           if (coding->head_ascii == coding->src_bytes)
6289             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6290             for (i = 0; i < coding_category_raw_text; i++)
6291               {
6292                 category = coding_priorities[i];
6293                 this = coding_categories + category;
6294                 if (detect_info.found & (1 << category))
6295                   break;
6296               }
6297           else
6298             {
6299               if (null_byte_found)
6300                 {
6301                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6302                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6303                 }
6304               for (i = 0; i < coding_category_raw_text; i++)
6305                 {
6306                   category = coding_priorities[i];
6307                   this = coding_categories + category;
6308                   /* Some of this->detector (e.g. detect_coding_sjis)
6309                      require this information.  */
6310                   coding->id = this->id;
6311                   if (this->id < 0)
6312                     {
6313                       /* No coding system of this category is defined.  */
6314                       detect_info.rejected |= (1 << category);
6315                     }
6316                   else if (category >= coding_category_raw_text)
6317                     continue;
6318                   else if (detect_info.checked & (1 << category))
6319                     {
6320                       if (detect_info.found & (1 << category))
6321                         break;
6322                     }
6323                   else if ((*(this->detector)) (coding, &detect_info)
6324                            && detect_info.found & (1 << category))
6325                     {
6326                       if (category == coding_category_utf_16_auto)
6327                         {
6328                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6329                             category = coding_category_utf_16_le;
6330                           else
6331                             category = coding_category_utf_16_be;
6332                         }
6333                       break;
6334                     }
6335                 }
6336             }
6337
6338           if (i < coding_category_raw_text)
6339             setup_coding_system (CODING_ID_NAME (this->id), coding);
6340           else if (null_byte_found)
6341             setup_coding_system (Qno_conversion, coding);
6342           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6343                    == CATEGORY_MASK_ANY)
6344             setup_coding_system (Qraw_text, coding);
6345           else if (detect_info.rejected)
6346             for (i = 0; i < coding_category_raw_text; i++)
6347               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6348                 {
6349                   this = coding_categories + coding_priorities[i];
6350                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6351                   break;
6352                 }
6353         }
6354     }
6355   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6356            == coding_category_utf_8_auto)
6357     {
6358       Lisp_Object coding_systems;
6359       struct coding_detection_info detect_info;
6360
6361       coding_systems
6362         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6363       detect_info.found = detect_info.rejected = 0;
6364       coding->head_ascii = 0;
6365       if (CONSP (coding_systems)
6366           && detect_coding_utf_8 (coding, &detect_info))
6367         {
6368           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6369             setup_coding_system (XCAR (coding_systems), coding);
6370           else
6371             setup_coding_system (XCDR (coding_systems), coding);
6372         }
6373     }
6374   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6375            == coding_category_utf_16_auto)
6376     {
6377       Lisp_Object coding_systems;
6378       struct coding_detection_info detect_info;
6379
6380       coding_systems
6381         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6382       detect_info.found = detect_info.rejected = 0;
6383       coding->head_ascii = 0;
6384       if (CONSP (coding_systems)
6385           && detect_coding_utf_16 (coding, &detect_info))
6386         {
6387           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6388             setup_coding_system (XCAR (coding_systems), coding);
6389           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6390             setup_coding_system (XCDR (coding_systems), coding);
6391         }
6392     }
6393   coding->mode = saved_mode;
6394 }
6395
6396
6397 static void
6398 decode_eol (struct coding_system *coding)
6399 {
6400   Lisp_Object eol_type;
6401   unsigned char *p, *pbeg, *pend;
6402
6403   eol_type = CODING_ID_EOL_TYPE (coding->id);
6404   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6405     return;
6406
6407   if (NILP (coding->dst_object))
6408     pbeg = coding->destination;
6409   else
6410     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6411   pend = pbeg + coding->produced;
6412
6413   if (VECTORP (eol_type))
6414     {
6415       int eol_seen = EOL_SEEN_NONE;
6416
6417       for (p = pbeg; p < pend; p++)
6418         {
6419           if (*p == '\n')
6420             eol_seen |= EOL_SEEN_LF;
6421           else if (*p == '\r')
6422             {
6423               if (p + 1 < pend && *(p + 1) == '\n')
6424                 {
6425                   eol_seen |= EOL_SEEN_CRLF;
6426                   p++;
6427                 }
6428               else
6429                 eol_seen |= EOL_SEEN_CR;
6430             }
6431         }
6432       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6433       if ((eol_seen & EOL_SEEN_CRLF) != 0
6434           && (eol_seen & EOL_SEEN_CR) != 0
6435           && (eol_seen & EOL_SEEN_LF) == 0)
6436         eol_seen = EOL_SEEN_CRLF;
6437       else if (eol_seen != EOL_SEEN_NONE
6438           && eol_seen != EOL_SEEN_LF
6439           && eol_seen != EOL_SEEN_CRLF
6440           && eol_seen != EOL_SEEN_CR)
6441         eol_seen = EOL_SEEN_LF;
6442       if (eol_seen != EOL_SEEN_NONE)
6443         eol_type = adjust_coding_eol_type (coding, eol_seen);
6444     }
6445
6446   if (EQ (eol_type, Qmac))
6447     {
6448       for (p = pbeg; p < pend; p++)
6449         if (*p == '\r')
6450           *p = '\n';
6451     }
6452   else if (EQ (eol_type, Qdos))
6453     {
6454       ptrdiff_t n = 0;
6455
6456       if (NILP (coding->dst_object))
6457         {
6458           /* Start deleting '\r' from the tail to minimize the memory
6459              movement.  */
6460           for (p = pend - 2; p >= pbeg; p--)
6461             if (*p == '\r')
6462               {
6463                 memmove (p, p + 1, pend-- - p - 1);
6464                 n++;
6465               }
6466         }
6467       else
6468         {
6469           ptrdiff_t pos_byte = coding->dst_pos_byte;
6470           ptrdiff_t pos = coding->dst_pos;
6471           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6472
6473           while (pos < pos_end)
6474             {
6475               p = BYTE_POS_ADDR (pos_byte);
6476               if (*p == '\r' && p[1] == '\n')
6477                 {
6478                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6479                   n++;
6480                   pos_end--;
6481                 }
6482               pos++;
6483               if (coding->dst_multibyte)
6484                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6485               else
6486                 pos_byte++;
6487             }
6488         }
6489       coding->produced -= n;
6490       coding->produced_char -= n;
6491     }
6492 }
6493
6494
6495 /* Return a translation table (or list of them) from coding system
6496    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6497    not ENCODEP). */
6498
6499 static Lisp_Object
6500 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6501 {
6502   Lisp_Object standard, translation_table;
6503   Lisp_Object val;
6504
6505   if (NILP (Venable_character_translation))
6506     {
6507       if (max_lookup)
6508         *max_lookup = 0;
6509       return Qnil;
6510     }
6511   if (encodep)
6512     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6513       standard = Vstandard_translation_table_for_encode;
6514   else
6515     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6516       standard = Vstandard_translation_table_for_decode;
6517   if (NILP (translation_table))
6518     translation_table = standard;
6519   else
6520     {
6521       if (SYMBOLP (translation_table))
6522         translation_table = Fget (translation_table, Qtranslation_table);
6523       else if (CONSP (translation_table))
6524         {
6525           translation_table = Fcopy_sequence (translation_table);
6526           for (val = translation_table; CONSP (val); val = XCDR (val))
6527             if (SYMBOLP (XCAR (val)))
6528               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6529         }
6530       if (CHAR_TABLE_P (standard))
6531         {
6532           if (CONSP (translation_table))
6533             translation_table = nconc2 (translation_table,
6534                                         Fcons (standard, Qnil));
6535           else
6536             translation_table = Fcons (translation_table,
6537                                        Fcons (standard, Qnil));
6538         }
6539     }
6540
6541   if (max_lookup)
6542     {
6543       *max_lookup = 1;
6544       if (CHAR_TABLE_P (translation_table)
6545           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6546         {
6547           val = XCHAR_TABLE (translation_table)->extras[1];
6548           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6549             *max_lookup = XFASTINT (val);
6550         }
6551       else if (CONSP (translation_table))
6552         {
6553           Lisp_Object tail;
6554
6555           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6556             if (CHAR_TABLE_P (XCAR (tail))
6557                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6558               {
6559                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6560                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6561                   *max_lookup = XFASTINT (tailval);
6562               }
6563         }
6564     }
6565   return translation_table;
6566 }
6567
6568 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6569   do {                                                          \
6570     trans = Qnil;                                               \
6571     if (CHAR_TABLE_P (table))                                   \
6572       {                                                         \
6573         trans = CHAR_TABLE_REF (table, c);                      \
6574         if (CHARACTERP (trans))                                 \
6575           c = XFASTINT (trans), trans = Qnil;                   \
6576       }                                                         \
6577     else if (CONSP (table))                                     \
6578       {                                                         \
6579         Lisp_Object tail;                                       \
6580                                                                 \
6581         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6582           if (CHAR_TABLE_P (XCAR (tail)))                       \
6583             {                                                   \
6584               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6585               if (CHARACTERP (trans))                           \
6586                 c = XFASTINT (trans), trans = Qnil;             \
6587               else if (! NILP (trans))                          \
6588                 break;                                          \
6589             }                                                   \
6590       }                                                         \
6591   } while (0)
6592
6593
6594 /* Return a translation of character(s) at BUF according to TRANS.
6595    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6596    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6597    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6598    translation is found, and Qnil if not found..
6599    If BUF is too short to lookup characters in FROM, return Qt.  */
6600
6601 static Lisp_Object
6602 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6603 {
6604
6605   if (INTEGERP (trans))
6606     return trans;
6607   for (; CONSP (trans); trans = XCDR (trans))
6608     {
6609       Lisp_Object val = XCAR (trans);
6610       Lisp_Object from = XCAR (val);
6611       ptrdiff_t len = ASIZE (from);
6612       ptrdiff_t i;
6613
6614       for (i = 0; i < len; i++)
6615         {
6616           if (buf + i == buf_end)
6617             return Qt;
6618           if (XINT (AREF (from, i)) != buf[i])
6619             break;
6620         }
6621       if (i == len)
6622         return val;
6623     }
6624   return Qnil;
6625 }
6626
6627
6628 static int
6629 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6630                bool last_block)
6631 {
6632   unsigned char *dst = coding->destination + coding->produced;
6633   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6634   ptrdiff_t produced;
6635   ptrdiff_t produced_chars = 0;
6636   int carryover = 0;
6637
6638   if (! coding->chars_at_source)
6639     {
6640       /* Source characters are in coding->charbuf.  */
6641       int *buf = coding->charbuf;
6642       int *buf_end = buf + coding->charbuf_used;
6643
6644       if (EQ (coding->src_object, coding->dst_object))
6645         {
6646           coding_set_source (coding);
6647           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6648         }
6649
6650       while (buf < buf_end)
6651         {
6652           int c = *buf;
6653           ptrdiff_t i;
6654
6655           if (c >= 0)
6656             {
6657               ptrdiff_t from_nchars = 1, to_nchars = 1;
6658               Lisp_Object trans = Qnil;
6659
6660               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6661               if (! NILP (trans))
6662                 {
6663                   trans = get_translation (trans, buf, buf_end);
6664                   if (INTEGERP (trans))
6665                     c = XINT (trans);
6666                   else if (CONSP (trans))
6667                     {
6668                       from_nchars = ASIZE (XCAR (trans));
6669                       trans = XCDR (trans);
6670                       if (INTEGERP (trans))
6671                         c = XINT (trans);
6672                       else
6673                         {
6674                           to_nchars = ASIZE (trans);
6675                           c = XINT (AREF (trans, 0));
6676                         }
6677                     }
6678                   else if (EQ (trans, Qt) && ! last_block)
6679                     break;
6680                 }
6681
6682               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6683                 {
6684                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6685                        / MAX_MULTIBYTE_LENGTH)
6686                       < to_nchars)
6687                     memory_full (SIZE_MAX);
6688                   dst = alloc_destination (coding,
6689                                            buf_end - buf
6690                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6691                                            dst);
6692                   if (EQ (coding->src_object, coding->dst_object))
6693                     {
6694                       coding_set_source (coding);
6695                       dst_end = (((unsigned char *) coding->source)
6696                                  + coding->consumed);
6697                     }
6698                   else
6699                     dst_end = coding->destination + coding->dst_bytes;
6700                 }
6701
6702               for (i = 0; i < to_nchars; i++)
6703                 {
6704                   if (i > 0)
6705                     c = XINT (AREF (trans, i));
6706                   if (coding->dst_multibyte
6707                       || ! CHAR_BYTE8_P (c))
6708                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6709                   else
6710                     *dst++ = CHAR_TO_BYTE8 (c);
6711                 }
6712               produced_chars += to_nchars;
6713               buf += from_nchars;
6714             }
6715           else
6716             /* This is an annotation datum.  (-C) is the length.  */
6717             buf += -c;
6718         }
6719       carryover = buf_end - buf;
6720     }
6721   else
6722     {
6723       /* Source characters are at coding->source.  */
6724       const unsigned char *src = coding->source;
6725       const unsigned char *src_end = src + coding->consumed;
6726
6727       if (EQ (coding->dst_object, coding->src_object))
6728         dst_end = (unsigned char *) src;
6729       if (coding->src_multibyte != coding->dst_multibyte)
6730         {
6731           if (coding->src_multibyte)
6732             {
6733               bool multibytep = 1;
6734               ptrdiff_t consumed_chars = 0;
6735
6736               while (1)
6737                 {
6738                   const unsigned char *src_base = src;
6739                   int c;
6740
6741                   ONE_MORE_BYTE (c);
6742                   if (dst == dst_end)
6743                     {
6744                       if (EQ (coding->src_object, coding->dst_object))
6745                         dst_end = (unsigned char *) src;
6746                       if (dst == dst_end)
6747                         {
6748                           ptrdiff_t offset = src - coding->source;
6749
6750                           dst = alloc_destination (coding, src_end - src + 1,
6751                                                    dst);
6752                           dst_end = coding->destination + coding->dst_bytes;
6753                           coding_set_source (coding);
6754                           src = coding->source + offset;
6755                           src_end = coding->source + coding->consumed;
6756                           if (EQ (coding->src_object, coding->dst_object))
6757                             dst_end = (unsigned char *) src;
6758                         }
6759                     }
6760                   *dst++ = c;
6761                   produced_chars++;
6762                 }
6763             no_more_source:
6764               ;
6765             }
6766           else
6767             while (src < src_end)
6768               {
6769                 bool multibytep = 1;
6770                 int c = *src++;
6771
6772                 if (dst >= dst_end - 1)
6773                   {
6774                     if (EQ (coding->src_object, coding->dst_object))
6775                       dst_end = (unsigned char *) src;
6776                     if (dst >= dst_end - 1)
6777                       {
6778                         ptrdiff_t offset = src - coding->source;
6779                         ptrdiff_t more_bytes;
6780
6781                         if (EQ (coding->src_object, coding->dst_object))
6782                           more_bytes = ((src_end - src) / 2) + 2;
6783                         else
6784                           more_bytes = src_end - src + 2;
6785                         dst = alloc_destination (coding, more_bytes, dst);
6786                         dst_end = coding->destination + coding->dst_bytes;
6787                         coding_set_source (coding);
6788                         src = coding->source + offset;
6789                         src_end = coding->source + coding->consumed;
6790                         if (EQ (coding->src_object, coding->dst_object))
6791                           dst_end = (unsigned char *) src;
6792                       }
6793                   }
6794                 EMIT_ONE_BYTE (c);
6795               }
6796         }
6797       else
6798         {
6799           if (!EQ (coding->src_object, coding->dst_object))
6800             {
6801               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6802
6803               if (require > 0)
6804                 {
6805                   ptrdiff_t offset = src - coding->source;
6806
6807                   dst = alloc_destination (coding, require, dst);
6808                   coding_set_source (coding);
6809                   src = coding->source + offset;
6810                   src_end = coding->source + coding->consumed;
6811                 }
6812             }
6813           produced_chars = coding->consumed_char;
6814           while (src < src_end)
6815             *dst++ = *src++;
6816         }
6817     }
6818
6819   produced = dst - (coding->destination + coding->produced);
6820   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6821     insert_from_gap (produced_chars, produced);
6822   coding->produced += produced;
6823   coding->produced_char += produced_chars;
6824   return carryover;
6825 }
6826
6827 /* Compose text in CODING->object according to the annotation data at
6828    CHARBUF.  CHARBUF is an array:
6829      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6830  */
6831
6832 static void
6833 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6834 {
6835   int len;
6836   ptrdiff_t to;
6837   enum composition_method method;
6838   Lisp_Object components;
6839
6840   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6841   to = pos + charbuf[2];
6842   method = (enum composition_method) (charbuf[4]);
6843
6844   if (method == COMPOSITION_RELATIVE)
6845     components = Qnil;
6846   else
6847     {
6848       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6849       int i, j;
6850
6851       if (method == COMPOSITION_WITH_RULE)
6852         len = charbuf[2] * 3 - 2;
6853       charbuf += MAX_ANNOTATION_LENGTH;
6854       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6855       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6856         {
6857           if (charbuf[i] >= 0)
6858             args[j] = make_number (charbuf[i]);
6859           else
6860             {
6861               i++;
6862               args[j] = make_number (charbuf[i] % 0x100);
6863             }
6864         }
6865       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6866     }
6867   compose_text (pos, to, components, Qnil, coding->dst_object);
6868 }
6869
6870
6871 /* Put `charset' property on text in CODING->object according to
6872    the annotation data at CHARBUF.  CHARBUF is an array:
6873      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6874  */
6875
6876 static void
6877 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6878 {
6879   ptrdiff_t from = pos - charbuf[2];
6880   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6881
6882   Fput_text_property (make_number (from), make_number (pos),
6883                       Qcharset, CHARSET_NAME (charset),
6884                       coding->dst_object);
6885 }
6886
6887
6888 #define CHARBUF_SIZE 0x4000
6889
6890 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6891   do {                                                                  \
6892     int size = CHARBUF_SIZE;                                            \
6893                                                                         \
6894     coding->charbuf = NULL;                                             \
6895     while (size > 1024)                                                 \
6896       {                                                                 \
6897         coding->charbuf = alloca (sizeof (int) * size);                 \
6898         if (coding->charbuf)                                            \
6899           break;                                                        \
6900         size >>= 1;                                                     \
6901       }                                                                 \
6902     if (! coding->charbuf)                                              \
6903       {                                                                 \
6904         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6905         return;                                                         \
6906       }                                                                 \
6907     coding->charbuf_size = size;                                        \
6908   } while (0)
6909
6910
6911 static void
6912 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6913 {
6914   int *charbuf = coding->charbuf;
6915   int *charbuf_end = charbuf + coding->charbuf_used;
6916
6917   if (NILP (coding->dst_object))
6918     return;
6919
6920   while (charbuf < charbuf_end)
6921     {
6922       if (*charbuf >= 0)
6923         pos++, charbuf++;
6924       else
6925         {
6926           int len = -*charbuf;
6927
6928           if (len > 2)
6929             switch (charbuf[1])
6930               {
6931               case CODING_ANNOTATE_COMPOSITION_MASK:
6932                 produce_composition (coding, charbuf, pos);
6933                 break;
6934               case CODING_ANNOTATE_CHARSET_MASK:
6935                 produce_charset (coding, charbuf, pos);
6936                 break;
6937               }
6938           charbuf += len;
6939         }
6940     }
6941 }
6942
6943 /* Decode the data at CODING->src_object into CODING->dst_object.
6944    CODING->src_object is a buffer, a string, or nil.
6945    CODING->dst_object is a buffer.
6946
6947    If CODING->src_object is a buffer, it must be the current buffer.
6948    In this case, if CODING->src_pos is positive, it is a position of
6949    the source text in the buffer, otherwise, the source text is in the
6950    gap area of the buffer, and CODING->src_pos specifies the offset of
6951    the text from GPT (which must be the same as PT).  If this is the
6952    same buffer as CODING->dst_object, CODING->src_pos must be
6953    negative.
6954
6955    If CODING->src_object is a string, CODING->src_pos is an index to
6956    that string.
6957
6958    If CODING->src_object is nil, CODING->source must already point to
6959    the non-relocatable memory area.  In this case, CODING->src_pos is
6960    an offset from CODING->source.
6961
6962    The decoded data is inserted at the current point of the buffer
6963    CODING->dst_object.
6964 */
6965
6966 static void
6967 decode_coding (struct coding_system *coding)
6968 {
6969   Lisp_Object attrs;
6970   Lisp_Object undo_list;
6971   Lisp_Object translation_table;
6972   struct ccl_spec cclspec;
6973   int carryover;
6974   int i;
6975
6976   if (BUFFERP (coding->src_object)
6977       && coding->src_pos > 0
6978       && coding->src_pos < GPT
6979       && coding->src_pos + coding->src_chars > GPT)
6980     move_gap_both (coding->src_pos, coding->src_pos_byte);
6981
6982   undo_list = Qt;
6983   if (BUFFERP (coding->dst_object))
6984     {
6985       set_buffer_internal (XBUFFER (coding->dst_object));
6986       if (GPT != PT)
6987         move_gap_both (PT, PT_BYTE);
6988
6989       /* We must disable undo_list in order to record the whole insert
6990          transaction via record_insert at the end.  But doing so also
6991          disables the recording of the first change to the undo_list.
6992          Therefore we check for first change here and record it via
6993          record_first_change if needed.  */
6994       if (MODIFF <= SAVE_MODIFF)
6995         record_first_change ();
6996
6997       undo_list = BVAR (current_buffer, undo_list);
6998       bset_undo_list (current_buffer, Qt);
6999     }
7000
7001   coding->consumed = coding->consumed_char = 0;
7002   coding->produced = coding->produced_char = 0;
7003   coding->chars_at_source = 0;
7004   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7005   coding->errors = 0;
7006
7007   ALLOC_CONVERSION_WORK_AREA (coding);
7008
7009   attrs = CODING_ID_ATTRS (coding->id);
7010   translation_table = get_translation_table (attrs, 0, NULL);
7011
7012   carryover = 0;
7013   if (coding->decoder == decode_coding_ccl)
7014     {
7015       coding->spec.ccl = &cclspec;
7016       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7017     }
7018   do
7019     {
7020       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7021
7022       coding_set_source (coding);
7023       coding->annotated = 0;
7024       coding->charbuf_used = carryover;
7025       (*(coding->decoder)) (coding);
7026       coding_set_destination (coding);
7027       carryover = produce_chars (coding, translation_table, 0);
7028       if (coding->annotated)
7029         produce_annotation (coding, pos);
7030       for (i = 0; i < carryover; i++)
7031         coding->charbuf[i]
7032           = coding->charbuf[coding->charbuf_used - carryover + i];
7033     }
7034   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7035          || (coding->consumed < coding->src_bytes
7036              && (coding->result == CODING_RESULT_SUCCESS
7037                  || coding->result == CODING_RESULT_INVALID_SRC)));
7038
7039   if (carryover > 0)
7040     {
7041       coding_set_destination (coding);
7042       coding->charbuf_used = carryover;
7043       produce_chars (coding, translation_table, 1);
7044     }
7045
7046   coding->carryover_bytes = 0;
7047   if (coding->consumed < coding->src_bytes)
7048     {
7049       int nbytes = coding->src_bytes - coding->consumed;
7050       const unsigned char *src;
7051
7052       coding_set_source (coding);
7053       coding_set_destination (coding);
7054       src = coding->source + coding->consumed;
7055
7056       if (coding->mode & CODING_MODE_LAST_BLOCK)
7057         {
7058           /* Flush out unprocessed data as binary chars.  We are sure
7059              that the number of data is less than the size of
7060              coding->charbuf.  */
7061           coding->charbuf_used = 0;
7062           coding->chars_at_source = 0;
7063
7064           while (nbytes-- > 0)
7065             {
7066               int c = *src++;
7067
7068               if (c & 0x80)
7069                 c = BYTE8_TO_CHAR (c);
7070               coding->charbuf[coding->charbuf_used++] = c;
7071             }
7072           produce_chars (coding, Qnil, 1);
7073         }
7074       else
7075         {
7076           /* Record unprocessed bytes in coding->carryover.  We are
7077              sure that the number of data is less than the size of
7078              coding->carryover.  */
7079           unsigned char *p = coding->carryover;
7080
7081           if (nbytes > sizeof coding->carryover)
7082             nbytes = sizeof coding->carryover;
7083           coding->carryover_bytes = nbytes;
7084           while (nbytes-- > 0)
7085             *p++ = *src++;
7086         }
7087       coding->consumed = coding->src_bytes;
7088     }
7089
7090   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7091       && !inhibit_eol_conversion)
7092     decode_eol (coding);
7093   if (BUFFERP (coding->dst_object))
7094     {
7095       bset_undo_list (current_buffer, undo_list);
7096       record_insert (coding->dst_pos, coding->produced_char);
7097     }
7098 }
7099
7100
7101 /* Extract an annotation datum from a composition starting at POS and
7102    ending before LIMIT of CODING->src_object (buffer or string), store
7103    the data in BUF, set *STOP to a starting position of the next
7104    composition (if any) or to LIMIT, and return the address of the
7105    next element of BUF.
7106
7107    If such an annotation is not found, set *STOP to a starting
7108    position of a composition after POS (if any) or to LIMIT, and
7109    return BUF.  */
7110
7111 static int *
7112 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7113                                struct coding_system *coding, int *buf,
7114                                ptrdiff_t *stop)
7115 {
7116   ptrdiff_t start, end;
7117   Lisp_Object prop;
7118
7119   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7120       || end > limit)
7121     *stop = limit;
7122   else if (start > pos)
7123     *stop = start;
7124   else
7125     {
7126       if (start == pos)
7127         {
7128           /* We found a composition.  Store the corresponding
7129              annotation data in BUF.  */
7130           int *head = buf;
7131           enum composition_method method = COMPOSITION_METHOD (prop);
7132           int nchars = COMPOSITION_LENGTH (prop);
7133
7134           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7135           if (method != COMPOSITION_RELATIVE)
7136             {
7137               Lisp_Object components;
7138               ptrdiff_t i, len, i_byte;
7139
7140               components = COMPOSITION_COMPONENTS (prop);
7141               if (VECTORP (components))
7142                 {
7143                   len = ASIZE (components);
7144                   for (i = 0; i < len; i++)
7145                     *buf++ = XINT (AREF (components, i));
7146                 }
7147               else if (STRINGP (components))
7148                 {
7149                   len = SCHARS (components);
7150                   i = i_byte = 0;
7151                   while (i < len)
7152                     {
7153                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7154                       buf++;
7155                     }
7156                 }
7157               else if (INTEGERP (components))
7158                 {
7159                   len = 1;
7160                   *buf++ = XINT (components);
7161                 }
7162               else if (CONSP (components))
7163                 {
7164                   for (len = 0; CONSP (components);
7165                        len++, components = XCDR (components))
7166                     *buf++ = XINT (XCAR (components));
7167                 }
7168               else
7169                 emacs_abort ();
7170               *head -= len;
7171             }
7172         }
7173
7174       if (find_composition (end, limit, &start, &end, &prop,
7175                             coding->src_object)
7176           && end <= limit)
7177         *stop = start;
7178       else
7179         *stop = limit;
7180     }
7181   return buf;
7182 }
7183
7184
7185 /* Extract an annotation datum from a text property `charset' at POS of
7186    CODING->src_object (buffer of string), store the data in BUF, set
7187    *STOP to the position where the value of `charset' property changes
7188    (limiting by LIMIT), and return the address of the next element of
7189    BUF.
7190
7191    If the property value is nil, set *STOP to the position where the
7192    property value is non-nil (limiting by LIMIT), and return BUF.  */
7193
7194 static int *
7195 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7196                            struct coding_system *coding, int *buf,
7197                            ptrdiff_t *stop)
7198 {
7199   Lisp_Object val, next;
7200   int id;
7201
7202   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7203   if (! NILP (val) && CHARSETP (val))
7204     id = XINT (CHARSET_SYMBOL_ID (val));
7205   else
7206     id = -1;
7207   ADD_CHARSET_DATA (buf, 0, id);
7208   next = Fnext_single_property_change (make_number (pos), Qcharset,
7209                                        coding->src_object,
7210                                        make_number (limit));
7211   *stop = XINT (next);
7212   return buf;
7213 }
7214
7215
7216 static void
7217 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7218                int max_lookup)
7219 {
7220   int *buf = coding->charbuf;
7221   int *buf_end = coding->charbuf + coding->charbuf_size;
7222   const unsigned char *src = coding->source + coding->consumed;
7223   const unsigned char *src_end = coding->source + coding->src_bytes;
7224   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7225   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7226   bool multibytep = coding->src_multibyte;
7227   Lisp_Object eol_type;
7228   int c;
7229   ptrdiff_t stop, stop_composition, stop_charset;
7230   int *lookup_buf = NULL;
7231
7232   if (! NILP (translation_table))
7233     lookup_buf = alloca (sizeof (int) * max_lookup);
7234
7235   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7236   if (VECTORP (eol_type))
7237     eol_type = Qunix;
7238
7239   /* Note: composition handling is not yet implemented.  */
7240   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7241
7242   if (NILP (coding->src_object))
7243     stop = stop_composition = stop_charset = end_pos;
7244   else
7245     {
7246       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7247         stop = stop_composition = pos;
7248       else
7249         stop = stop_composition = end_pos;
7250       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7251         stop = stop_charset = pos;
7252       else
7253         stop_charset = end_pos;
7254     }
7255
7256   /* Compensate for CRLF and conversion.  */
7257   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7258   while (buf < buf_end)
7259     {
7260       Lisp_Object trans;
7261
7262       if (pos == stop)
7263         {
7264           if (pos == end_pos)
7265             break;
7266           if (pos == stop_composition)
7267             buf = handle_composition_annotation (pos, end_pos, coding,
7268                                                  buf, &stop_composition);
7269           if (pos == stop_charset)
7270             buf = handle_charset_annotation (pos, end_pos, coding,
7271                                              buf, &stop_charset);
7272           stop = (stop_composition < stop_charset
7273                   ? stop_composition : stop_charset);
7274         }
7275
7276       if (! multibytep)
7277         {
7278           int bytes;
7279
7280           if (coding->encoder == encode_coding_raw_text
7281               || coding->encoder == encode_coding_ccl)
7282             c = *src++, pos++;
7283           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7284             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7285           else
7286             c = BYTE8_TO_CHAR (*src), src++, pos++;
7287         }
7288       else
7289         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7290       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7291         c = '\n';
7292       if (! EQ (eol_type, Qunix))
7293         {
7294           if (c == '\n')
7295             {
7296               if (EQ (eol_type, Qdos))
7297                 *buf++ = '\r';
7298               else
7299                 c = '\r';
7300             }
7301         }
7302
7303       trans = Qnil;
7304       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7305       if (NILP (trans))
7306         *buf++ = c;
7307       else
7308         {
7309           ptrdiff_t from_nchars = 1, to_nchars = 1;
7310           int *lookup_buf_end;
7311           const unsigned char *p = src;
7312           int i;
7313
7314           lookup_buf[0] = c;
7315           for (i = 1; i < max_lookup && p < src_end; i++)
7316             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7317           lookup_buf_end = lookup_buf + i;
7318           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7319           if (INTEGERP (trans))
7320             c = XINT (trans);
7321           else if (CONSP (trans))
7322             {
7323               from_nchars = ASIZE (XCAR (trans));
7324               trans = XCDR (trans);
7325               if (INTEGERP (trans))
7326                 c = XINT (trans);
7327               else
7328                 {
7329                   to_nchars = ASIZE (trans);
7330                   if (buf_end - buf < to_nchars)
7331                     break;
7332                   c = XINT (AREF (trans, 0));
7333                 }
7334             }
7335           else
7336             break;
7337           *buf++ = c;
7338           for (i = 1; i < to_nchars; i++)
7339             *buf++ = XINT (AREF (trans, i));
7340           for (i = 1; i < from_nchars; i++, pos++)
7341             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7342         }
7343     }
7344
7345   coding->consumed = src - coding->source;
7346   coding->consumed_char = pos - coding->src_pos;
7347   coding->charbuf_used = buf - coding->charbuf;
7348   coding->chars_at_source = 0;
7349 }
7350
7351
7352 /* Encode the text at CODING->src_object into CODING->dst_object.
7353    CODING->src_object is a buffer or a string.
7354    CODING->dst_object is a buffer or nil.
7355
7356    If CODING->src_object is a buffer, it must be the current buffer.
7357    In this case, if CODING->src_pos is positive, it is a position of
7358    the source text in the buffer, otherwise. the source text is in the
7359    gap area of the buffer, and coding->src_pos specifies the offset of
7360    the text from GPT (which must be the same as PT).  If this is the
7361    same buffer as CODING->dst_object, CODING->src_pos must be
7362    negative and CODING should not have `pre-write-conversion'.
7363
7364    If CODING->src_object is a string, CODING should not have
7365    `pre-write-conversion'.
7366
7367    If CODING->dst_object is a buffer, the encoded data is inserted at
7368    the current point of that buffer.
7369
7370    If CODING->dst_object is nil, the encoded data is placed at the
7371    memory area specified by CODING->destination.  */
7372
7373 static void
7374 encode_coding (struct coding_system *coding)
7375 {
7376   Lisp_Object attrs;
7377   Lisp_Object translation_table;
7378   int max_lookup;
7379   struct ccl_spec cclspec;
7380
7381   attrs = CODING_ID_ATTRS (coding->id);
7382   if (coding->encoder == encode_coding_raw_text)
7383     translation_table = Qnil, max_lookup = 0;
7384   else
7385     translation_table = get_translation_table (attrs, 1, &max_lookup);
7386
7387   if (BUFFERP (coding->dst_object))
7388     {
7389       set_buffer_internal (XBUFFER (coding->dst_object));
7390       coding->dst_multibyte
7391         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7392     }
7393
7394   coding->consumed = coding->consumed_char = 0;
7395   coding->produced = coding->produced_char = 0;
7396   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7397   coding->errors = 0;
7398
7399   ALLOC_CONVERSION_WORK_AREA (coding);
7400
7401   if (coding->encoder == encode_coding_ccl)
7402     {
7403       coding->spec.ccl = &cclspec;
7404       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7405     }
7406   do {
7407     coding_set_source (coding);
7408     consume_chars (coding, translation_table, max_lookup);
7409     coding_set_destination (coding);
7410     (*(coding->encoder)) (coding);
7411   } while (coding->consumed_char < coding->src_chars);
7412
7413   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7414     insert_from_gap (coding->produced_char, coding->produced);
7415 }
7416
7417
7418 /* Name (or base name) of work buffer for code conversion.  */
7419 static Lisp_Object Vcode_conversion_workbuf_name;
7420
7421 /* A working buffer used by the top level conversion.  Once it is
7422    created, it is never destroyed.  It has the name
7423    Vcode_conversion_workbuf_name.  The other working buffers are
7424    destroyed after the use is finished, and their names are modified
7425    versions of Vcode_conversion_workbuf_name.  */
7426 static Lisp_Object Vcode_conversion_reused_workbuf;
7427
7428 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7429 static bool reused_workbuf_in_use;
7430
7431
7432 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7433    multibyteness of returning buffer.  */
7434
7435 static Lisp_Object
7436 make_conversion_work_buffer (bool multibyte)
7437 {
7438   Lisp_Object name, workbuf;
7439   struct buffer *current;
7440
7441   if (reused_workbuf_in_use)
7442     {
7443       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7444       workbuf = Fget_buffer_create (name);
7445     }
7446   else
7447     {
7448       reused_workbuf_in_use = 1;
7449       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7450         Vcode_conversion_reused_workbuf
7451           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7452       workbuf = Vcode_conversion_reused_workbuf;
7453     }
7454   current = current_buffer;
7455   set_buffer_internal (XBUFFER (workbuf));
7456   /* We can't allow modification hooks to run in the work buffer.  For
7457      instance, directory_files_internal assumes that file decoding
7458      doesn't compile new regexps.  */
7459   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7460   Ferase_buffer ();
7461   bset_undo_list (current_buffer, Qt);
7462   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7463   set_buffer_internal (current);
7464   return workbuf;
7465 }
7466
7467
7468 static Lisp_Object
7469 code_conversion_restore (Lisp_Object arg)
7470 {
7471   Lisp_Object current, workbuf;
7472   struct gcpro gcpro1;
7473
7474   GCPRO1 (arg);
7475   current = XCAR (arg);
7476   workbuf = XCDR (arg);
7477   if (! NILP (workbuf))
7478     {
7479       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7480         reused_workbuf_in_use = 0;
7481       else
7482         Fkill_buffer (workbuf);
7483     }
7484   set_buffer_internal (XBUFFER (current));
7485   UNGCPRO;
7486   return Qnil;
7487 }
7488
7489 Lisp_Object
7490 code_conversion_save (bool with_work_buf, bool multibyte)
7491 {
7492   Lisp_Object workbuf = Qnil;
7493
7494   if (with_work_buf)
7495     workbuf = make_conversion_work_buffer (multibyte);
7496   record_unwind_protect (code_conversion_restore,
7497                          Fcons (Fcurrent_buffer (), workbuf));
7498   return workbuf;
7499 }
7500
7501 void
7502 decode_coding_gap (struct coding_system *coding,
7503                    ptrdiff_t chars, ptrdiff_t bytes)
7504 {
7505   ptrdiff_t count = SPECPDL_INDEX ();
7506   Lisp_Object attrs;
7507
7508   code_conversion_save (0, 0);
7509
7510   coding->src_object = Fcurrent_buffer ();
7511   coding->src_chars = chars;
7512   coding->src_bytes = bytes;
7513   coding->src_pos = -chars;
7514   coding->src_pos_byte = -bytes;
7515   coding->src_multibyte = chars < bytes;
7516   coding->dst_object = coding->src_object;
7517   coding->dst_pos = PT;
7518   coding->dst_pos_byte = PT_BYTE;
7519   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7520
7521   if (CODING_REQUIRE_DETECTION (coding))
7522     detect_coding (coding);
7523
7524   coding->mode |= CODING_MODE_LAST_BLOCK;
7525   current_buffer->text->inhibit_shrinking = 1;
7526   decode_coding (coding);
7527   current_buffer->text->inhibit_shrinking = 0;
7528
7529   attrs = CODING_ID_ATTRS (coding->id);
7530   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7531     {
7532       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7533       Lisp_Object val;
7534
7535       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7536       val = call1 (CODING_ATTR_POST_READ (attrs),
7537                    make_number (coding->produced_char));
7538       CHECK_NATNUM (val);
7539       coding->produced_char += Z - prev_Z;
7540       coding->produced += Z_BYTE - prev_Z_BYTE;
7541     }
7542
7543   unbind_to (count, Qnil);
7544 }
7545
7546
7547 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7548    SRC_OBJECT into DST_OBJECT by coding context CODING.
7549
7550    SRC_OBJECT is a buffer, a string, or Qnil.
7551
7552    If it is a buffer, the text is at point of the buffer.  FROM and TO
7553    are positions in the buffer.
7554
7555    If it is a string, the text is at the beginning of the string.
7556    FROM and TO are indices to the string.
7557
7558    If it is nil, the text is at coding->source.  FROM and TO are
7559    indices to coding->source.
7560
7561    DST_OBJECT is a buffer, Qt, or Qnil.
7562
7563    If it is a buffer, the decoded text is inserted at point of the
7564    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7565    is deleted.
7566
7567    If it is Qt, a string is made from the decoded text, and
7568    set in CODING->dst_object.
7569
7570    If it is Qnil, the decoded text is stored at CODING->destination.
7571    The caller must allocate CODING->dst_bytes bytes at
7572    CODING->destination by xmalloc.  If the decoded text is longer than
7573    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7574  */
7575
7576 void
7577 decode_coding_object (struct coding_system *coding,
7578                       Lisp_Object src_object,
7579                       ptrdiff_t from, ptrdiff_t from_byte,
7580                       ptrdiff_t to, ptrdiff_t to_byte,
7581                       Lisp_Object dst_object)
7582 {
7583   ptrdiff_t count = SPECPDL_INDEX ();
7584   unsigned char *destination IF_LINT (= NULL);
7585   ptrdiff_t dst_bytes IF_LINT (= 0);
7586   ptrdiff_t chars = to - from;
7587   ptrdiff_t bytes = to_byte - from_byte;
7588   Lisp_Object attrs;
7589   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7590   bool need_marker_adjustment = 0;
7591   Lisp_Object old_deactivate_mark;
7592
7593   old_deactivate_mark = Vdeactivate_mark;
7594
7595   if (NILP (dst_object))
7596     {
7597       destination = coding->destination;
7598       dst_bytes = coding->dst_bytes;
7599     }
7600
7601   coding->src_object = src_object;
7602   coding->src_chars = chars;
7603   coding->src_bytes = bytes;
7604   coding->src_multibyte = chars < bytes;
7605
7606   if (STRINGP (src_object))
7607     {
7608       coding->src_pos = from;
7609       coding->src_pos_byte = from_byte;
7610     }
7611   else if (BUFFERP (src_object))
7612     {
7613       set_buffer_internal (XBUFFER (src_object));
7614       if (from != GPT)
7615         move_gap_both (from, from_byte);
7616       if (EQ (src_object, dst_object))
7617         {
7618           struct Lisp_Marker *tail;
7619
7620           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7621             {
7622               tail->need_adjustment
7623                 = tail->charpos == (tail->insertion_type ? from : to);
7624               need_marker_adjustment |= tail->need_adjustment;
7625             }
7626           saved_pt = PT, saved_pt_byte = PT_BYTE;
7627           TEMP_SET_PT_BOTH (from, from_byte);
7628           current_buffer->text->inhibit_shrinking = 1;
7629           del_range_both (from, from_byte, to, to_byte, 1);
7630           coding->src_pos = -chars;
7631           coding->src_pos_byte = -bytes;
7632         }
7633       else
7634         {
7635           coding->src_pos = from;
7636           coding->src_pos_byte = from_byte;
7637         }
7638     }
7639
7640   if (CODING_REQUIRE_DETECTION (coding))
7641     detect_coding (coding);
7642   attrs = CODING_ID_ATTRS (coding->id);
7643
7644   if (EQ (dst_object, Qt)
7645       || (! NILP (CODING_ATTR_POST_READ (attrs))
7646           && NILP (dst_object)))
7647     {
7648       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7649       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7650       coding->dst_pos = BEG;
7651       coding->dst_pos_byte = BEG_BYTE;
7652     }
7653   else if (BUFFERP (dst_object))
7654     {
7655       code_conversion_save (0, 0);
7656       coding->dst_object = dst_object;
7657       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7658       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7659       coding->dst_multibyte
7660         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7661     }
7662   else
7663     {
7664       code_conversion_save (0, 0);
7665       coding->dst_object = Qnil;
7666       /* Most callers presume this will return a multibyte result, and they
7667          won't use `binary' or `raw-text' anyway, so let's not worry about
7668          CODING_FOR_UNIBYTE.  */
7669       coding->dst_multibyte = 1;
7670     }
7671
7672   decode_coding (coding);
7673
7674   if (BUFFERP (coding->dst_object))
7675     set_buffer_internal (XBUFFER (coding->dst_object));
7676
7677   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7678     {
7679       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7680       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7681       Lisp_Object val;
7682
7683       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7684       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7685               old_deactivate_mark);
7686       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7687                         make_number (coding->produced_char));
7688       UNGCPRO;
7689       CHECK_NATNUM (val);
7690       coding->produced_char += Z - prev_Z;
7691       coding->produced += Z_BYTE - prev_Z_BYTE;
7692     }
7693
7694   if (EQ (dst_object, Qt))
7695     {
7696       coding->dst_object = Fbuffer_string ();
7697     }
7698   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7699     {
7700       set_buffer_internal (XBUFFER (coding->dst_object));
7701       if (dst_bytes < coding->produced)
7702         {
7703           destination = xrealloc (destination, coding->produced);
7704           if (! destination)
7705             {
7706               record_conversion_result (coding,
7707                                         CODING_RESULT_INSUFFICIENT_MEM);
7708               unbind_to (count, Qnil);
7709               return;
7710             }
7711           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7712             move_gap_both (BEGV, BEGV_BYTE);
7713           memcpy (destination, BEGV_ADDR, coding->produced);
7714           coding->destination = destination;
7715         }
7716     }
7717
7718   if (saved_pt >= 0)
7719     {
7720       /* This is the case of:
7721          (BUFFERP (src_object) && EQ (src_object, dst_object))
7722          As we have moved PT while replacing the original buffer
7723          contents, we must recover it now.  */
7724       set_buffer_internal (XBUFFER (src_object));
7725       current_buffer->text->inhibit_shrinking = 0;
7726       if (saved_pt < from)
7727         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7728       else if (saved_pt < from + chars)
7729         TEMP_SET_PT_BOTH (from, from_byte);
7730       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7731         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7732                           saved_pt_byte + (coding->produced - bytes));
7733       else
7734         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7735                           saved_pt_byte + (coding->produced - bytes));
7736
7737       if (need_marker_adjustment)
7738         {
7739           struct Lisp_Marker *tail;
7740
7741           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7742             if (tail->need_adjustment)
7743               {
7744                 tail->need_adjustment = 0;
7745                 if (tail->insertion_type)
7746                   {
7747                     tail->bytepos = from_byte;
7748                     tail->charpos = from;
7749                   }
7750                 else
7751                   {
7752                     tail->bytepos = from_byte + coding->produced;
7753                     tail->charpos
7754                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7755                          ? tail->bytepos : from + coding->produced_char);
7756                   }
7757               }
7758         }
7759     }
7760
7761   Vdeactivate_mark = old_deactivate_mark;
7762   unbind_to (count, coding->dst_object);
7763 }
7764
7765
7766 void
7767 encode_coding_object (struct coding_system *coding,
7768                       Lisp_Object src_object,
7769                       ptrdiff_t from, ptrdiff_t from_byte,
7770                       ptrdiff_t to, ptrdiff_t to_byte,
7771                       Lisp_Object dst_object)
7772 {
7773   ptrdiff_t count = SPECPDL_INDEX ();
7774   ptrdiff_t chars = to - from;
7775   ptrdiff_t bytes = to_byte - from_byte;
7776   Lisp_Object attrs;
7777   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7778   bool need_marker_adjustment = 0;
7779   bool kill_src_buffer = 0;
7780   Lisp_Object old_deactivate_mark;
7781
7782   old_deactivate_mark = Vdeactivate_mark;
7783
7784   coding->src_object = src_object;
7785   coding->src_chars = chars;
7786   coding->src_bytes = bytes;
7787   coding->src_multibyte = chars < bytes;
7788
7789   attrs = CODING_ID_ATTRS (coding->id);
7790
7791   if (EQ (src_object, dst_object))
7792     {
7793       struct Lisp_Marker *tail;
7794
7795       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7796         {
7797           tail->need_adjustment
7798             = tail->charpos == (tail->insertion_type ? from : to);
7799           need_marker_adjustment |= tail->need_adjustment;
7800         }
7801     }
7802
7803   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7804     {
7805       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7806       set_buffer_internal (XBUFFER (coding->src_object));
7807       if (STRINGP (src_object))
7808         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7809       else if (BUFFERP (src_object))
7810         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7811       else
7812         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7813
7814       if (EQ (src_object, dst_object))
7815         {
7816           set_buffer_internal (XBUFFER (src_object));
7817           saved_pt = PT, saved_pt_byte = PT_BYTE;
7818           del_range_both (from, from_byte, to, to_byte, 1);
7819           set_buffer_internal (XBUFFER (coding->src_object));
7820         }
7821
7822       {
7823         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7824
7825         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7826                 old_deactivate_mark);
7827         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7828                     make_number (BEG), make_number (Z));
7829         UNGCPRO;
7830       }
7831       if (XBUFFER (coding->src_object) != current_buffer)
7832         kill_src_buffer = 1;
7833       coding->src_object = Fcurrent_buffer ();
7834       if (BEG != GPT)
7835         move_gap_both (BEG, BEG_BYTE);
7836       coding->src_chars = Z - BEG;
7837       coding->src_bytes = Z_BYTE - BEG_BYTE;
7838       coding->src_pos = BEG;
7839       coding->src_pos_byte = BEG_BYTE;
7840       coding->src_multibyte = Z < Z_BYTE;
7841     }
7842   else if (STRINGP (src_object))
7843     {
7844       code_conversion_save (0, 0);
7845       coding->src_pos = from;
7846       coding->src_pos_byte = from_byte;
7847     }
7848   else if (BUFFERP (src_object))
7849     {
7850       code_conversion_save (0, 0);
7851       set_buffer_internal (XBUFFER (src_object));
7852       if (EQ (src_object, dst_object))
7853         {
7854           saved_pt = PT, saved_pt_byte = PT_BYTE;
7855           coding->src_object = del_range_1 (from, to, 1, 1);
7856           coding->src_pos = 0;
7857           coding->src_pos_byte = 0;
7858         }
7859       else
7860         {
7861           if (from < GPT && to >= GPT)
7862             move_gap_both (from, from_byte);
7863           coding->src_pos = from;
7864           coding->src_pos_byte = from_byte;
7865         }
7866     }
7867   else
7868     code_conversion_save (0, 0);
7869
7870   if (BUFFERP (dst_object))
7871     {
7872       coding->dst_object = dst_object;
7873       if (EQ (src_object, dst_object))
7874         {
7875           coding->dst_pos = from;
7876           coding->dst_pos_byte = from_byte;
7877         }
7878       else
7879         {
7880           struct buffer *current = current_buffer;
7881
7882           set_buffer_temp (XBUFFER (dst_object));
7883           coding->dst_pos = PT;
7884           coding->dst_pos_byte = PT_BYTE;
7885           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7886           set_buffer_temp (current);
7887         }
7888       coding->dst_multibyte
7889         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7890     }
7891   else if (EQ (dst_object, Qt))
7892     {
7893       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7894       coding->dst_object = Qnil;
7895       coding->destination = xmalloc (dst_bytes);
7896       coding->dst_bytes = dst_bytes;
7897       coding->dst_multibyte = 0;
7898     }
7899   else
7900     {
7901       coding->dst_object = Qnil;
7902       coding->dst_multibyte = 0;
7903     }
7904
7905   encode_coding (coding);
7906
7907   if (EQ (dst_object, Qt))
7908     {
7909       if (BUFFERP (coding->dst_object))
7910         coding->dst_object = Fbuffer_string ();
7911       else
7912         {
7913           coding->dst_object
7914             = make_unibyte_string ((char *) coding->destination,
7915                                    coding->produced);
7916           xfree (coding->destination);
7917         }
7918     }
7919
7920   if (saved_pt >= 0)
7921     {
7922       /* This is the case of:
7923          (BUFFERP (src_object) && EQ (src_object, dst_object))
7924          As we have moved PT while replacing the original buffer
7925          contents, we must recover it now.  */
7926       set_buffer_internal (XBUFFER (src_object));
7927       if (saved_pt < from)
7928         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7929       else if (saved_pt < from + chars)
7930         TEMP_SET_PT_BOTH (from, from_byte);
7931       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7932         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7933                           saved_pt_byte + (coding->produced - bytes));
7934       else
7935         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7936                           saved_pt_byte + (coding->produced - bytes));
7937
7938       if (need_marker_adjustment)
7939         {
7940           struct Lisp_Marker *tail;
7941
7942           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7943             if (tail->need_adjustment)
7944               {
7945                 tail->need_adjustment = 0;
7946                 if (tail->insertion_type)
7947                   {
7948                     tail->bytepos = from_byte;
7949                     tail->charpos = from;
7950                   }
7951                 else
7952                   {
7953                     tail->bytepos = from_byte + coding->produced;
7954                     tail->charpos
7955                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7956                          ? tail->bytepos : from + coding->produced_char);
7957                   }
7958               }
7959         }
7960     }
7961
7962   if (kill_src_buffer)
7963     Fkill_buffer (coding->src_object);
7964
7965   Vdeactivate_mark = old_deactivate_mark;
7966   unbind_to (count, Qnil);
7967 }
7968
7969
7970 Lisp_Object
7971 preferred_coding_system (void)
7972 {
7973   int id = coding_categories[coding_priorities[0]].id;
7974
7975   return CODING_ID_NAME (id);
7976 }
7977
7978 #if defined (WINDOWSNT) || defined (HAVE_NTGUI)
7979
7980 Lisp_Object
7981 from_unicode (Lisp_Object str)
7982 {
7983   CHECK_STRING (str);
7984   if (!STRING_MULTIBYTE (str) &&
7985       SBYTES (str) & 1)
7986     {
7987       str = Fsubstring (str, make_number (0), make_number (-1));
7988     }
7989
7990   return code_convert_string_norecord (str, Qutf_16le, 0);
7991 }
7992
7993 wchar_t *
7994 to_unicode (Lisp_Object str, Lisp_Object *buf)
7995 {
7996   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
7997   /* We need to make a another copy (in addition to the one made by
7998      code_convert_string_norecord) to ensure that the final string is
7999      _doubly_ zero terminated --- that is, that the string is
8000      terminated by two zero bytes and one utf-16le null character.
8001      Because strings are already terminated with a single zero byte,
8002      we just add one additional zero. */
8003   str = make_uninit_string (SBYTES (*buf) + 1);
8004   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8005   SDATA (str) [SBYTES (*buf)] = '\0';
8006   *buf = str;
8007   return WCSDATA (*buf);
8008 }
8009 #endif /* WINDOWSNT || HAVE_NTGUI */
8010
8011 \f
8012 #ifdef emacs
8013 /*** 8. Emacs Lisp library functions ***/
8014
8015 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8016        doc: /* Return t if OBJECT is nil or a coding-system.
8017 See the documentation of `define-coding-system' for information
8018 about coding-system objects.  */)
8019   (Lisp_Object object)
8020 {
8021   if (NILP (object)
8022       || CODING_SYSTEM_ID (object) >= 0)
8023     return Qt;
8024   if (! SYMBOLP (object)
8025       || NILP (Fget (object, Qcoding_system_define_form)))
8026     return Qnil;
8027   return Qt;
8028 }
8029
8030 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8031        Sread_non_nil_coding_system, 1, 1, 0,
8032        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8033   (Lisp_Object prompt)
8034 {
8035   Lisp_Object val;
8036   do
8037     {
8038       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8039                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8040     }
8041   while (SCHARS (val) == 0);
8042   return (Fintern (val, Qnil));
8043 }
8044
8045 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8046        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8047 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8048 Ignores case when completing coding systems (all Emacs coding systems
8049 are lower-case).  */)
8050   (Lisp_Object prompt, Lisp_Object default_coding_system)
8051 {
8052   Lisp_Object val;
8053   ptrdiff_t count = SPECPDL_INDEX ();
8054
8055   if (SYMBOLP (default_coding_system))
8056     default_coding_system = SYMBOL_NAME (default_coding_system);
8057   specbind (Qcompletion_ignore_case, Qt);
8058   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8059                           Qt, Qnil, Qcoding_system_history,
8060                           default_coding_system, Qnil);
8061   unbind_to (count, Qnil);
8062   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8063 }
8064
8065 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8066        1, 1, 0,
8067        doc: /* Check validity of CODING-SYSTEM.
8068 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8069 It is valid if it is nil or a symbol defined as a coding system by the
8070 function `define-coding-system'.  */)
8071   (Lisp_Object coding_system)
8072 {
8073   Lisp_Object define_form;
8074
8075   define_form = Fget (coding_system, Qcoding_system_define_form);
8076   if (! NILP (define_form))
8077     {
8078       Fput (coding_system, Qcoding_system_define_form, Qnil);
8079       safe_eval (define_form);
8080     }
8081   if (!NILP (Fcoding_system_p (coding_system)))
8082     return coding_system;
8083   xsignal1 (Qcoding_system_error, coding_system);
8084 }
8085
8086 \f
8087 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8088    HIGHEST, return the coding system of the highest
8089    priority among the detected coding systems.  Otherwise return a
8090    list of detected coding systems sorted by their priorities.  If
8091    MULTIBYTEP, it is assumed that the bytes are in correct
8092    multibyte form but contains only ASCII and eight-bit chars.
8093    Otherwise, the bytes are raw bytes.
8094
8095    CODING-SYSTEM controls the detection as below:
8096
8097    If it is nil, detect both text-format and eol-format.  If the
8098    text-format part of CODING-SYSTEM is already specified
8099    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8100    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8101    detect only text-format.  */
8102
8103 Lisp_Object
8104 detect_coding_system (const unsigned char *src,
8105                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8106                       bool highest, bool multibytep,
8107                       Lisp_Object coding_system)
8108 {
8109   const unsigned char *src_end = src + src_bytes;
8110   Lisp_Object attrs, eol_type;
8111   Lisp_Object val = Qnil;
8112   struct coding_system coding;
8113   ptrdiff_t id;
8114   struct coding_detection_info detect_info;
8115   enum coding_category base_category;
8116   bool null_byte_found = 0, eight_bit_found = 0;
8117
8118   if (NILP (coding_system))
8119     coding_system = Qundecided;
8120   setup_coding_system (coding_system, &coding);
8121   attrs = CODING_ID_ATTRS (coding.id);
8122   eol_type = CODING_ID_EOL_TYPE (coding.id);
8123   coding_system = CODING_ATTR_BASE_NAME (attrs);
8124
8125   coding.source = src;
8126   coding.src_chars = src_chars;
8127   coding.src_bytes = src_bytes;
8128   coding.src_multibyte = multibytep;
8129   coding.consumed = 0;
8130   coding.mode |= CODING_MODE_LAST_BLOCK;
8131   coding.head_ascii = 0;
8132
8133   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8134
8135   /* At first, detect text-format if necessary.  */
8136   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8137   if (base_category == coding_category_undecided)
8138     {
8139       enum coding_category category IF_LINT (= 0);
8140       struct coding_system *this IF_LINT (= NULL);
8141       int c, i;
8142
8143       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8144       for (; src < src_end; src++)
8145         {
8146           c = *src;
8147           if (c & 0x80)
8148             {
8149               eight_bit_found = 1;
8150               if (null_byte_found)
8151                 break;
8152             }
8153           else if (c < 0x20)
8154             {
8155               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8156                   && ! inhibit_iso_escape_detection
8157                   && ! detect_info.checked)
8158                 {
8159                   if (detect_coding_iso_2022 (&coding, &detect_info))
8160                     {
8161                       /* We have scanned the whole data.  */
8162                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8163                         {
8164                           /* We didn't find an 8-bit code.  We may
8165                              have found a null-byte, but it's very
8166                              rare that a binary file confirm to
8167                              ISO-2022.  */
8168                           src = src_end;
8169                           coding.head_ascii = src - coding.source;
8170                         }
8171                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8172                       break;
8173                     }
8174                 }
8175               else if (! c && !inhibit_null_byte_detection)
8176                 {
8177                   null_byte_found = 1;
8178                   if (eight_bit_found)
8179                     break;
8180                 }
8181               if (! eight_bit_found)
8182                 coding.head_ascii++;
8183             }
8184           else if (! eight_bit_found)
8185             coding.head_ascii++;
8186         }
8187
8188       if (null_byte_found || eight_bit_found
8189           || coding.head_ascii < coding.src_bytes
8190           || detect_info.found)
8191         {
8192           if (coding.head_ascii == coding.src_bytes)
8193             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8194             for (i = 0; i < coding_category_raw_text; i++)
8195               {
8196                 category = coding_priorities[i];
8197                 this = coding_categories + category;
8198                 if (detect_info.found & (1 << category))
8199                   break;
8200               }
8201           else
8202             {
8203               if (null_byte_found)
8204                 {
8205                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8206                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8207                 }
8208               for (i = 0; i < coding_category_raw_text; i++)
8209                 {
8210                   category = coding_priorities[i];
8211                   this = coding_categories + category;
8212
8213                   if (this->id < 0)
8214                     {
8215                       /* No coding system of this category is defined.  */
8216                       detect_info.rejected |= (1 << category);
8217                     }
8218                   else if (category >= coding_category_raw_text)
8219                     continue;
8220                   else if (detect_info.checked & (1 << category))
8221                     {
8222                       if (highest
8223                           && (detect_info.found & (1 << category)))
8224                         break;
8225                     }
8226                   else if ((*(this->detector)) (&coding, &detect_info)
8227                            && highest
8228                            && (detect_info.found & (1 << category)))
8229                     {
8230                       if (category == coding_category_utf_16_auto)
8231                         {
8232                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8233                             category = coding_category_utf_16_le;
8234                           else
8235                             category = coding_category_utf_16_be;
8236                         }
8237                       break;
8238                     }
8239                 }
8240             }
8241         }
8242
8243       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8244           || null_byte_found)
8245         {
8246           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8247           id = CODING_SYSTEM_ID (Qno_conversion);
8248           val = Fcons (make_number (id), Qnil);
8249         }
8250       else if (! detect_info.rejected && ! detect_info.found)
8251         {
8252           detect_info.found = CATEGORY_MASK_ANY;
8253           id = coding_categories[coding_category_undecided].id;
8254           val = Fcons (make_number (id), Qnil);
8255         }
8256       else if (highest)
8257         {
8258           if (detect_info.found)
8259             {
8260               detect_info.found = 1 << category;
8261               val = Fcons (make_number (this->id), Qnil);
8262             }
8263           else
8264             for (i = 0; i < coding_category_raw_text; i++)
8265               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8266                 {
8267                   detect_info.found = 1 << coding_priorities[i];
8268                   id = coding_categories[coding_priorities[i]].id;
8269                   val = Fcons (make_number (id), Qnil);
8270                   break;
8271                 }
8272         }
8273       else
8274         {
8275           int mask = detect_info.rejected | detect_info.found;
8276           int found = 0;
8277
8278           for (i = coding_category_raw_text - 1; i >= 0; i--)
8279             {
8280               category = coding_priorities[i];
8281               if (! (mask & (1 << category)))
8282                 {
8283                   found |= 1 << category;
8284                   id = coding_categories[category].id;
8285                   if (id >= 0)
8286                     val = Fcons (make_number (id), val);
8287                 }
8288             }
8289           for (i = coding_category_raw_text - 1; i >= 0; i--)
8290             {
8291               category = coding_priorities[i];
8292               if (detect_info.found & (1 << category))
8293                 {
8294                   id = coding_categories[category].id;
8295                   val = Fcons (make_number (id), val);
8296                 }
8297             }
8298           detect_info.found |= found;
8299         }
8300     }
8301   else if (base_category == coding_category_utf_8_auto)
8302     {
8303       if (detect_coding_utf_8 (&coding, &detect_info))
8304         {
8305           struct coding_system *this;
8306
8307           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8308             this = coding_categories + coding_category_utf_8_sig;
8309           else
8310             this = coding_categories + coding_category_utf_8_nosig;
8311           val = Fcons (make_number (this->id), Qnil);
8312         }
8313     }
8314   else if (base_category == coding_category_utf_16_auto)
8315     {
8316       if (detect_coding_utf_16 (&coding, &detect_info))
8317         {
8318           struct coding_system *this;
8319
8320           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8321             this = coding_categories + coding_category_utf_16_le;
8322           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8323             this = coding_categories + coding_category_utf_16_be;
8324           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8325             this = coding_categories + coding_category_utf_16_be_nosig;
8326           else
8327             this = coding_categories + coding_category_utf_16_le_nosig;
8328           val = Fcons (make_number (this->id), Qnil);
8329         }
8330     }
8331   else
8332     {
8333       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8334       val = Fcons (make_number (coding.id), Qnil);
8335     }
8336
8337   /* Then, detect eol-format if necessary.  */
8338   {
8339     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8340     Lisp_Object tail;
8341
8342     if (VECTORP (eol_type))
8343       {
8344         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8345           {
8346             if (null_byte_found)
8347               normal_eol = EOL_SEEN_LF;
8348             else
8349               normal_eol = detect_eol (coding.source, src_bytes,
8350                                        coding_category_raw_text);
8351           }
8352         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8353                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8354           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8355                                       coding_category_utf_16_be);
8356         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8357                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8358           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8359                                       coding_category_utf_16_le);
8360       }
8361     else
8362       {
8363         if (EQ (eol_type, Qunix))
8364           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8365         else if (EQ (eol_type, Qdos))
8366           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8367         else
8368           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8369       }
8370
8371     for (tail = val; CONSP (tail); tail = XCDR (tail))
8372       {
8373         enum coding_category category;
8374         int this_eol;
8375
8376         id = XINT (XCAR (tail));
8377         attrs = CODING_ID_ATTRS (id);
8378         category = XINT (CODING_ATTR_CATEGORY (attrs));
8379         eol_type = CODING_ID_EOL_TYPE (id);
8380         if (VECTORP (eol_type))
8381           {
8382             if (category == coding_category_utf_16_be
8383                 || category == coding_category_utf_16_be_nosig)
8384               this_eol = utf_16_be_eol;
8385             else if (category == coding_category_utf_16_le
8386                      || category == coding_category_utf_16_le_nosig)
8387               this_eol = utf_16_le_eol;
8388             else
8389               this_eol = normal_eol;
8390
8391             if (this_eol == EOL_SEEN_LF)
8392               XSETCAR (tail, AREF (eol_type, 0));
8393             else if (this_eol == EOL_SEEN_CRLF)
8394               XSETCAR (tail, AREF (eol_type, 1));
8395             else if (this_eol == EOL_SEEN_CR)
8396               XSETCAR (tail, AREF (eol_type, 2));
8397             else
8398               XSETCAR (tail, CODING_ID_NAME (id));
8399           }
8400         else
8401           XSETCAR (tail, CODING_ID_NAME (id));
8402       }
8403   }
8404
8405   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8406 }
8407
8408
8409 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8410        2, 3, 0,
8411        doc: /* Detect coding system of the text in the region between START and END.
8412 Return a list of possible coding systems ordered by priority.
8413 The coding systems to try and their priorities follows what
8414 the function `coding-system-priority-list' (which see) returns.
8415
8416 If only ASCII characters are found (except for such ISO-2022 control
8417 characters as ESC), it returns a list of single element `undecided'
8418 or its subsidiary coding system according to a detected end-of-line
8419 format.
8420
8421 If optional argument HIGHEST is non-nil, return the coding system of
8422 highest priority.  */)
8423   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8424 {
8425   ptrdiff_t from, to;
8426   ptrdiff_t from_byte, to_byte;
8427
8428   CHECK_NUMBER_COERCE_MARKER (start);
8429   CHECK_NUMBER_COERCE_MARKER (end);
8430
8431   validate_region (&start, &end);
8432   from = XINT (start), to = XINT (end);
8433   from_byte = CHAR_TO_BYTE (from);
8434   to_byte = CHAR_TO_BYTE (to);
8435
8436   if (from < GPT && to >= GPT)
8437     move_gap_both (to, to_byte);
8438
8439   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8440                                to - from, to_byte - from_byte,
8441                                !NILP (highest),
8442                                !NILP (BVAR (current_buffer
8443                                       , enable_multibyte_characters)),
8444                                Qnil);
8445 }
8446
8447 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8448        1, 2, 0,
8449        doc: /* Detect coding system of the text in STRING.
8450 Return a list of possible coding systems ordered by priority.
8451 The coding systems to try and their priorities follows what
8452 the function `coding-system-priority-list' (which see) returns.
8453
8454 If only ASCII characters are found (except for such ISO-2022 control
8455 characters as ESC), it returns a list of single element `undecided'
8456 or its subsidiary coding system according to a detected end-of-line
8457 format.
8458
8459 If optional argument HIGHEST is non-nil, return the coding system of
8460 highest priority.  */)
8461   (Lisp_Object string, Lisp_Object highest)
8462 {
8463   CHECK_STRING (string);
8464
8465   return detect_coding_system (SDATA (string),
8466                                SCHARS (string), SBYTES (string),
8467                                !NILP (highest), STRING_MULTIBYTE (string),
8468                                Qnil);
8469 }
8470
8471
8472 static bool
8473 char_encodable_p (int c, Lisp_Object attrs)
8474 {
8475   Lisp_Object tail;
8476   struct charset *charset;
8477   Lisp_Object translation_table;
8478
8479   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8480   if (! NILP (translation_table))
8481     c = translate_char (translation_table, c);
8482   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8483        CONSP (tail); tail = XCDR (tail))
8484     {
8485       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8486       if (CHAR_CHARSET_P (c, charset))
8487         break;
8488     }
8489   return (! NILP (tail));
8490 }
8491
8492
8493 /* Return a list of coding systems that safely encode the text between
8494    START and END.  If EXCLUDE is non-nil, it is a list of coding
8495    systems not to check.  The returned list doesn't contain any such
8496    coding systems.  In any case, if the text contains only ASCII or is
8497    unibyte, return t.  */
8498
8499 DEFUN ("find-coding-systems-region-internal",
8500        Ffind_coding_systems_region_internal,
8501        Sfind_coding_systems_region_internal, 2, 3, 0,
8502        doc: /* Internal use only.  */)
8503   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8504 {
8505   Lisp_Object coding_attrs_list, safe_codings;
8506   ptrdiff_t start_byte, end_byte;
8507   const unsigned char *p, *pbeg, *pend;
8508   int c;
8509   Lisp_Object tail, elt, work_table;
8510
8511   if (STRINGP (start))
8512     {
8513       if (!STRING_MULTIBYTE (start)
8514           || SCHARS (start) == SBYTES (start))
8515         return Qt;
8516       start_byte = 0;
8517       end_byte = SBYTES (start);
8518     }
8519   else
8520     {
8521       CHECK_NUMBER_COERCE_MARKER (start);
8522       CHECK_NUMBER_COERCE_MARKER (end);
8523       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8524         args_out_of_range (start, end);
8525       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8526         return Qt;
8527       start_byte = CHAR_TO_BYTE (XINT (start));
8528       end_byte = CHAR_TO_BYTE (XINT (end));
8529       if (XINT (end) - XINT (start) == end_byte - start_byte)
8530         return Qt;
8531
8532       if (XINT (start) < GPT && XINT (end) > GPT)
8533         {
8534           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8535             move_gap_both (XINT (start), start_byte);
8536           else
8537             move_gap_both (XINT (end), end_byte);
8538         }
8539     }
8540
8541   coding_attrs_list = Qnil;
8542   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8543     if (NILP (exclude)
8544         || NILP (Fmemq (XCAR (tail), exclude)))
8545       {
8546         Lisp_Object attrs;
8547
8548         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8549         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8550             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8551           {
8552             ASET (attrs, coding_attr_trans_tbl,
8553                   get_translation_table (attrs, 1, NULL));
8554             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8555           }
8556       }
8557
8558   if (STRINGP (start))
8559     p = pbeg = SDATA (start);
8560   else
8561     p = pbeg = BYTE_POS_ADDR (start_byte);
8562   pend = p + (end_byte - start_byte);
8563
8564   while (p < pend && ASCII_BYTE_P (*p)) p++;
8565   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8566
8567   work_table = Fmake_char_table (Qnil, Qnil);
8568   while (p < pend)
8569     {
8570       if (ASCII_BYTE_P (*p))
8571         p++;
8572       else
8573         {
8574           c = STRING_CHAR_ADVANCE (p);
8575           if (!NILP (char_table_ref (work_table, c)))
8576             /* This character was already checked.  Ignore it.  */
8577             continue;
8578
8579           charset_map_loaded = 0;
8580           for (tail = coding_attrs_list; CONSP (tail);)
8581             {
8582               elt = XCAR (tail);
8583               if (NILP (elt))
8584                 tail = XCDR (tail);
8585               else if (char_encodable_p (c, elt))
8586                 tail = XCDR (tail);
8587               else if (CONSP (XCDR (tail)))
8588                 {
8589                   XSETCAR (tail, XCAR (XCDR (tail)));
8590                   XSETCDR (tail, XCDR (XCDR (tail)));
8591                 }
8592               else
8593                 {
8594                   XSETCAR (tail, Qnil);
8595                   tail = XCDR (tail);
8596                 }
8597             }
8598           if (charset_map_loaded)
8599             {
8600               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8601
8602               if (STRINGP (start))
8603                 pbeg = SDATA (start);
8604               else
8605                 pbeg = BYTE_POS_ADDR (start_byte);
8606               p = pbeg + p_offset;
8607               pend = pbeg + pend_offset;
8608             }
8609           char_table_set (work_table, c, Qt);
8610         }
8611     }
8612
8613   safe_codings = list2 (Qraw_text, Qno_conversion);
8614   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8615     if (! NILP (XCAR (tail)))
8616       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8617
8618   return safe_codings;
8619 }
8620
8621
8622 DEFUN ("unencodable-char-position", Funencodable_char_position,
8623        Sunencodable_char_position, 3, 5, 0,
8624        doc: /*
8625 Return position of first un-encodable character in a region.
8626 START and END specify the region and CODING-SYSTEM specifies the
8627 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8628
8629 If optional 4th argument COUNT is non-nil, it specifies at most how
8630 many un-encodable characters to search.  In this case, the value is a
8631 list of positions.
8632
8633 If optional 5th argument STRING is non-nil, it is a string to search
8634 for un-encodable characters.  In that case, START and END are indexes
8635 to the string.  */)
8636   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8637 {
8638   EMACS_INT n;
8639   struct coding_system coding;
8640   Lisp_Object attrs, charset_list, translation_table;
8641   Lisp_Object positions;
8642   ptrdiff_t from, to;
8643   const unsigned char *p, *stop, *pend;
8644   bool ascii_compatible;
8645
8646   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8647   attrs = CODING_ID_ATTRS (coding.id);
8648   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8649     return Qnil;
8650   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8651   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8652   translation_table = get_translation_table (attrs, 1, NULL);
8653
8654   if (NILP (string))
8655     {
8656       validate_region (&start, &end);
8657       from = XINT (start);
8658       to = XINT (end);
8659       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8660           || (ascii_compatible
8661               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8662         return Qnil;
8663       p = CHAR_POS_ADDR (from);
8664       pend = CHAR_POS_ADDR (to);
8665       if (from < GPT && to >= GPT)
8666         stop = GPT_ADDR;
8667       else
8668         stop = pend;
8669     }
8670   else
8671     {
8672       CHECK_STRING (string);
8673       CHECK_NATNUM (start);
8674       CHECK_NATNUM (end);
8675       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8676         args_out_of_range_3 (string, start, end);
8677       from = XINT (start);
8678       to = XINT (end);
8679       if (! STRING_MULTIBYTE (string))
8680         return Qnil;
8681       p = SDATA (string) + string_char_to_byte (string, from);
8682       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8683       if (ascii_compatible && (to - from) == (pend - p))
8684         return Qnil;
8685     }
8686
8687   if (NILP (count))
8688     n = 1;
8689   else
8690     {
8691       CHECK_NATNUM (count);
8692       n = XINT (count);
8693     }
8694
8695   positions = Qnil;
8696   charset_map_loaded = 0;
8697   while (1)
8698     {
8699       int c;
8700
8701       if (ascii_compatible)
8702         while (p < stop && ASCII_BYTE_P (*p))
8703           p++, from++;
8704       if (p >= stop)
8705         {
8706           if (p >= pend)
8707             break;
8708           stop = pend;
8709           p = GAP_END_ADDR;
8710         }
8711
8712       c = STRING_CHAR_ADVANCE (p);
8713       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8714           && ! char_charset (translate_char (translation_table, c),
8715                              charset_list, NULL))
8716         {
8717           positions = Fcons (make_number (from), positions);
8718           n--;
8719           if (n == 0)
8720             break;
8721         }
8722
8723       from++;
8724       if (charset_map_loaded && NILP (string))
8725         {
8726           p = CHAR_POS_ADDR (from);
8727           pend = CHAR_POS_ADDR (to);
8728           if (from < GPT && to >= GPT)
8729             stop = GPT_ADDR;
8730           else
8731             stop = pend;
8732           charset_map_loaded = 0;
8733         }
8734     }
8735
8736   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8737 }
8738
8739
8740 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8741        Scheck_coding_systems_region, 3, 3, 0,
8742        doc: /* Check if the region is encodable by coding systems.
8743
8744 START and END are buffer positions specifying the region.
8745 CODING-SYSTEM-LIST is a list of coding systems to check.
8746
8747 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8748 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8749 whole region, POS0, POS1, ... are buffer positions where non-encodable
8750 characters are found.
8751
8752 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8753 value is nil.
8754
8755 START may be a string.  In that case, check if the string is
8756 encodable, and the value contains indices to the string instead of
8757 buffer positions.  END is ignored.
8758
8759 If the current buffer (or START if it is a string) is unibyte, the value
8760 is nil.  */)
8761   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8762 {
8763   Lisp_Object list;
8764   ptrdiff_t start_byte, end_byte;
8765   ptrdiff_t pos;
8766   const unsigned char *p, *pbeg, *pend;
8767   int c;
8768   Lisp_Object tail, elt, attrs;
8769
8770   if (STRINGP (start))
8771     {
8772       if (!STRING_MULTIBYTE (start)
8773           || SCHARS (start) == SBYTES (start))
8774         return Qnil;
8775       start_byte = 0;
8776       end_byte = SBYTES (start);
8777       pos = 0;
8778     }
8779   else
8780     {
8781       CHECK_NUMBER_COERCE_MARKER (start);
8782       CHECK_NUMBER_COERCE_MARKER (end);
8783       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8784         args_out_of_range (start, end);
8785       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8786         return Qnil;
8787       start_byte = CHAR_TO_BYTE (XINT (start));
8788       end_byte = CHAR_TO_BYTE (XINT (end));
8789       if (XINT (end) - XINT (start) == end_byte - start_byte)
8790         return Qnil;
8791
8792       if (XINT (start) < GPT && XINT (end) > GPT)
8793         {
8794           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8795             move_gap_both (XINT (start), start_byte);
8796           else
8797             move_gap_both (XINT (end), end_byte);
8798         }
8799       pos = XINT (start);
8800     }
8801
8802   list = Qnil;
8803   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8804     {
8805       elt = XCAR (tail);
8806       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8807       ASET (attrs, coding_attr_trans_tbl,
8808             get_translation_table (attrs, 1, NULL));
8809       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8810     }
8811
8812   if (STRINGP (start))
8813     p = pbeg = SDATA (start);
8814   else
8815     p = pbeg = BYTE_POS_ADDR (start_byte);
8816   pend = p + (end_byte - start_byte);
8817
8818   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8819   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8820
8821   while (p < pend)
8822     {
8823       if (ASCII_BYTE_P (*p))
8824         p++;
8825       else
8826         {
8827           c = STRING_CHAR_ADVANCE (p);
8828
8829           charset_map_loaded = 0;
8830           for (tail = list; CONSP (tail); tail = XCDR (tail))
8831             {
8832               elt = XCDR (XCAR (tail));
8833               if (! char_encodable_p (c, XCAR (elt)))
8834                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8835             }
8836           if (charset_map_loaded)
8837             {
8838               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8839
8840               if (STRINGP (start))
8841                 pbeg = SDATA (start);
8842               else
8843                 pbeg = BYTE_POS_ADDR (start_byte);
8844               p = pbeg + p_offset;
8845               pend = pbeg + pend_offset;
8846             }
8847         }
8848       pos++;
8849     }
8850
8851   tail = list;
8852   list = Qnil;
8853   for (; CONSP (tail); tail = XCDR (tail))
8854     {
8855       elt = XCAR (tail);
8856       if (CONSP (XCDR (XCDR (elt))))
8857         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8858                       list);
8859     }
8860
8861   return list;
8862 }
8863
8864
8865 static Lisp_Object
8866 code_convert_region (Lisp_Object start, Lisp_Object end,
8867                      Lisp_Object coding_system, Lisp_Object dst_object,
8868                      bool encodep, bool norecord)
8869 {
8870   struct coding_system coding;
8871   ptrdiff_t from, from_byte, to, to_byte;
8872   Lisp_Object src_object;
8873
8874   CHECK_NUMBER_COERCE_MARKER (start);
8875   CHECK_NUMBER_COERCE_MARKER (end);
8876   if (NILP (coding_system))
8877     coding_system = Qno_conversion;
8878   else
8879     CHECK_CODING_SYSTEM (coding_system);
8880   src_object = Fcurrent_buffer ();
8881   if (NILP (dst_object))
8882     dst_object = src_object;
8883   else if (! EQ (dst_object, Qt))
8884     CHECK_BUFFER (dst_object);
8885
8886   validate_region (&start, &end);
8887   from = XFASTINT (start);
8888   from_byte = CHAR_TO_BYTE (from);
8889   to = XFASTINT (end);
8890   to_byte = CHAR_TO_BYTE (to);
8891
8892   setup_coding_system (coding_system, &coding);
8893   coding.mode |= CODING_MODE_LAST_BLOCK;
8894
8895   if (encodep)
8896     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8897                           dst_object);
8898   else
8899     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8900                           dst_object);
8901   if (! norecord)
8902     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8903
8904   return (BUFFERP (dst_object)
8905           ? make_number (coding.produced_char)
8906           : coding.dst_object);
8907 }
8908
8909
8910 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8911        3, 4, "r\nzCoding system: ",
8912        doc: /* Decode the current region from the specified coding system.
8913 When called from a program, takes four arguments:
8914         START, END, CODING-SYSTEM, and DESTINATION.
8915 START and END are buffer positions.
8916
8917 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8918 If nil, the region between START and END is replaced by the decoded text.
8919 If buffer, the decoded text is inserted in that buffer after point (point
8920 does not move).
8921 In those cases, the length of the decoded text is returned.
8922 If DESTINATION is t, the decoded text is returned.
8923
8924 This function sets `last-coding-system-used' to the precise coding system
8925 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8926 not fully specified.)  */)
8927   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8928 {
8929   return code_convert_region (start, end, coding_system, destination, 0, 0);
8930 }
8931
8932 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8933        3, 4, "r\nzCoding system: ",
8934        doc: /* Encode the current region by specified coding system.
8935 When called from a program, takes four arguments:
8936         START, END, CODING-SYSTEM and DESTINATION.
8937 START and END are buffer positions.
8938
8939 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8940 If nil, the region between START and END is replace by the encoded text.
8941 If buffer, the encoded text is inserted in that buffer after point (point
8942 does not move).
8943 In those cases, the length of the encoded text is returned.
8944 If DESTINATION is t, the encoded text is returned.
8945
8946 This function sets `last-coding-system-used' to the precise coding system
8947 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8948 not fully specified.)  */)
8949   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8950 {
8951   return code_convert_region (start, end, coding_system, destination, 1, 0);
8952 }
8953
8954 Lisp_Object
8955 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8956                      Lisp_Object dst_object, bool encodep, bool nocopy,
8957                      bool norecord)
8958 {
8959   struct coding_system coding;
8960   ptrdiff_t chars, bytes;
8961
8962   CHECK_STRING (string);
8963   if (NILP (coding_system))
8964     {
8965       if (! norecord)
8966         Vlast_coding_system_used = Qno_conversion;
8967       if (NILP (dst_object))
8968         return (nocopy ? Fcopy_sequence (string) : string);
8969     }
8970
8971   if (NILP (coding_system))
8972     coding_system = Qno_conversion;
8973   else
8974     CHECK_CODING_SYSTEM (coding_system);
8975   if (NILP (dst_object))
8976     dst_object = Qt;
8977   else if (! EQ (dst_object, Qt))
8978     CHECK_BUFFER (dst_object);
8979
8980   setup_coding_system (coding_system, &coding);
8981   coding.mode |= CODING_MODE_LAST_BLOCK;
8982   chars = SCHARS (string);
8983   bytes = SBYTES (string);
8984   if (encodep)
8985     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8986   else
8987     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8988   if (! norecord)
8989     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8990
8991   return (BUFFERP (dst_object)
8992           ? make_number (coding.produced_char)
8993           : coding.dst_object);
8994 }
8995
8996
8997 /* Encode or decode STRING according to CODING_SYSTEM.
8998    Do not set Vlast_coding_system_used.
8999
9000    This function is called only from macros DECODE_FILE and
9001    ENCODE_FILE, thus we ignore character composition.  */
9002
9003 Lisp_Object
9004 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9005                               bool encodep)
9006 {
9007   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9008 }
9009
9010
9011 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9012        2, 4, 0,
9013        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9014
9015 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9016 if the decoding operation is trivial.
9017
9018 Optional fourth arg BUFFER non-nil means that the decoded text is
9019 inserted in that buffer after point (point does not move).  In this
9020 case, the return value is the length of the decoded text.
9021
9022 This function sets `last-coding-system-used' to the precise coding system
9023 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9024 not fully specified.)  */)
9025   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9026 {
9027   return code_convert_string (string, coding_system, buffer,
9028                               0, ! NILP (nocopy), 0);
9029 }
9030
9031 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9032        2, 4, 0,
9033        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9034
9035 Optional third arg NOCOPY non-nil means it is OK to return STRING
9036 itself if the encoding operation is trivial.
9037
9038 Optional fourth arg BUFFER non-nil means that the encoded text is
9039 inserted in that buffer after point (point does not move).  In this
9040 case, the return value is the length of the encoded text.
9041
9042 This function sets `last-coding-system-used' to the precise coding system
9043 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9044 not fully specified.)  */)
9045   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9046 {
9047   return code_convert_string (string, coding_system, buffer,
9048                               1, ! NILP (nocopy), 0);
9049 }
9050
9051 \f
9052 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9053        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9054 Return the corresponding character.  */)
9055   (Lisp_Object code)
9056 {
9057   Lisp_Object spec, attrs, val;
9058   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9059   EMACS_INT ch;
9060   int c;
9061
9062   CHECK_NATNUM (code);
9063   ch = XFASTINT (code);
9064   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9065   attrs = AREF (spec, 0);
9066
9067   if (ASCII_BYTE_P (ch)
9068       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9069     return code;
9070
9071   val = CODING_ATTR_CHARSET_LIST (attrs);
9072   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9073   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9074   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9075
9076   if (ch <= 0x7F)
9077     {
9078       c = ch;
9079       charset = charset_roman;
9080     }
9081   else if (ch >= 0xA0 && ch < 0xDF)
9082     {
9083       c = ch - 0x80;
9084       charset = charset_kana;
9085     }
9086   else
9087     {
9088       EMACS_INT c1 = ch >> 8;
9089       int c2 = ch & 0xFF;
9090
9091       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9092           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9093         error ("Invalid code: %"pI"d", ch);
9094       c = ch;
9095       SJIS_TO_JIS (c);
9096       charset = charset_kanji;
9097     }
9098   c = DECODE_CHAR (charset, c);
9099   if (c < 0)
9100     error ("Invalid code: %"pI"d", ch);
9101   return make_number (c);
9102 }
9103
9104
9105 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9106        doc: /* Encode a Japanese character CH to shift_jis encoding.
9107 Return the corresponding code in SJIS.  */)
9108   (Lisp_Object ch)
9109 {
9110   Lisp_Object spec, attrs, charset_list;
9111   int c;
9112   struct charset *charset;
9113   unsigned code;
9114
9115   CHECK_CHARACTER (ch);
9116   c = XFASTINT (ch);
9117   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9118   attrs = AREF (spec, 0);
9119
9120   if (ASCII_CHAR_P (c)
9121       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9122     return ch;
9123
9124   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9125   charset = char_charset (c, charset_list, &code);
9126   if (code == CHARSET_INVALID_CODE (charset))
9127     error ("Can't encode by shift_jis encoding: %c", c);
9128   JIS_TO_SJIS (code);
9129
9130   return make_number (code);
9131 }
9132
9133 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9134        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9135 Return the corresponding character.  */)
9136   (Lisp_Object code)
9137 {
9138   Lisp_Object spec, attrs, val;
9139   struct charset *charset_roman, *charset_big5, *charset;
9140   EMACS_INT ch;
9141   int c;
9142
9143   CHECK_NATNUM (code);
9144   ch = XFASTINT (code);
9145   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9146   attrs = AREF (spec, 0);
9147
9148   if (ASCII_BYTE_P (ch)
9149       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9150     return code;
9151
9152   val = CODING_ATTR_CHARSET_LIST (attrs);
9153   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9154   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9155
9156   if (ch <= 0x7F)
9157     {
9158       c = ch;
9159       charset = charset_roman;
9160     }
9161   else
9162     {
9163       EMACS_INT b1 = ch >> 8;
9164       int b2 = ch & 0x7F;
9165       if (b1 < 0xA1 || b1 > 0xFE
9166           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9167         error ("Invalid code: %"pI"d", ch);
9168       c = ch;
9169       charset = charset_big5;
9170     }
9171   c = DECODE_CHAR (charset, c);
9172   if (c < 0)
9173     error ("Invalid code: %"pI"d", ch);
9174   return make_number (c);
9175 }
9176
9177 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9178        doc: /* Encode the Big5 character CH to BIG5 coding system.
9179 Return the corresponding character code in Big5.  */)
9180   (Lisp_Object ch)
9181 {
9182   Lisp_Object spec, attrs, charset_list;
9183   struct charset *charset;
9184   int c;
9185   unsigned code;
9186
9187   CHECK_CHARACTER (ch);
9188   c = XFASTINT (ch);
9189   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9190   attrs = AREF (spec, 0);
9191   if (ASCII_CHAR_P (c)
9192       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9193     return ch;
9194
9195   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9196   charset = char_charset (c, charset_list, &code);
9197   if (code == CHARSET_INVALID_CODE (charset))
9198     error ("Can't encode by Big5 encoding: %c", c);
9199
9200   return make_number (code);
9201 }
9202
9203 \f
9204 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9205        Sset_terminal_coding_system_internal, 1, 2, 0,
9206        doc: /* Internal use only.  */)
9207   (Lisp_Object coding_system, Lisp_Object terminal)
9208 {
9209   struct terminal *term = get_terminal (terminal, 1);
9210   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9211   CHECK_SYMBOL (coding_system);
9212   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9213   /* We had better not send unsafe characters to terminal.  */
9214   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9215   /* Character composition should be disabled.  */
9216   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9217   terminal_coding->src_multibyte = 1;
9218   terminal_coding->dst_multibyte = 0;
9219   tset_charset_list
9220     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9221             ? coding_charset_list (terminal_coding)
9222             : Fcons (make_number (charset_ascii), Qnil)));
9223   return Qnil;
9224 }
9225
9226 DEFUN ("set-safe-terminal-coding-system-internal",
9227        Fset_safe_terminal_coding_system_internal,
9228        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9229        doc: /* Internal use only.  */)
9230   (Lisp_Object coding_system)
9231 {
9232   CHECK_SYMBOL (coding_system);
9233   setup_coding_system (Fcheck_coding_system (coding_system),
9234                        &safe_terminal_coding);
9235   /* Character composition should be disabled.  */
9236   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9237   safe_terminal_coding.src_multibyte = 1;
9238   safe_terminal_coding.dst_multibyte = 0;
9239   return Qnil;
9240 }
9241
9242 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9243        Sterminal_coding_system, 0, 1, 0,
9244        doc: /* Return coding system specified for terminal output on the given terminal.
9245 TERMINAL may be a terminal object, a frame, or nil for the selected
9246 frame's terminal device.  */)
9247   (Lisp_Object terminal)
9248 {
9249   struct coding_system *terminal_coding
9250     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9251   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9252
9253   /* For backward compatibility, return nil if it is `undecided'.  */
9254   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9255 }
9256
9257 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9258        Sset_keyboard_coding_system_internal, 1, 2, 0,
9259        doc: /* Internal use only.  */)
9260   (Lisp_Object coding_system, Lisp_Object terminal)
9261 {
9262   struct terminal *t = get_terminal (terminal, 1);
9263   CHECK_SYMBOL (coding_system);
9264   if (NILP (coding_system))
9265     coding_system = Qno_conversion;
9266   else
9267     Fcheck_coding_system (coding_system);
9268   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9269   /* Character composition should be disabled.  */
9270   TERMINAL_KEYBOARD_CODING (t)->common_flags
9271     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9272   return Qnil;
9273 }
9274
9275 DEFUN ("keyboard-coding-system",
9276        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9277        doc: /* Return coding system specified for decoding keyboard input.  */)
9278   (Lisp_Object terminal)
9279 {
9280   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9281                          (get_terminal (terminal, 1))->id);
9282 }
9283
9284 \f
9285 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9286        Sfind_operation_coding_system,  1, MANY, 0,
9287        doc: /* Choose a coding system for an operation based on the target name.
9288 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9289 DECODING-SYSTEM is the coding system to use for decoding
9290 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9291 for encoding (in case OPERATION does encoding).
9292
9293 The first argument OPERATION specifies an I/O primitive:
9294   For file I/O, `insert-file-contents' or `write-region'.
9295   For process I/O, `call-process', `call-process-region', or `start-process'.
9296   For network I/O, `open-network-stream'.
9297
9298 The remaining arguments should be the same arguments that were passed
9299 to the primitive.  Depending on which primitive, one of those arguments
9300 is selected as the TARGET.  For example, if OPERATION does file I/O,
9301 whichever argument specifies the file name is TARGET.
9302
9303 TARGET has a meaning which depends on OPERATION:
9304   For file I/O, TARGET is a file name (except for the special case below).
9305   For process I/O, TARGET is a process name.
9306   For network I/O, TARGET is a service name or a port number.
9307
9308 This function looks up what is specified for TARGET in
9309 `file-coding-system-alist', `process-coding-system-alist',
9310 or `network-coding-system-alist' depending on OPERATION.
9311 They may specify a coding system, a cons of coding systems,
9312 or a function symbol to call.
9313 In the last case, we call the function with one argument,
9314 which is a list of all the arguments given to this function.
9315 If the function can't decide a coding system, it can return
9316 `undecided' so that the normal code-detection is performed.
9317
9318 If OPERATION is `insert-file-contents', the argument corresponding to
9319 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9320 file name to look up, and BUFFER is a buffer that contains the file's
9321 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9322 function to call for FILENAME, that function should examine the
9323 contents of BUFFER instead of reading the file.
9324
9325 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9326   (ptrdiff_t nargs, Lisp_Object *args)
9327 {
9328   Lisp_Object operation, target_idx, target, val;
9329   register Lisp_Object chain;
9330
9331   if (nargs < 2)
9332     error ("Too few arguments");
9333   operation = args[0];
9334   if (!SYMBOLP (operation)
9335       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9336     error ("Invalid first argument");
9337   if (nargs <= 1 + XFASTINT (target_idx))
9338     error ("Too few arguments for operation `%s'",
9339            SDATA (SYMBOL_NAME (operation)));
9340   target = args[XFASTINT (target_idx) + 1];
9341   if (!(STRINGP (target)
9342         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9343             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9344         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9345     error ("Invalid argument %"pI"d of operation `%s'",
9346            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9347   if (CONSP (target))
9348     target = XCAR (target);
9349
9350   chain = ((EQ (operation, Qinsert_file_contents)
9351             || EQ (operation, Qwrite_region))
9352            ? Vfile_coding_system_alist
9353            : (EQ (operation, Qopen_network_stream)
9354               ? Vnetwork_coding_system_alist
9355               : Vprocess_coding_system_alist));
9356   if (NILP (chain))
9357     return Qnil;
9358
9359   for (; CONSP (chain); chain = XCDR (chain))
9360     {
9361       Lisp_Object elt;
9362
9363       elt = XCAR (chain);
9364       if (CONSP (elt)
9365           && ((STRINGP (target)
9366                && STRINGP (XCAR (elt))
9367                && fast_string_match (XCAR (elt), target) >= 0)
9368               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9369         {
9370           val = XCDR (elt);
9371           /* Here, if VAL is both a valid coding system and a valid
9372              function symbol, we return VAL as a coding system.  */
9373           if (CONSP (val))
9374             return val;
9375           if (! SYMBOLP (val))
9376             return Qnil;
9377           if (! NILP (Fcoding_system_p (val)))
9378             return Fcons (val, val);
9379           if (! NILP (Ffboundp (val)))
9380             {
9381               /* We use call1 rather than safe_call1
9382                  so as to get bug reports about functions called here
9383                  which don't handle the current interface.  */
9384               val = call1 (val, Flist (nargs, args));
9385               if (CONSP (val))
9386                 return val;
9387               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9388                 return Fcons (val, val);
9389             }
9390           return Qnil;
9391         }
9392     }
9393   return Qnil;
9394 }
9395
9396 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9397        Sset_coding_system_priority, 0, MANY, 0,
9398        doc: /* Assign higher priority to the coding systems given as arguments.
9399 If multiple coding systems belong to the same category,
9400 all but the first one are ignored.
9401
9402 usage: (set-coding-system-priority &rest coding-systems)  */)
9403   (ptrdiff_t nargs, Lisp_Object *args)
9404 {
9405   ptrdiff_t i, j;
9406   bool changed[coding_category_max];
9407   enum coding_category priorities[coding_category_max];
9408
9409   memset (changed, 0, sizeof changed);
9410
9411   for (i = j = 0; i < nargs; i++)
9412     {
9413       enum coding_category category;
9414       Lisp_Object spec, attrs;
9415
9416       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9417       attrs = AREF (spec, 0);
9418       category = XINT (CODING_ATTR_CATEGORY (attrs));
9419       if (changed[category])
9420         /* Ignore this coding system because a coding system of the
9421            same category already had a higher priority.  */
9422         continue;
9423       changed[category] = 1;
9424       priorities[j++] = category;
9425       if (coding_categories[category].id >= 0
9426           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9427         setup_coding_system (args[i], &coding_categories[category]);
9428       Fset (AREF (Vcoding_category_table, category), args[i]);
9429     }
9430
9431   /* Now we have decided top J priorities.  Reflect the order of the
9432      original priorities to the remaining priorities.  */
9433
9434   for (i = j, j = 0; i < coding_category_max; i++, j++)
9435     {
9436       while (j < coding_category_max
9437              && changed[coding_priorities[j]])
9438         j++;
9439       if (j == coding_category_max)
9440         emacs_abort ();
9441       priorities[i] = coding_priorities[j];
9442     }
9443
9444   memcpy (coding_priorities, priorities, sizeof priorities);
9445
9446   /* Update `coding-category-list'.  */
9447   Vcoding_category_list = Qnil;
9448   for (i = coding_category_max; i-- > 0; )
9449     Vcoding_category_list
9450       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9451                Vcoding_category_list);
9452
9453   return Qnil;
9454 }
9455
9456 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9457        Scoding_system_priority_list, 0, 1, 0,
9458        doc: /* Return a list of coding systems ordered by their priorities.
9459 The list contains a subset of coding systems; i.e. coding systems
9460 assigned to each coding category (see `coding-category-list').
9461
9462 HIGHESTP non-nil means just return the highest priority one.  */)
9463   (Lisp_Object highestp)
9464 {
9465   int i;
9466   Lisp_Object val;
9467
9468   for (i = 0, val = Qnil; i < coding_category_max; i++)
9469     {
9470       enum coding_category category = coding_priorities[i];
9471       int id = coding_categories[category].id;
9472       Lisp_Object attrs;
9473
9474       if (id < 0)
9475         continue;
9476       attrs = CODING_ID_ATTRS (id);
9477       if (! NILP (highestp))
9478         return CODING_ATTR_BASE_NAME (attrs);
9479       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9480     }
9481   return Fnreverse (val);
9482 }
9483
9484 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9485
9486 static Lisp_Object
9487 make_subsidiaries (Lisp_Object base)
9488 {
9489   Lisp_Object subsidiaries;
9490   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9491   char *buf = alloca (base_name_len + 6);
9492   int i;
9493
9494   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9495   subsidiaries = Fmake_vector (make_number (3), Qnil);
9496   for (i = 0; i < 3; i++)
9497     {
9498       strcpy (buf + base_name_len, suffixes[i]);
9499       ASET (subsidiaries, i, intern (buf));
9500     }
9501   return subsidiaries;
9502 }
9503
9504
9505 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9506        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9507        doc: /* For internal use only.
9508 usage: (define-coding-system-internal ...)  */)
9509   (ptrdiff_t nargs, Lisp_Object *args)
9510 {
9511   Lisp_Object name;
9512   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9513   Lisp_Object attrs;            /* Vector of attributes.  */
9514   Lisp_Object eol_type;
9515   Lisp_Object aliases;
9516   Lisp_Object coding_type, charset_list, safe_charsets;
9517   enum coding_category category;
9518   Lisp_Object tail, val;
9519   int max_charset_id = 0;
9520   int i;
9521
9522   if (nargs < coding_arg_max)
9523     goto short_args;
9524
9525   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9526
9527   name = args[coding_arg_name];
9528   CHECK_SYMBOL (name);
9529   ASET (attrs, coding_attr_base_name, name);
9530
9531   val = args[coding_arg_mnemonic];
9532   if (! STRINGP (val))
9533     CHECK_CHARACTER (val);
9534   ASET (attrs, coding_attr_mnemonic, val);
9535
9536   coding_type = args[coding_arg_coding_type];
9537   CHECK_SYMBOL (coding_type);
9538   ASET (attrs, coding_attr_type, coding_type);
9539
9540   charset_list = args[coding_arg_charset_list];
9541   if (SYMBOLP (charset_list))
9542     {
9543       if (EQ (charset_list, Qiso_2022))
9544         {
9545           if (! EQ (coding_type, Qiso_2022))
9546             error ("Invalid charset-list");
9547           charset_list = Viso_2022_charset_list;
9548         }
9549       else if (EQ (charset_list, Qemacs_mule))
9550         {
9551           if (! EQ (coding_type, Qemacs_mule))
9552             error ("Invalid charset-list");
9553           charset_list = Vemacs_mule_charset_list;
9554         }
9555       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9556         {
9557           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9558             error ("Invalid charset-list");
9559           if (max_charset_id < XFASTINT (XCAR (tail)))
9560             max_charset_id = XFASTINT (XCAR (tail));
9561         }
9562     }
9563   else
9564     {
9565       charset_list = Fcopy_sequence (charset_list);
9566       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9567         {
9568           struct charset *charset;
9569
9570           val = XCAR (tail);
9571           CHECK_CHARSET_GET_CHARSET (val, charset);
9572           if (EQ (coding_type, Qiso_2022)
9573               ? CHARSET_ISO_FINAL (charset) < 0
9574               : EQ (coding_type, Qemacs_mule)
9575               ? CHARSET_EMACS_MULE_ID (charset) < 0
9576               : 0)
9577             error ("Can't handle charset `%s'",
9578                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9579
9580           XSETCAR (tail, make_number (charset->id));
9581           if (max_charset_id < charset->id)
9582             max_charset_id = charset->id;
9583         }
9584     }
9585   ASET (attrs, coding_attr_charset_list, charset_list);
9586
9587   safe_charsets = make_uninit_string (max_charset_id + 1);
9588   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9589   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9590     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9591   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9592
9593   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9594
9595   val = args[coding_arg_decode_translation_table];
9596   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9597     CHECK_SYMBOL (val);
9598   ASET (attrs, coding_attr_decode_tbl, val);
9599
9600   val = args[coding_arg_encode_translation_table];
9601   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9602     CHECK_SYMBOL (val);
9603   ASET (attrs, coding_attr_encode_tbl, val);
9604
9605   val = args[coding_arg_post_read_conversion];
9606   CHECK_SYMBOL (val);
9607   ASET (attrs, coding_attr_post_read, val);
9608
9609   val = args[coding_arg_pre_write_conversion];
9610   CHECK_SYMBOL (val);
9611   ASET (attrs, coding_attr_pre_write, val);
9612
9613   val = args[coding_arg_default_char];
9614   if (NILP (val))
9615     ASET (attrs, coding_attr_default_char, make_number (' '));
9616   else
9617     {
9618       CHECK_CHARACTER (val);
9619       ASET (attrs, coding_attr_default_char, val);
9620     }
9621
9622   val = args[coding_arg_for_unibyte];
9623   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9624
9625   val = args[coding_arg_plist];
9626   CHECK_LIST (val);
9627   ASET (attrs, coding_attr_plist, val);
9628
9629   if (EQ (coding_type, Qcharset))
9630     {
9631       /* Generate a lisp vector of 256 elements.  Each element is nil,
9632          integer, or a list of charset IDs.
9633
9634          If Nth element is nil, the byte code N is invalid in this
9635          coding system.
9636
9637          If Nth element is a number NUM, N is the first byte of a
9638          charset whose ID is NUM.
9639
9640          If Nth element is a list of charset IDs, N is the first byte
9641          of one of them.  The list is sorted by dimensions of the
9642          charsets.  A charset of smaller dimension comes first. */
9643       val = Fmake_vector (make_number (256), Qnil);
9644
9645       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9646         {
9647           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9648           int dim = CHARSET_DIMENSION (charset);
9649           int idx = (dim - 1) * 4;
9650
9651           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9652             ASET (attrs, coding_attr_ascii_compat, Qt);
9653
9654           for (i = charset->code_space[idx];
9655                i <= charset->code_space[idx + 1]; i++)
9656             {
9657               Lisp_Object tmp, tmp2;
9658               int dim2;
9659
9660               tmp = AREF (val, i);
9661               if (NILP (tmp))
9662                 tmp = XCAR (tail);
9663               else if (NUMBERP (tmp))
9664                 {
9665                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9666                   if (dim < dim2)
9667                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9668                   else
9669                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9670                 }
9671               else
9672                 {
9673                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9674                     {
9675                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9676                       if (dim < dim2)
9677                         break;
9678                     }
9679                   if (NILP (tmp2))
9680                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9681                   else
9682                     {
9683                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9684                       XSETCAR (tmp2, XCAR (tail));
9685                     }
9686                 }
9687               ASET (val, i, tmp);
9688             }
9689         }
9690       ASET (attrs, coding_attr_charset_valids, val);
9691       category = coding_category_charset;
9692     }
9693   else if (EQ (coding_type, Qccl))
9694     {
9695       Lisp_Object valids;
9696
9697       if (nargs < coding_arg_ccl_max)
9698         goto short_args;
9699
9700       val = args[coding_arg_ccl_decoder];
9701       CHECK_CCL_PROGRAM (val);
9702       if (VECTORP (val))
9703         val = Fcopy_sequence (val);
9704       ASET (attrs, coding_attr_ccl_decoder, val);
9705
9706       val = args[coding_arg_ccl_encoder];
9707       CHECK_CCL_PROGRAM (val);
9708       if (VECTORP (val))
9709         val = Fcopy_sequence (val);
9710       ASET (attrs, coding_attr_ccl_encoder, val);
9711
9712       val = args[coding_arg_ccl_valids];
9713       valids = Fmake_string (make_number (256), make_number (0));
9714       for (tail = val; CONSP (tail); tail = XCDR (tail))
9715         {
9716           int from, to;
9717
9718           val = XCAR (tail);
9719           if (INTEGERP (val))
9720             {
9721               if (! (0 <= XINT (val) && XINT (val) <= 255))
9722                 args_out_of_range_3 (val, make_number (0), make_number (255));
9723               from = to = XINT (val);
9724             }
9725           else
9726             {
9727               CHECK_CONS (val);
9728               CHECK_NATNUM_CAR (val);
9729               CHECK_NUMBER_CDR (val);
9730               if (XINT (XCAR (val)) > 255)
9731                 args_out_of_range_3 (XCAR (val),
9732                                      make_number (0), make_number (255));
9733               from = XINT (XCAR (val));
9734               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9735                 args_out_of_range_3 (XCDR (val),
9736                                      XCAR (val), make_number (255));
9737               to = XINT (XCDR (val));
9738             }
9739           for (i = from; i <= to; i++)
9740             SSET (valids, i, 1);
9741         }
9742       ASET (attrs, coding_attr_ccl_valids, valids);
9743
9744       category = coding_category_ccl;
9745     }
9746   else if (EQ (coding_type, Qutf_16))
9747     {
9748       Lisp_Object bom, endian;
9749
9750       ASET (attrs, coding_attr_ascii_compat, Qnil);
9751
9752       if (nargs < coding_arg_utf16_max)
9753         goto short_args;
9754
9755       bom = args[coding_arg_utf16_bom];
9756       if (! NILP (bom) && ! EQ (bom, Qt))
9757         {
9758           CHECK_CONS (bom);
9759           val = XCAR (bom);
9760           CHECK_CODING_SYSTEM (val);
9761           val = XCDR (bom);
9762           CHECK_CODING_SYSTEM (val);
9763         }
9764       ASET (attrs, coding_attr_utf_bom, bom);
9765
9766       endian = args[coding_arg_utf16_endian];
9767       CHECK_SYMBOL (endian);
9768       if (NILP (endian))
9769         endian = Qbig;
9770       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9771         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9772       ASET (attrs, coding_attr_utf_16_endian, endian);
9773
9774       category = (CONSP (bom)
9775                   ? coding_category_utf_16_auto
9776                   : NILP (bom)
9777                   ? (EQ (endian, Qbig)
9778                      ? coding_category_utf_16_be_nosig
9779                      : coding_category_utf_16_le_nosig)
9780                   : (EQ (endian, Qbig)
9781                      ? coding_category_utf_16_be
9782                      : coding_category_utf_16_le));
9783     }
9784   else if (EQ (coding_type, Qiso_2022))
9785     {
9786       Lisp_Object initial, reg_usage, request, flags;
9787
9788       if (nargs < coding_arg_iso2022_max)
9789         goto short_args;
9790
9791       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9792       CHECK_VECTOR (initial);
9793       for (i = 0; i < 4; i++)
9794         {
9795           val = Faref (initial, make_number (i));
9796           if (! NILP (val))
9797             {
9798               struct charset *charset;
9799
9800               CHECK_CHARSET_GET_CHARSET (val, charset);
9801               ASET (initial, i, make_number (CHARSET_ID (charset)));
9802               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9803                 ASET (attrs, coding_attr_ascii_compat, Qt);
9804             }
9805           else
9806             ASET (initial, i, make_number (-1));
9807         }
9808
9809       reg_usage = args[coding_arg_iso2022_reg_usage];
9810       CHECK_CONS (reg_usage);
9811       CHECK_NUMBER_CAR (reg_usage);
9812       CHECK_NUMBER_CDR (reg_usage);
9813
9814       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9815       for (tail = request; CONSP (tail); tail = XCDR (tail))
9816         {
9817           int id;
9818           Lisp_Object tmp1;
9819
9820           val = XCAR (tail);
9821           CHECK_CONS (val);
9822           tmp1 = XCAR (val);
9823           CHECK_CHARSET_GET_ID (tmp1, id);
9824           CHECK_NATNUM_CDR (val);
9825           if (XINT (XCDR (val)) >= 4)
9826             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9827           XSETCAR (val, make_number (id));
9828         }
9829
9830       flags = args[coding_arg_iso2022_flags];
9831       CHECK_NATNUM (flags);
9832       i = XINT (flags) & INT_MAX;
9833       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9834         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9835       flags = make_number (i);
9836
9837       ASET (attrs, coding_attr_iso_initial, initial);
9838       ASET (attrs, coding_attr_iso_usage, reg_usage);
9839       ASET (attrs, coding_attr_iso_request, request);
9840       ASET (attrs, coding_attr_iso_flags, flags);
9841       setup_iso_safe_charsets (attrs);
9842
9843       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9844         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9845                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9846                     ? coding_category_iso_7_else
9847                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9848                     ? coding_category_iso_7
9849                     : coding_category_iso_7_tight);
9850       else
9851         {
9852           int id = XINT (AREF (initial, 1));
9853
9854           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9855                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9856                        || id < 0)
9857                       ? coding_category_iso_8_else
9858                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9859                       ? coding_category_iso_8_1
9860                       : coding_category_iso_8_2);
9861         }
9862       if (category != coding_category_iso_8_1
9863           && category != coding_category_iso_8_2)
9864         ASET (attrs, coding_attr_ascii_compat, Qnil);
9865     }
9866   else if (EQ (coding_type, Qemacs_mule))
9867     {
9868       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9869         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9870       ASET (attrs, coding_attr_ascii_compat, Qt);
9871       category = coding_category_emacs_mule;
9872     }
9873   else if (EQ (coding_type, Qshift_jis))
9874     {
9875
9876       struct charset *charset;
9877
9878       if (XINT (Flength (charset_list)) != 3
9879           && XINT (Flength (charset_list)) != 4)
9880         error ("There should be three or four charsets");
9881
9882       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9883       if (CHARSET_DIMENSION (charset) != 1)
9884         error ("Dimension of charset %s is not one",
9885                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9886       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9887         ASET (attrs, coding_attr_ascii_compat, Qt);
9888
9889       charset_list = XCDR (charset_list);
9890       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9891       if (CHARSET_DIMENSION (charset) != 1)
9892         error ("Dimension of charset %s is not one",
9893                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9894
9895       charset_list = XCDR (charset_list);
9896       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9897       if (CHARSET_DIMENSION (charset) != 2)
9898         error ("Dimension of charset %s is not two",
9899                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9900
9901       charset_list = XCDR (charset_list);
9902       if (! NILP (charset_list))
9903         {
9904           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9905           if (CHARSET_DIMENSION (charset) != 2)
9906             error ("Dimension of charset %s is not two",
9907                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9908         }
9909
9910       category = coding_category_sjis;
9911       Vsjis_coding_system = name;
9912     }
9913   else if (EQ (coding_type, Qbig5))
9914     {
9915       struct charset *charset;
9916
9917       if (XINT (Flength (charset_list)) != 2)
9918         error ("There should be just two charsets");
9919
9920       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9921       if (CHARSET_DIMENSION (charset) != 1)
9922         error ("Dimension of charset %s is not one",
9923                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9924       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9925         ASET (attrs, coding_attr_ascii_compat, Qt);
9926
9927       charset_list = XCDR (charset_list);
9928       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9929       if (CHARSET_DIMENSION (charset) != 2)
9930         error ("Dimension of charset %s is not two",
9931                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9932
9933       category = coding_category_big5;
9934       Vbig5_coding_system = name;
9935     }
9936   else if (EQ (coding_type, Qraw_text))
9937     {
9938       category = coding_category_raw_text;
9939       ASET (attrs, coding_attr_ascii_compat, Qt);
9940     }
9941   else if (EQ (coding_type, Qutf_8))
9942     {
9943       Lisp_Object bom;
9944
9945       if (nargs < coding_arg_utf8_max)
9946         goto short_args;
9947
9948       bom = args[coding_arg_utf8_bom];
9949       if (! NILP (bom) && ! EQ (bom, Qt))
9950         {
9951           CHECK_CONS (bom);
9952           val = XCAR (bom);
9953           CHECK_CODING_SYSTEM (val);
9954           val = XCDR (bom);
9955           CHECK_CODING_SYSTEM (val);
9956         }
9957       ASET (attrs, coding_attr_utf_bom, bom);
9958       if (NILP (bom))
9959         ASET (attrs, coding_attr_ascii_compat, Qt);
9960
9961       category = (CONSP (bom) ? coding_category_utf_8_auto
9962                   : NILP (bom) ? coding_category_utf_8_nosig
9963                   : coding_category_utf_8_sig);
9964     }
9965   else if (EQ (coding_type, Qundecided))
9966     category = coding_category_undecided;
9967   else
9968     error ("Invalid coding system type: %s",
9969            SDATA (SYMBOL_NAME (coding_type)));
9970
9971   ASET (attrs, coding_attr_category, make_number (category));
9972   ASET (attrs, coding_attr_plist,
9973         Fcons (QCcategory,
9974                Fcons (AREF (Vcoding_category_table, category),
9975                       CODING_ATTR_PLIST (attrs))));
9976   ASET (attrs, coding_attr_plist,
9977         Fcons (QCascii_compatible_p,
9978                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9979                       CODING_ATTR_PLIST (attrs))));
9980
9981   eol_type = args[coding_arg_eol_type];
9982   if (! NILP (eol_type)
9983       && ! EQ (eol_type, Qunix)
9984       && ! EQ (eol_type, Qdos)
9985       && ! EQ (eol_type, Qmac))
9986     error ("Invalid eol-type");
9987
9988   aliases = Fcons (name, Qnil);
9989
9990   if (NILP (eol_type))
9991     {
9992       eol_type = make_subsidiaries (name);
9993       for (i = 0; i < 3; i++)
9994         {
9995           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9996
9997           this_name = AREF (eol_type, i);
9998           this_aliases = Fcons (this_name, Qnil);
9999           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10000           this_spec = Fmake_vector (make_number (3), attrs);
10001           ASET (this_spec, 1, this_aliases);
10002           ASET (this_spec, 2, this_eol_type);
10003           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10004           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10005           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10006           if (NILP (val))
10007             Vcoding_system_alist
10008               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10009                        Vcoding_system_alist);
10010         }
10011     }
10012
10013   spec_vec = Fmake_vector (make_number (3), attrs);
10014   ASET (spec_vec, 1, aliases);
10015   ASET (spec_vec, 2, eol_type);
10016
10017   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10018   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10019   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10020   if (NILP (val))
10021     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10022                                   Vcoding_system_alist);
10023
10024   {
10025     int id = coding_categories[category].id;
10026
10027     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10028       setup_coding_system (name, &coding_categories[category]);
10029   }
10030
10031   return Qnil;
10032
10033  short_args:
10034   return Fsignal (Qwrong_number_of_arguments,
10035                   Fcons (intern ("define-coding-system-internal"),
10036                          make_number (nargs)));
10037 }
10038
10039
10040 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10041        3, 3, 0,
10042        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10043   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10044 {
10045   Lisp_Object spec, attrs;
10046
10047   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10048   attrs = AREF (spec, 0);
10049   if (EQ (prop, QCmnemonic))
10050     {
10051       if (! STRINGP (val))
10052         CHECK_CHARACTER (val);
10053       ASET (attrs, coding_attr_mnemonic, val);
10054     }
10055   else if (EQ (prop, QCdefault_char))
10056     {
10057       if (NILP (val))
10058         val = make_number (' ');
10059       else
10060         CHECK_CHARACTER (val);
10061       ASET (attrs, coding_attr_default_char, val);
10062     }
10063   else if (EQ (prop, QCdecode_translation_table))
10064     {
10065       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10066         CHECK_SYMBOL (val);
10067       ASET (attrs, coding_attr_decode_tbl, val);
10068     }
10069   else if (EQ (prop, QCencode_translation_table))
10070     {
10071       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10072         CHECK_SYMBOL (val);
10073       ASET (attrs, coding_attr_encode_tbl, val);
10074     }
10075   else if (EQ (prop, QCpost_read_conversion))
10076     {
10077       CHECK_SYMBOL (val);
10078       ASET (attrs, coding_attr_post_read, val);
10079     }
10080   else if (EQ (prop, QCpre_write_conversion))
10081     {
10082       CHECK_SYMBOL (val);
10083       ASET (attrs, coding_attr_pre_write, val);
10084     }
10085   else if (EQ (prop, QCascii_compatible_p))
10086     {
10087       ASET (attrs, coding_attr_ascii_compat, val);
10088     }
10089
10090   ASET (attrs, coding_attr_plist,
10091         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10092   return val;
10093 }
10094
10095
10096 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10097        Sdefine_coding_system_alias, 2, 2, 0,
10098        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10099   (Lisp_Object alias, Lisp_Object coding_system)
10100 {
10101   Lisp_Object spec, aliases, eol_type, val;
10102
10103   CHECK_SYMBOL (alias);
10104   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10105   aliases = AREF (spec, 1);
10106   /* ALIASES should be a list of length more than zero, and the first
10107      element is a base coding system.  Append ALIAS at the tail of the
10108      list.  */
10109   while (!NILP (XCDR (aliases)))
10110     aliases = XCDR (aliases);
10111   XSETCDR (aliases, Fcons (alias, Qnil));
10112
10113   eol_type = AREF (spec, 2);
10114   if (VECTORP (eol_type))
10115     {
10116       Lisp_Object subsidiaries;
10117       int i;
10118
10119       subsidiaries = make_subsidiaries (alias);
10120       for (i = 0; i < 3; i++)
10121         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10122                                      AREF (eol_type, i));
10123     }
10124
10125   Fputhash (alias, spec, Vcoding_system_hash_table);
10126   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10127   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10128   if (NILP (val))
10129     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10130                                   Vcoding_system_alist);
10131
10132   return Qnil;
10133 }
10134
10135 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10136        1, 1, 0,
10137        doc: /* Return the base of CODING-SYSTEM.
10138 Any alias or subsidiary coding system is not a base coding system.  */)
10139   (Lisp_Object coding_system)
10140 {
10141   Lisp_Object spec, attrs;
10142
10143   if (NILP (coding_system))
10144     return (Qno_conversion);
10145   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10146   attrs = AREF (spec, 0);
10147   return CODING_ATTR_BASE_NAME (attrs);
10148 }
10149
10150 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10151        1, 1, 0,
10152        doc: "Return the property list of CODING-SYSTEM.")
10153   (Lisp_Object coding_system)
10154 {
10155   Lisp_Object spec, attrs;
10156
10157   if (NILP (coding_system))
10158     coding_system = Qno_conversion;
10159   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10160   attrs = AREF (spec, 0);
10161   return CODING_ATTR_PLIST (attrs);
10162 }
10163
10164
10165 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10166        1, 1, 0,
10167        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10168   (Lisp_Object coding_system)
10169 {
10170   Lisp_Object spec;
10171
10172   if (NILP (coding_system))
10173     coding_system = Qno_conversion;
10174   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10175   return AREF (spec, 1);
10176 }
10177
10178 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10179        Scoding_system_eol_type, 1, 1, 0,
10180        doc: /* Return eol-type of CODING-SYSTEM.
10181 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10182
10183 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10184 and CR respectively.
10185
10186 A vector value indicates that a format of end-of-line should be
10187 detected automatically.  Nth element of the vector is the subsidiary
10188 coding system whose eol-type is N.  */)
10189   (Lisp_Object coding_system)
10190 {
10191   Lisp_Object spec, eol_type;
10192   int n;
10193
10194   if (NILP (coding_system))
10195     coding_system = Qno_conversion;
10196   if (! CODING_SYSTEM_P (coding_system))
10197     return Qnil;
10198   spec = CODING_SYSTEM_SPEC (coding_system);
10199   eol_type = AREF (spec, 2);
10200   if (VECTORP (eol_type))
10201     return Fcopy_sequence (eol_type);
10202   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10203   return make_number (n);
10204 }
10205
10206 #endif /* emacs */
10207
10208 \f
10209 /*** 9. Post-amble ***/
10210
10211 void
10212 init_coding_once (void)
10213 {
10214   int i;
10215
10216   for (i = 0; i < coding_category_max; i++)
10217     {
10218       coding_categories[i].id = -1;
10219       coding_priorities[i] = i;
10220     }
10221
10222   /* ISO2022 specific initialize routine.  */
10223   for (i = 0; i < 0x20; i++)
10224     iso_code_class[i] = ISO_control_0;
10225   for (i = 0x21; i < 0x7F; i++)
10226     iso_code_class[i] = ISO_graphic_plane_0;
10227   for (i = 0x80; i < 0xA0; i++)
10228     iso_code_class[i] = ISO_control_1;
10229   for (i = 0xA1; i < 0xFF; i++)
10230     iso_code_class[i] = ISO_graphic_plane_1;
10231   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10232   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10233   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10234   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10235   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10236   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10237   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10238   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10239   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10240
10241   for (i = 0; i < 256; i++)
10242     {
10243       emacs_mule_bytes[i] = 1;
10244     }
10245   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10246   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10247   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10248   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10249 }
10250
10251 #ifdef emacs
10252
10253 void
10254 syms_of_coding (void)
10255 {
10256   staticpro (&Vcoding_system_hash_table);
10257   {
10258     Lisp_Object args[2];
10259     args[0] = QCtest;
10260     args[1] = Qeq;
10261     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10262   }
10263
10264   staticpro (&Vsjis_coding_system);
10265   Vsjis_coding_system = Qnil;
10266
10267   staticpro (&Vbig5_coding_system);
10268   Vbig5_coding_system = Qnil;
10269
10270   staticpro (&Vcode_conversion_reused_workbuf);
10271   Vcode_conversion_reused_workbuf = Qnil;
10272
10273   staticpro (&Vcode_conversion_workbuf_name);
10274   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10275
10276   reused_workbuf_in_use = 0;
10277
10278   DEFSYM (Qcharset, "charset");
10279   DEFSYM (Qtarget_idx, "target-idx");
10280   DEFSYM (Qcoding_system_history, "coding-system-history");
10281   Fset (Qcoding_system_history, Qnil);
10282
10283   /* Target FILENAME is the first argument.  */
10284   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10285   /* Target FILENAME is the third argument.  */
10286   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10287
10288   DEFSYM (Qcall_process, "call-process");
10289   /* Target PROGRAM is the first argument.  */
10290   Fput (Qcall_process, Qtarget_idx, make_number (0));
10291
10292   DEFSYM (Qcall_process_region, "call-process-region");
10293   /* Target PROGRAM is the third argument.  */
10294   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10295
10296   DEFSYM (Qstart_process, "start-process");
10297   /* Target PROGRAM is the third argument.  */
10298   Fput (Qstart_process, Qtarget_idx, make_number (2));
10299
10300   DEFSYM (Qopen_network_stream, "open-network-stream");
10301   /* Target SERVICE is the fourth argument.  */
10302   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10303
10304   DEFSYM (Qcoding_system, "coding-system");
10305   DEFSYM (Qcoding_aliases, "coding-aliases");
10306
10307   DEFSYM (Qeol_type, "eol-type");
10308   DEFSYM (Qunix, "unix");
10309   DEFSYM (Qdos, "dos");
10310
10311   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10312   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10313   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10314   DEFSYM (Qdefault_char, "default-char");
10315   DEFSYM (Qundecided, "undecided");
10316   DEFSYM (Qno_conversion, "no-conversion");
10317   DEFSYM (Qraw_text, "raw-text");
10318
10319   DEFSYM (Qiso_2022, "iso-2022");
10320
10321   DEFSYM (Qutf_8, "utf-8");
10322   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10323
10324 #if defined (WINDOWSNT) || defined (HAVE_NTGUI)
10325   /* No, not utf-16-le: that one has a BOM.  */
10326   DEFSYM (Qutf_16le, "utf-16le");
10327 #endif
10328
10329   DEFSYM (Qutf_16, "utf-16");
10330   DEFSYM (Qbig, "big");
10331   DEFSYM (Qlittle, "little");
10332
10333   DEFSYM (Qshift_jis, "shift-jis");
10334   DEFSYM (Qbig5, "big5");
10335
10336   DEFSYM (Qcoding_system_p, "coding-system-p");
10337
10338   DEFSYM (Qcoding_system_error, "coding-system-error");
10339   Fput (Qcoding_system_error, Qerror_conditions,
10340         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10341   Fput (Qcoding_system_error, Qerror_message,
10342         build_pure_c_string ("Invalid coding system"));
10343
10344   /* Intern this now in case it isn't already done.
10345      Setting this variable twice is harmless.
10346      But don't staticpro it here--that is done in alloc.c.  */
10347   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10348
10349   DEFSYM (Qtranslation_table, "translation-table");
10350   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10351   DEFSYM (Qtranslation_table_id, "translation-table-id");
10352   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10353   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10354
10355   DEFSYM (Qvalid_codes, "valid-codes");
10356
10357   DEFSYM (Qemacs_mule, "emacs-mule");
10358
10359   DEFSYM (QCcategory, ":category");
10360   DEFSYM (QCmnemonic, ":mnemonic");
10361   DEFSYM (QCdefault_char, ":default-char");
10362   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10363   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10364   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10365   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10366   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10367
10368   Vcoding_category_table
10369     = Fmake_vector (make_number (coding_category_max), Qnil);
10370   staticpro (&Vcoding_category_table);
10371   /* Followings are target of code detection.  */
10372   ASET (Vcoding_category_table, coding_category_iso_7,
10373         intern_c_string ("coding-category-iso-7"));
10374   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10375         intern_c_string ("coding-category-iso-7-tight"));
10376   ASET (Vcoding_category_table, coding_category_iso_8_1,
10377         intern_c_string ("coding-category-iso-8-1"));
10378   ASET (Vcoding_category_table, coding_category_iso_8_2,
10379         intern_c_string ("coding-category-iso-8-2"));
10380   ASET (Vcoding_category_table, coding_category_iso_7_else,
10381         intern_c_string ("coding-category-iso-7-else"));
10382   ASET (Vcoding_category_table, coding_category_iso_8_else,
10383         intern_c_string ("coding-category-iso-8-else"));
10384   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10385         intern_c_string ("coding-category-utf-8-auto"));
10386   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10387         intern_c_string ("coding-category-utf-8"));
10388   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10389         intern_c_string ("coding-category-utf-8-sig"));
10390   ASET (Vcoding_category_table, coding_category_utf_16_be,
10391         intern_c_string ("coding-category-utf-16-be"));
10392   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10393         intern_c_string ("coding-category-utf-16-auto"));
10394   ASET (Vcoding_category_table, coding_category_utf_16_le,
10395         intern_c_string ("coding-category-utf-16-le"));
10396   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10397         intern_c_string ("coding-category-utf-16-be-nosig"));
10398   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10399         intern_c_string ("coding-category-utf-16-le-nosig"));
10400   ASET (Vcoding_category_table, coding_category_charset,
10401         intern_c_string ("coding-category-charset"));
10402   ASET (Vcoding_category_table, coding_category_sjis,
10403         intern_c_string ("coding-category-sjis"));
10404   ASET (Vcoding_category_table, coding_category_big5,
10405         intern_c_string ("coding-category-big5"));
10406   ASET (Vcoding_category_table, coding_category_ccl,
10407         intern_c_string ("coding-category-ccl"));
10408   ASET (Vcoding_category_table, coding_category_emacs_mule,
10409         intern_c_string ("coding-category-emacs-mule"));
10410   /* Followings are NOT target of code detection.  */
10411   ASET (Vcoding_category_table, coding_category_raw_text,
10412         intern_c_string ("coding-category-raw-text"));
10413   ASET (Vcoding_category_table, coding_category_undecided,
10414         intern_c_string ("coding-category-undecided"));
10415
10416   DEFSYM (Qinsufficient_source, "insufficient-source");
10417   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10418   DEFSYM (Qinvalid_source, "invalid-source");
10419   DEFSYM (Qinterrupted, "interrupted");
10420   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10421   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10422
10423   defsubr (&Scoding_system_p);
10424   defsubr (&Sread_coding_system);
10425   defsubr (&Sread_non_nil_coding_system);
10426   defsubr (&Scheck_coding_system);
10427   defsubr (&Sdetect_coding_region);
10428   defsubr (&Sdetect_coding_string);
10429   defsubr (&Sfind_coding_systems_region_internal);
10430   defsubr (&Sunencodable_char_position);
10431   defsubr (&Scheck_coding_systems_region);
10432   defsubr (&Sdecode_coding_region);
10433   defsubr (&Sencode_coding_region);
10434   defsubr (&Sdecode_coding_string);
10435   defsubr (&Sencode_coding_string);
10436   defsubr (&Sdecode_sjis_char);
10437   defsubr (&Sencode_sjis_char);
10438   defsubr (&Sdecode_big5_char);
10439   defsubr (&Sencode_big5_char);
10440   defsubr (&Sset_terminal_coding_system_internal);
10441   defsubr (&Sset_safe_terminal_coding_system_internal);
10442   defsubr (&Sterminal_coding_system);
10443   defsubr (&Sset_keyboard_coding_system_internal);
10444   defsubr (&Skeyboard_coding_system);
10445   defsubr (&Sfind_operation_coding_system);
10446   defsubr (&Sset_coding_system_priority);
10447   defsubr (&Sdefine_coding_system_internal);
10448   defsubr (&Sdefine_coding_system_alias);
10449   defsubr (&Scoding_system_put);
10450   defsubr (&Scoding_system_base);
10451   defsubr (&Scoding_system_plist);
10452   defsubr (&Scoding_system_aliases);
10453   defsubr (&Scoding_system_eol_type);
10454   defsubr (&Scoding_system_priority_list);
10455
10456   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10457                doc: /* List of coding systems.
10458
10459 Do not alter the value of this variable manually.  This variable should be
10460 updated by the functions `define-coding-system' and
10461 `define-coding-system-alias'.  */);
10462   Vcoding_system_list = Qnil;
10463
10464   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10465                doc: /* Alist of coding system names.
10466 Each element is one element list of coding system name.
10467 This variable is given to `completing-read' as COLLECTION argument.
10468
10469 Do not alter the value of this variable manually.  This variable should be
10470 updated by the functions `make-coding-system' and
10471 `define-coding-system-alias'.  */);
10472   Vcoding_system_alist = Qnil;
10473
10474   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10475                doc: /* List of coding-categories (symbols) ordered by priority.
10476
10477 On detecting a coding system, Emacs tries code detection algorithms
10478 associated with each coding-category one by one in this order.  When
10479 one algorithm agrees with a byte sequence of source text, the coding
10480 system bound to the corresponding coding-category is selected.
10481
10482 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10483   {
10484     int i;
10485
10486     Vcoding_category_list = Qnil;
10487     for (i = coding_category_max - 1; i >= 0; i--)
10488       Vcoding_category_list
10489         = Fcons (AREF (Vcoding_category_table, i),
10490                  Vcoding_category_list);
10491   }
10492
10493   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10494                doc: /* Specify the coding system for read operations.
10495 It is useful to bind this variable with `let', but do not set it globally.
10496 If the value is a coding system, it is used for decoding on read operation.
10497 If not, an appropriate element is used from one of the coding system alists.
10498 There are three such tables: `file-coding-system-alist',
10499 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10500   Vcoding_system_for_read = Qnil;
10501
10502   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10503                doc: /* Specify the coding system for write operations.
10504 Programs bind this variable with `let', but you should not set it globally.
10505 If the value is a coding system, it is used for encoding of output,
10506 when writing it to a file and when sending it to a file or subprocess.
10507
10508 If this does not specify a coding system, an appropriate element
10509 is used from one of the coding system alists.
10510 There are three such tables: `file-coding-system-alist',
10511 `process-coding-system-alist', and `network-coding-system-alist'.
10512 For output to files, if the above procedure does not specify a coding system,
10513 the value of `buffer-file-coding-system' is used.  */);
10514   Vcoding_system_for_write = Qnil;
10515
10516   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10517                doc: /*
10518 Coding system used in the latest file or process I/O.  */);
10519   Vlast_coding_system_used = Qnil;
10520
10521   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10522                doc: /*
10523 Error status of the last code conversion.
10524
10525 When an error was detected in the last code conversion, this variable
10526 is set to one of the following symbols.
10527   `insufficient-source'
10528   `inconsistent-eol'
10529   `invalid-source'
10530   `interrupted'
10531   `insufficient-memory'
10532 When no error was detected, the value doesn't change.  So, to check
10533 the error status of a code conversion by this variable, you must
10534 explicitly set this variable to nil before performing code
10535 conversion.  */);
10536   Vlast_code_conversion_error = Qnil;
10537
10538   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10539                doc: /*
10540 *Non-nil means always inhibit code conversion of end-of-line format.
10541 See info node `Coding Systems' and info node `Text and Binary' concerning
10542 such conversion.  */);
10543   inhibit_eol_conversion = 0;
10544
10545   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10546                doc: /*
10547 Non-nil means process buffer inherits coding system of process output.
10548 Bind it to t if the process output is to be treated as if it were a file
10549 read from some filesystem.  */);
10550   inherit_process_coding_system = 0;
10551
10552   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10553                doc: /*
10554 Alist to decide a coding system to use for a file I/O operation.
10555 The format is ((PATTERN . VAL) ...),
10556 where PATTERN is a regular expression matching a file name,
10557 VAL is a coding system, a cons of coding systems, or a function symbol.
10558 If VAL is a coding system, it is used for both decoding and encoding
10559 the file contents.
10560 If VAL is a cons of coding systems, the car part is used for decoding,
10561 and the cdr part is used for encoding.
10562 If VAL is a function symbol, the function must return a coding system
10563 or a cons of coding systems which are used as above.  The function is
10564 called with an argument that is a list of the arguments with which
10565 `find-operation-coding-system' was called.  If the function can't decide
10566 a coding system, it can return `undecided' so that the normal
10567 code-detection is performed.
10568
10569 See also the function `find-operation-coding-system'
10570 and the variable `auto-coding-alist'.  */);
10571   Vfile_coding_system_alist = Qnil;
10572
10573   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10574                doc: /*
10575 Alist to decide a coding system to use for a process I/O operation.
10576 The format is ((PATTERN . VAL) ...),
10577 where PATTERN is a regular expression matching a program name,
10578 VAL is a coding system, a cons of coding systems, or a function symbol.
10579 If VAL is a coding system, it is used for both decoding what received
10580 from the program and encoding what sent to the program.
10581 If VAL is a cons of coding systems, the car part is used for decoding,
10582 and the cdr part is used for encoding.
10583 If VAL is a function symbol, the function must return a coding system
10584 or a cons of coding systems which are used as above.
10585
10586 See also the function `find-operation-coding-system'.  */);
10587   Vprocess_coding_system_alist = Qnil;
10588
10589   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10590                doc: /*
10591 Alist to decide a coding system to use for a network I/O operation.
10592 The format is ((PATTERN . VAL) ...),
10593 where PATTERN is a regular expression matching a network service name
10594 or is a port number to connect to,
10595 VAL is a coding system, a cons of coding systems, or a function symbol.
10596 If VAL is a coding system, it is used for both decoding what received
10597 from the network stream and encoding what sent to the network stream.
10598 If VAL is a cons of coding systems, the car part is used for decoding,
10599 and the cdr part is used for encoding.
10600 If VAL is a function symbol, the function must return a coding system
10601 or a cons of coding systems which are used as above.
10602
10603 See also the function `find-operation-coding-system'.  */);
10604   Vnetwork_coding_system_alist = Qnil;
10605
10606   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10607                doc: /* Coding system to use with system messages.
10608 Also used for decoding keyboard input on X Window system.  */);
10609   Vlocale_coding_system = Qnil;
10610
10611   /* The eol mnemonics are reset in startup.el system-dependently.  */
10612   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10613                doc: /*
10614 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10615   eol_mnemonic_unix = build_pure_c_string (":");
10616
10617   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10618                doc: /*
10619 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10620   eol_mnemonic_dos = build_pure_c_string ("\\");
10621
10622   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10623                doc: /*
10624 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10625   eol_mnemonic_mac = build_pure_c_string ("/");
10626
10627   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10628                doc: /*
10629 *String displayed in mode line when end-of-line format is not yet determined.  */);
10630   eol_mnemonic_undecided = build_pure_c_string (":");
10631
10632   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10633                doc: /*
10634 *Non-nil enables character translation while encoding and decoding.  */);
10635   Venable_character_translation = Qt;
10636
10637   DEFVAR_LISP ("standard-translation-table-for-decode",
10638                Vstandard_translation_table_for_decode,
10639                doc: /* Table for translating characters while decoding.  */);
10640   Vstandard_translation_table_for_decode = Qnil;
10641
10642   DEFVAR_LISP ("standard-translation-table-for-encode",
10643                Vstandard_translation_table_for_encode,
10644                doc: /* Table for translating characters while encoding.  */);
10645   Vstandard_translation_table_for_encode = Qnil;
10646
10647   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10648                doc: /* Alist of charsets vs revision numbers.
10649 While encoding, if a charset (car part of an element) is found,
10650 designate it with the escape sequence identifying revision (cdr part
10651 of the element).  */);
10652   Vcharset_revision_table = Qnil;
10653
10654   DEFVAR_LISP ("default-process-coding-system",
10655                Vdefault_process_coding_system,
10656                doc: /* Cons of coding systems used for process I/O by default.
10657 The car part is used for decoding a process output,
10658 the cdr part is used for encoding a text to be sent to a process.  */);
10659   Vdefault_process_coding_system = Qnil;
10660
10661   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10662                doc: /*
10663 Table of extra Latin codes in the range 128..159 (inclusive).
10664 This is a vector of length 256.
10665 If Nth element is non-nil, the existence of code N in a file
10666 \(or output of subprocess) doesn't prevent it to be detected as
10667 a coding system of ISO 2022 variant which has a flag
10668 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10669 or reading output of a subprocess.
10670 Only 128th through 159th elements have a meaning.  */);
10671   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10672
10673   DEFVAR_LISP ("select-safe-coding-system-function",
10674                Vselect_safe_coding_system_function,
10675                doc: /*
10676 Function to call to select safe coding system for encoding a text.
10677
10678 If set, this function is called to force a user to select a proper
10679 coding system which can encode the text in the case that a default
10680 coding system used in each operation can't encode the text.  The
10681 function should take care that the buffer is not modified while
10682 the coding system is being selected.
10683
10684 The default value is `select-safe-coding-system' (which see).  */);
10685   Vselect_safe_coding_system_function = Qnil;
10686
10687   DEFVAR_BOOL ("coding-system-require-warning",
10688                coding_system_require_warning,
10689                doc: /* Internal use only.
10690 If non-nil, on writing a file, `select-safe-coding-system-function' is
10691 called even if `coding-system-for-write' is non-nil.  The command
10692 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10693   coding_system_require_warning = 0;
10694
10695
10696   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10697                inhibit_iso_escape_detection,
10698                doc: /*
10699 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10700
10701 When Emacs reads text, it tries to detect how the text is encoded.
10702 This code detection is sensitive to escape sequences.  If Emacs sees
10703 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10704 of the ISO2022 encodings, and decodes text by the corresponding coding
10705 system (e.g. `iso-2022-7bit').
10706
10707 However, there may be a case that you want to read escape sequences in
10708 a file as is.  In such a case, you can set this variable to non-nil.
10709 Then the code detection will ignore any escape sequences, and no text is
10710 detected as encoded in some ISO-2022 encoding.  The result is that all
10711 escape sequences become visible in a buffer.
10712
10713 The default value is nil, and it is strongly recommended not to change
10714 it.  That is because many Emacs Lisp source files that contain
10715 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10716 in Emacs's distribution, and they won't be decoded correctly on
10717 reading if you suppress escape sequence detection.
10718
10719 The other way to read escape sequences in a file without decoding is
10720 to explicitly specify some coding system that doesn't use ISO-2022
10721 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10722   inhibit_iso_escape_detection = 0;
10723
10724   DEFVAR_BOOL ("inhibit-null-byte-detection",
10725                inhibit_null_byte_detection,
10726                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10727 By default, Emacs treats it as binary data, and does not attempt to
10728 decode it.  The effect is as if you specified `no-conversion' for
10729 reading that text.
10730
10731 Set this to non-nil when a regular text happens to include null bytes.
10732 Examples are Index nodes of Info files and null-byte delimited output
10733 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10734 decode text as usual.  */);
10735   inhibit_null_byte_detection = 0;
10736
10737   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10738                doc: /* Char table for translating self-inserting characters.
10739 This is applied to the result of input methods, not their input.
10740 See also `keyboard-translate-table'.
10741
10742 Use of this variable for character code unification was rendered
10743 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10744 internal character representation.  */);
10745     Vtranslation_table_for_input = Qnil;
10746
10747   {
10748     Lisp_Object args[coding_arg_max];
10749     Lisp_Object plist[16];
10750     int i;
10751
10752     for (i = 0; i < coding_arg_max; i++)
10753       args[i] = Qnil;
10754
10755     plist[0] = intern_c_string (":name");
10756     plist[1] = args[coding_arg_name] = Qno_conversion;
10757     plist[2] = intern_c_string (":mnemonic");
10758     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10759     plist[4] = intern_c_string (":coding-type");
10760     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10761     plist[6] = intern_c_string (":ascii-compatible-p");
10762     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10763     plist[8] = intern_c_string (":default-char");
10764     plist[9] = args[coding_arg_default_char] = make_number (0);
10765     plist[10] = intern_c_string (":for-unibyte");
10766     plist[11] = args[coding_arg_for_unibyte] = Qt;
10767     plist[12] = intern_c_string (":docstring");
10768     plist[13] = build_pure_c_string ("Do no conversion.\n\
10769 \n\
10770 When you visit a file with this coding, the file is read into a\n\
10771 unibyte buffer as is, thus each byte of a file is treated as a\n\
10772 character.");
10773     plist[14] = intern_c_string (":eol-type");
10774     plist[15] = args[coding_arg_eol_type] = Qunix;
10775     args[coding_arg_plist] = Flist (16, plist);
10776     Fdefine_coding_system_internal (coding_arg_max, args);
10777
10778     plist[1] = args[coding_arg_name] = Qundecided;
10779     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10780     plist[5] = args[coding_arg_coding_type] = Qundecided;
10781     /* This is already set.
10782        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10783     plist[8] = intern_c_string (":charset-list");
10784     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10785     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10786     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10787     plist[15] = args[coding_arg_eol_type] = Qnil;
10788     args[coding_arg_plist] = Flist (16, plist);
10789     Fdefine_coding_system_internal (coding_arg_max, args);
10790   }
10791
10792   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10793
10794   {
10795     int i;
10796
10797     for (i = 0; i < coding_category_max; i++)
10798       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10799   }
10800 #if defined (DOS_NT)
10801   system_eol_type = Qdos;
10802 #else
10803   system_eol_type = Qunix;
10804 #endif
10805   staticpro (&system_eol_type);
10806 }
10807
10808 char *
10809 emacs_strerror (int error_number)
10810 {
10811   char *str;
10812
10813   synchronize_system_messages_locale ();
10814   str = strerror (error_number);
10815
10816   if (! NILP (Vlocale_coding_system))
10817     {
10818       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10819                                                       Vlocale_coding_system,
10820                                                       0);
10821       str = SSDATA (dec);
10822     }
10823
10824   return str;
10825 }
10826
10827 #endif /* emacs */