src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on the operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is a two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO: `rejected' on a violation, `found' on strong evidence.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303 /* Presumably the symbol-to-attributes table described in section 0.  */
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following variables (only one remains here).  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372 /* Integer at index REG of the coding_attr_iso_initial attribute.  */
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378 /* safe_charsets[CHARSET_ID]; -1 if that is 255 or ID > max_charset_id.  */
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386 /* Accessors for the members of CODING->spec.iso_2022.  */
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    following.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 4 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487 /* NOTE(review): the four flag bits below have no description here.  */
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 497
 498 /* A character to be produced on output if encoding of the original
 499    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 501
 502 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 503 #define CODING_UTF_8_BOM(coding)        \
 504   ((coding)->spec.utf_8_bom)
 505
 506 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 507 #define CODING_UTF_16_BOM(coding)       \
 508   ((coding)->spec.utf_16.bom)
 509
 510 #define CODING_UTF_16_ENDIAN(coding)    \
 511   ((coding)->spec.utf_16.endian)
 512
 513 #define CODING_UTF_16_SURROGATE(coding) \
 514   ((coding)->spec.utf_16.surrogate)
 515
 516
 517 /* CCL section: attributes fetched via CODING_ID_ATTRS (CODING->id).  */
 518 #define CODING_CCL_DECODER(coding)      \
 519   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 520 #define CODING_CCL_ENCODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 522 #define CODING_CCL_VALIDS(coding)                                          \
 523   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 524
 525 /* Index for each coding category in `coding_categories'.  The value
 526    is also the bit position used by the CATEGORY_MASK_XXX macros.  */
 527 enum coding_category
 528   {
 529     coding_category_iso_7,
 530     coding_category_iso_7_tight,
 531     coding_category_iso_8_1,
 532     coding_category_iso_8_2,
 533     coding_category_iso_7_else,
 534     coding_category_iso_8_else,
 535     coding_category_utf_8_auto,
 536     coding_category_utf_8_nosig,
 537     coding_category_utf_8_sig,
 538     coding_category_utf_16_auto,
 539     coding_category_utf_16_be,
 540     coding_category_utf_16_le,
 541     coding_category_utf_16_be_nosig,
 542     coding_category_utf_16_le_nosig,
 543     coding_category_charset,
 544     coding_category_sjis,
 545     coding_category_big5,
 546     coding_category_ccl,
 547     coding_category_emacs_mule,
 548     /* All above are targets of code detection.  */
 549     coding_category_raw_text,
 550     coding_category_undecided,
 551     coding_category_max
 552   };
 553
 554 /* Definitions of flag bits used in detect_coding_XXXX.  */
 555 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 556 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 557 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 558 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 559 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 560 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 561 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 562 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 563 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 564 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 565 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 566 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 567 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 568 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 569 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 570 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 571 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 572 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 573 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 574 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 575
 576 /* This value is returned if detect_coding_mask () finds nothing other
 577    than ASCII characters.  It omits CATEGORY_MASK_RAW_TEXT.  */
 578 #define CATEGORY_MASK_ANY               \
 579   (CATEGORY_MASK_ISO_7                  \
 580    | CATEGORY_MASK_ISO_7_TIGHT          \
 581    | CATEGORY_MASK_ISO_8_1              \
 582    | CATEGORY_MASK_ISO_8_2              \
 583    | CATEGORY_MASK_ISO_7_ELSE           \
 584    | CATEGORY_MASK_ISO_8_ELSE           \
 585    | CATEGORY_MASK_UTF_8_AUTO           \
 586    | CATEGORY_MASK_UTF_8_NOSIG          \
 587    | CATEGORY_MASK_UTF_8_SIG            \
 588    | CATEGORY_MASK_UTF_16_AUTO          \
 589    | CATEGORY_MASK_UTF_16_BE            \
 590    | CATEGORY_MASK_UTF_16_LE            \
 591    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 592    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 593    | CATEGORY_MASK_CHARSET              \
 594    | CATEGORY_MASK_SJIS                 \
 595    | CATEGORY_MASK_BIG5                 \
 596    | CATEGORY_MASK_CCL                  \
 597    | CATEGORY_MASK_EMACS_MULE)
 598
 599 /* Unions of related category masks.  */
 600 #define CATEGORY_MASK_ISO_7BIT \
 601   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 602
 603 #define CATEGORY_MASK_ISO_8BIT \
 604   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 605
 606 #define CATEGORY_MASK_ISO_ELSE \
 607   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 608
 609 #define CATEGORY_MASK_ISO_ESCAPE        \
 610   (CATEGORY_MASK_ISO_7                  \
 611    | CATEGORY_MASK_ISO_7_TIGHT          \
 612    | CATEGORY_MASK_ISO_7_ELSE           \
 613    | CATEGORY_MASK_ISO_8_ELSE)
 614
 615 #define CATEGORY_MASK_ISO       \
 616   (  CATEGORY_MASK_ISO_7BIT     \
 617      | CATEGORY_MASK_ISO_8BIT   \
 618      | CATEGORY_MASK_ISO_ELSE)
 619
 620 #define CATEGORY_MASK_UTF_16            \
 621   (CATEGORY_MASK_UTF_16_AUTO            \
 622    | CATEGORY_MASK_UTF_16_BE            \
 623    | CATEGORY_MASK_UTF_16_LE            \
 624    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 625    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 626
 627 #define CATEGORY_MASK_UTF_8     \
 628   (CATEGORY_MASK_UTF_8_AUTO     \
 629    | CATEGORY_MASK_UTF_8_NOSIG  \
 630    | CATEGORY_MASK_UTF_8_SIG)
 631
 632 /* Table of coding categories (Lisp symbols).  This variable is for
 633    internal use only.  */
 634 static Lisp_Object Vcoding_category_table;
 635
 636 /* Table of coding-categories ordered by priority.  */
 637 static enum coding_category coding_priorities[coding_category_max];
 638
 639 /* Nth element is a coding context for the coding system bound to the
 640    Nth coding category.  */
 641 static struct coding_system coding_categories[coding_category_max];
 642
 643 /*** Commonly used macros and functions ***/
 644 /* Note: min and max evaluate their arguments more than once.  */
 645 #ifndef min
 646 #define min(a, b) ((a) < (b) ? (a) : (b))
 647 #endif
 648 #ifndef max
 649 #define max(a, b) ((a) > (b) ? (a) : (b))
 650 #endif
 651 /* Set ATTRS and CHARSET_LIST from the attributes of CODING->id.  */
 652 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 653   do {                                                  \
 654     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 655     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 656   } while (0)
 657
 658
 659 /* Safely get one byte from the source text pointed by SRC which ends
 660    at SRC_END, set C to it, and increment CONSUMED_CHARS.  If the source
 661    is exhausted, jump to 'no_more_source' (first recording
 662    INSUFFICIENT_SRC if SRC_BASE < SRC).  If MULTIBYTEP, a 0xC0/0xC1
 663    two-byte sequence is combined into one value; any other multibyte
 664    character sets C to minus its code and records INVALID_SRC.  Needs:
 665         src, src_base, src_end, multibytep, consumed_chars, coding */
 666
 667 #define ONE_MORE_BYTE(c)                                \
 668   do {                                                  \
 669     if (src == src_end)                                 \
 670       {                                                 \
 671         if (src_base < src)                             \
 672           record_conversion_result                      \
 673             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 674         goto no_more_source;                            \
 675       }                                                 \
 676     c = *src++;                                         \
 677     if (multibytep && (c & 0x80))                       \
 678       {                                                 \
 679         if ((c & 0xFE) == 0xC0)                         \
 680           c = ((c & 1) << 6) | *src++;                  \
 681         else                                            \
 682           {                                             \
 683             src--;                                      \
 684             c = - string_char (src, &src, NULL);        \
 685             record_conversion_result                    \
 686               (coding, CODING_RESULT_INVALID_SRC);      \
 687           }                                             \
 688       }                                                 \
 689     consumed_chars++;                                   \
 690   } while (0)
 691
 692 /* Safely get two bytes from the source text pointed by SRC which ends
 693    at SRC_END, and set C1 and C2 to those bytes while skipping the
 694    heading multibyte characters.  If there are not enough bytes in the
 695    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 696    a multibyte character is found for C2, set C2 to -1 (SRC is then
 697    left just after its first byte).  The caller should declare and set
 698    these variables appropriately in advance:
 699         src, src_end, multibytep
 700    It is intended that this macro is used in detect_coding_utf_16.  */
 701
 702 #define TWO_MORE_BYTES(c1, c2)                          \
 703   do {                                                  \
 704     do {                                                \
 705       if (src == src_end)                               \
 706         goto no_more_source;                            \
 707       c1 = *src++;                                      \
 708       if (multibytep && (c1 & 0x80))                    \
 709         {                                               \
 710           if ((c1 & 0xFE) == 0xC0)                      \
 711             c1 = ((c1 & 1) << 6) | *src++;              \
 712           else                                          \
 713             {                                           \
 714               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 715               c1 = -1;                                  \
 716             }                                           \
 717         }                                               \
 718     } while (c1 < 0);                                   \
 719     if (src == src_end)                                 \
 720       goto no_more_source;                              \
 721     c2 = *src++;                                        \
 722     if (multibytep && (c2 & 0x80))                      \
 723       {                                                 \
 724         if ((c2 & 0xFE) == 0xC0)                        \
 725           c2 = ((c2 & 1) << 6) | *src++;                \
 726         else                                            \
 727           c2 = -1;                                      \
 728       }                                                 \
 729   } while (0)
 730
 731
 732 /* Store a byte C in the place pointed by DST and increment DST to the
 733    next free point, and increment PRODUCED_CHARS.  The caller should
 734    assure that C is 0..127, and declare and set the variables `dst'
 735    and `produced_chars' appropriately in advance.
 736 */
 737
 738
 739 #define EMIT_ONE_ASCII_BYTE(c)  \
 740   do {                          \
 741     produced_chars++;           \
 742     *dst++ = (c);               \
 743   } while (0)
 744
 745
 746 /* Like EMIT_ONE_ASCII_BYTE but store two bytes, C1 then C2.  */
 747
 748 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 749   do {                                  \
 750     produced_chars += 2;                \
 751     *dst++ = (c1), *dst++ = (c2);       \
 752   } while (0)
 753
 754
 755 /* Store a byte C in the place pointed by DST and increment DST to the
 756    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 757    store in multibyte form (C >= 0x80 goes through BYTE8_TO_CHAR).
 758    The caller should declare and set the variables `dst', `multibytep'
 759    and `produced_chars' appropriately in advance.  */
 760
 761 #define EMIT_ONE_BYTE(c)                \
 762   do {                                  \
 763     produced_chars++;                   \
 764     if (multibytep)                     \
 765       {                                 \
 766         unsigned ch = (c);              \
 767         if (ch >= 0x80)                 \
 768           ch = BYTE8_TO_CHAR (ch);      \
 769         CHAR_STRING_ADVANCE (ch, dst);  \
 770       }                                 \
 771     else                                \
 772       *dst++ = (c);                     \
 773   } while (0)
 774
 775
 776 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 777
 778 #define EMIT_TWO_BYTES(c1, c2)          \
 779   do {                                  \
 780     produced_chars += 2;                \
 781     if (multibytep)                     \
 782       {                                 \
 783         unsigned ch;                    \
 784                                         \
 785         ch = (c1);                      \
 786         if (ch >= 0x80)                 \
 787           ch = BYTE8_TO_CHAR (ch);      \
 788         CHAR_STRING_ADVANCE (ch, dst);  \
 789         ch = (c2);                      \
 790         if (ch >= 0x80)                 \
 791           ch = BYTE8_TO_CHAR (ch);      \
 792         CHAR_STRING_ADVANCE (ch, dst);  \
 793       }                                 \
 794     else                                \
 795       {                                 \
 796         *dst++ = (c1);                  \
 797         *dst++ = (c2);                  \
 798       }                                 \
 799   } while (0)
 800
 801
 802 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 803   do {                                  \
 804     EMIT_ONE_BYTE (c1);                 \
 805     EMIT_TWO_BYTES (c2, c3);            \
 806   } while (0)
 807
 808
 809 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 810   do {                                          \
 811     EMIT_TWO_BYTES (c1, c2);                    \
 812     EMIT_TWO_BYTES (c3, c4);                    \
 813   } while (0)
 814
 815
 816 static void
 817 record_conversion_result (struct coding_system *coding,
 818                           enum coding_result_code result)
 819 {
 820   coding->result = result;
 821   switch (result)
 822     {
 823     case CODING_RESULT_INSUFFICIENT_SRC:
 824       Vlast_code_conversion_error = Qinsufficient_source;
 825       break;
 826     case CODING_RESULT_INVALID_SRC:
 827       Vlast_code_conversion_error = Qinvalid_source;
 828       break;
 829     case CODING_RESULT_INTERRUPT:
 830       Vlast_code_conversion_error = Qinterrupted;
 831       break;
 832     case CODING_RESULT_INSUFFICIENT_DST:
 833       /* Don't record this error in Vlast_code_conversion_error
 834          because it happens just temporarily and is resolved when the
 835          whole conversion is finished.  */
 836       break;
 837     case CODING_RESULT_SUCCESS:
 838       break;
 839     default:
 840       Vlast_code_conversion_error = intern ("Unknown error");
 841     }
 842 }
 843
 844 /* These wrapper macros are used to preserve validity of pointers into
 845    buffer text across calls to decode_char, encode_char, etc, which
 846    could cause relocation of buffers if it loads a charset map,
 847    because loading a charset map allocates large structures.  */
 848
 849 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 850   do {                                                                       \
 851     ptrdiff_t offset;                                                        \
 852                                                                              \
 853     charset_map_loaded = 0;                                                  \
 854     c = DECODE_CHAR (charset, code);                                         \
 855     if (charset_map_loaded                                                   \
 856         && (offset = coding_change_source (coding)))                         \
 857       {                                                                      \
 858         src += offset;                                                       \
 859         src_base += offset;                                                  \
 860         src_end += offset;                                                   \
 861       }                                                                      \
 862   } while (0)
 863
 864 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 865   do {                                                                  \
 866     ptrdiff_t offset;                                                   \
 867                                                                         \
 868     charset_map_loaded = 0;                                             \
 869     code = ENCODE_CHAR (charset, c);                                    \
 870     if (charset_map_loaded                                              \
 871         && (offset = coding_change_destination (coding)))               \
 872       {                                                                 \
 873         dst += offset;                                                  \
 874         dst_end += offset;                                              \
 875       }                                                                 \
 876   } while (0)
 877
 878 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 879   do {                                                                  \
 880     ptrdiff_t offset;                                                   \
 881                                                                         \
 882     charset_map_loaded = 0;                                             \
 883     charset = char_charset (c, charset_list, code_return);              \
 884     if (charset_map_loaded                                              \
 885         && (offset = coding_change_destination (coding)))               \
 886       {                                                                 \
 887         dst += offset;                                                  \
 888         dst_end += offset;                                              \
 889       }                                                                 \
 890   } while (0)
 891
 892 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 893   do {                                                                  \
 894     ptrdiff_t offset;                                                   \
 895                                                                         \
 896     charset_map_loaded = 0;                                             \
 897     result = CHAR_CHARSET_P (c, charset);                               \
 898     if (charset_map_loaded                                              \
 899         && (offset = coding_change_destination (coding)))               \
 900       {                                                                 \
 901         dst += offset;                                                  \
 902         dst_end += offset;                                              \
 903       }                                                                 \
 904   } while (0)
 905
 906
 907 /* If there are at least BYTES length of room at dst, allocate memory
 908    for coding->destination and update dst and dst_end.  We don't have
 909    to take care of coding->source which will be relocated.  It is
 910    handled by calling coding_set_source in encode_coding.  */
 911
 912 #define ASSURE_DESTINATION(bytes)                               \
 913   do {                                                          \
 914     if (dst + (bytes) >= dst_end)                               \
 915       {                                                         \
 916         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 917                                                                 \
 918         dst = alloc_destination (coding, more_bytes, dst);      \
 919         dst_end = coding->destination + coding->dst_bytes;      \
 920       }                                                         \
 921   } while (0)
 922
 923
 924 /* Store multibyte form of the character C in P, and advance P to the
 925    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 926    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 927    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 928
 929 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 930
 931 /* Return the character code of character whose multibyte form is at
 932    P, and advance P to the end of the multibyte form.  This used to be
 933    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 934    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 935
 936 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 937
 938 /* Set coding->source from coding->src_object.  */
 939
 940 static void
 941 coding_set_source (struct coding_system *coding)
 942 {
 943   if (BUFFERP (coding->src_object))
 944     {
 945       struct buffer *buf = XBUFFER (coding->src_object);
 946
 947       if (coding->src_pos < 0)
 948         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 949       else
 950         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 951     }
 952   else if (STRINGP (coding->src_object))
 953     {
 954       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 955     }
 956   else
 957     {
 958       /* Otherwise, the source is C string and is never relocated
 959          automatically.  Thus we don't have to update anything.  */
 960     }
 961 }
 962
 963
 964 /* Set coding->source from coding->src_object, and return how many
 965    bytes coding->source was changed.  */
 966
 967 static ptrdiff_t
 968 coding_change_source (struct coding_system *coding)
 969 {
 970   const unsigned char *orig = coding->source;
 971   coding_set_source (coding);
 972   return coding->source - orig;
 973 }
 974
 975
 976 /* Set coding->destination from coding->dst_object.  */
 977
 978 static void
 979 coding_set_destination (struct coding_system *coding)
 980 {
 981   if (BUFFERP (coding->dst_object))
 982     {
 983       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 984         {
 985           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 986           coding->dst_bytes = (GAP_END_ADDR
 987                                - (coding->src_bytes - coding->consumed)
 988                                - coding->destination);
 989         }
 990       else
 991         {
 992           /* We are sure that coding->dst_pos_byte is before the gap
 993              of the buffer. */
 994           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 995                                  + coding->dst_pos_byte - BEG_BYTE);
 996           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 997                                - coding->destination);
 998         }
 999     }
1000   else
1001     {
1002       /* Otherwise, the destination is C string and is never relocated
1003          automatically.  Thus we don't have to update anything.  */
1004     }
1005 }
1006
1007
1008 /* Set coding->destination from coding->dst_object, and return how
1009    many bytes coding->destination was changed.  */
1010
1011 static ptrdiff_t
1012 coding_change_destination (struct coding_system *coding)
1013 {
1014   const unsigned char *orig = coding->destination;
1015   coding_set_destination (coding);
1016   return coding->destination - orig;
1017 }
1018
1019
1020 static void
1021 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1022 {
1023   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1024     string_overflow ();
1025   coding->destination = xrealloc (coding->destination,
1026                                   coding->dst_bytes + bytes);
1027   coding->dst_bytes += bytes;
1028 }
1029
1030 static void
1031 coding_alloc_by_making_gap (struct coding_system *coding,
1032                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1033 {
1034   if (EQ (coding->src_object, coding->dst_object))
1035     {
1036       /* The gap may contain the produced data at the head and not-yet
1037          consumed data at the tail.  To preserve those data, we at
1038          first make the gap size to zero, then increase the gap
1039          size.  */
1040       ptrdiff_t add = GAP_SIZE;
1041
1042       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1043       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1044       make_gap (bytes);
1045       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1046       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1047     }
1048   else
1049     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1050 }
1051
1052
1053 static unsigned char *
1054 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1055                    unsigned char *dst)
1056 {
1057   ptrdiff_t offset = dst - coding->destination;
1058
1059   if (BUFFERP (coding->dst_object))
1060     {
1061       struct buffer *buf = XBUFFER (coding->dst_object);
1062
1063       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1064     }
1065   else
1066     coding_alloc_by_realloc (coding, nbytes);
1067   coding_set_destination (coding);
1068   dst = coding->destination + offset;
1069   return dst;
1070 }
1071
1072 /** Macros for annotations.  */
1073
1074 /* An annotation data is stored in the array coding->charbuf in this
1075    format:
1076      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1077    LENGTH is the number of elements in the annotation.
1078    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1079    NCHARS is the number of characters in the text annotated.
1080
 1081    The format of the following elements depends on ANNOTATION_MASK.
 1082
 1083    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
 1084    follow:
1085      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1086
1087    NBYTES is the number of bytes specified in the header part of
1088    old-style emacs-mule encoding, or 0 for the other kind of
1089    composition.
1090
1091    METHOD is one of enum composition_method.
1092
1093    Optional COMPOSITION-COMPONENTS are characters and composition
1094    rules.
1095
1096    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1097    follows.
1098
1099    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1100    recover from an invalid annotation, and should be skipped by
1101    produce_annotation.  */
1102
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and mark the coding system (the caller's
   `coding' variable) as annotated.

   The expansion deliberately carries no trailing semicolon; the
   caller supplies it.  That way an invocation is exactly one
   statement and may be used as the body of an unbraced `if' that has
   an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1113
/* Store a composition annotation at BUF and advance BUF past it: the
   annotation header (length 5, CODING_ANNOTATE_COMPOSITION_MASK,
   NCHARS) followed by NBYTES and METHOD.  Every argument is
   parenthesized in the expansion so that an expression such as `*pp'
   may be passed for BUF.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1120
1121
/* Store a charset annotation at BUF and advance BUF past it: the
   annotation header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  Every argument is parenthesized in the
   expansion so that an expression such as `*pp' may be passed for
   BUF.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1127
1128 \f
1129 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1130
1131
1132
1133 \f
1134 /*** 3. UTF-8 ***/
1135
1136 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1137    Return true if a text is encoded in UTF-8.  */
1138
/* Classification of a byte value C by its high bits.  */

/* Anything below 0x80: a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) <= 0x7F)
/* 10xxxxxx: a trailing octet of a multi-octet sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* 110xxxxx: leading octet of a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* 1110xxxx: leading octet of a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* 11110xxx: leading octet of a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* 111110xx: leading octet of a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 signature (BOM), in order.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1149
1150 static bool
1151 detect_coding_utf_8 (struct coding_system *coding,
1152                      struct coding_detection_info *detect_info)
1153 {
1154   const unsigned char *src = coding->source, *src_base;
1155   const unsigned char *src_end = coding->source + coding->src_bytes;
1156   bool multibytep = coding->src_multibyte;
1157   ptrdiff_t consumed_chars = 0;
1158   bool bom_found = 0;
1159   bool found = 0;
1160
1161   detect_info->checked |= CATEGORY_MASK_UTF_8;
1162   /* A coding system of this category is always ASCII compatible.  */
1163   src += coding->head_ascii;
1164
1165   while (1)
1166     {
1167       int c, c1, c2, c3, c4;
1168
1169       src_base = src;
1170       ONE_MORE_BYTE (c);
1171       if (c < 0 || UTF_8_1_OCTET_P (c))
1172         continue;
1173       ONE_MORE_BYTE (c1);
1174       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1175         break;
1176       if (UTF_8_2_OCTET_LEADING_P (c))
1177         {
1178           found = 1;
1179           continue;
1180         }
1181       ONE_MORE_BYTE (c2);
1182       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1183         break;
1184       if (UTF_8_3_OCTET_LEADING_P (c))
1185         {
1186           found = 1;
1187           if (src_base == coding->source
1188               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1189             bom_found = 1;
1190           continue;
1191         }
1192       ONE_MORE_BYTE (c3);
1193       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1194         break;
1195       if (UTF_8_4_OCTET_LEADING_P (c))
1196         {
1197           found = 1;
1198           continue;
1199         }
1200       ONE_MORE_BYTE (c4);
1201       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1202         break;
1203       if (UTF_8_5_OCTET_LEADING_P (c))
1204         {
1205           found = 1;
1206           continue;
1207         }
1208       break;
1209     }
1210   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1211   return 0;
1212
1213  no_more_source:
1214   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1215     {
1216       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1217       return 0;
1218     }
1219   if (bom_found)
1220     {
1221       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1222       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1223     }
1224   else
1225     {
1226       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1227       if (found)
1228         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1229     }
1230   return 1;
1231 }
1232
1233
1234 static void
1235 decode_coding_utf_8 (struct coding_system *coding)
1236 {
1237   const unsigned char *src = coding->source + coding->consumed;
1238   const unsigned char *src_end = coding->source + coding->src_bytes;
1239   const unsigned char *src_base;
1240   int *charbuf = coding->charbuf + coding->charbuf_used;
1241   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1242   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1243   bool multibytep = coding->src_multibyte;
1244   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1245   bool eol_dos
1246     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1247   int byte_after_cr = -1;
1248
1249   if (bom != utf_without_bom)
1250     {
1251       int c1, c2, c3;
1252
1253       src_base = src;
1254       ONE_MORE_BYTE (c1);
1255       if (! UTF_8_3_OCTET_LEADING_P (c1))
1256         src = src_base;
1257       else
1258         {
1259           ONE_MORE_BYTE (c2);
1260           if (! UTF_8_EXTRA_OCTET_P (c2))
1261             src = src_base;
1262           else
1263             {
1264               ONE_MORE_BYTE (c3);
1265               if (! UTF_8_EXTRA_OCTET_P (c3))
1266                 src = src_base;
1267               else
1268                 {
1269                   if ((c1 != UTF_8_BOM_1)
1270                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1271                     src = src_base;
1272                   else
1273                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1274                 }
1275             }
1276         }
1277     }
1278   CODING_UTF_8_BOM (coding) = utf_without_bom;
1279
1280   while (1)
1281     {
1282       int c, c1, c2, c3, c4, c5;
1283
1284       src_base = src;
1285       consumed_chars_base = consumed_chars;
1286
1287       if (charbuf >= charbuf_end)
1288         {
1289           if (byte_after_cr >= 0)
1290             src_base--;
1291           break;
1292         }
1293
1294       if (byte_after_cr >= 0)
1295         c1 = byte_after_cr, byte_after_cr = -1;
1296       else
1297         ONE_MORE_BYTE (c1);
1298       if (c1 < 0)
1299         {
1300           c = - c1;
1301         }
1302       else if (UTF_8_1_OCTET_P (c1))
1303         {
1304           if (eol_dos && c1 == '\r')
1305             ONE_MORE_BYTE (byte_after_cr);
1306           c = c1;
1307         }
1308       else
1309         {
1310           ONE_MORE_BYTE (c2);
1311           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1312             goto invalid_code;
1313           if (UTF_8_2_OCTET_LEADING_P (c1))
1314             {
1315               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1316               /* Reject overlong sequences here and below.  Encoders
1317                  producing them are incorrect, they can be misleading,
1318                  and they mess up read/write invariance.  */
1319               if (c < 128)
1320                 goto invalid_code;
1321             }
1322           else
1323             {
1324               ONE_MORE_BYTE (c3);
1325               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1326                 goto invalid_code;
1327               if (UTF_8_3_OCTET_LEADING_P (c1))
1328                 {
1329                   c = (((c1 & 0xF) << 12)
1330                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1331                   if (c < 0x800
1332                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1333                     goto invalid_code;
1334                 }
1335               else
1336                 {
1337                   ONE_MORE_BYTE (c4);
1338                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1339                     goto invalid_code;
1340                   if (UTF_8_4_OCTET_LEADING_P (c1))
1341                     {
1342                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1343                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1344                     if (c < 0x10000)
1345                       goto invalid_code;
1346                     }
1347                   else
1348                     {
1349                       ONE_MORE_BYTE (c5);
1350                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1351                         goto invalid_code;
1352                       if (UTF_8_5_OCTET_LEADING_P (c1))
1353                         {
1354                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1355                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1356                                | (c5 & 0x3F));
1357                           if ((c > MAX_CHAR) || (c < 0x200000))
1358                             goto invalid_code;
1359                         }
1360                       else
1361                         goto invalid_code;
1362                     }
1363                 }
1364             }
1365         }
1366
1367       *charbuf++ = c;
1368       continue;
1369
1370     invalid_code:
1371       src = src_base;
1372       consumed_chars = consumed_chars_base;
1373       ONE_MORE_BYTE (c);
1374       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1375       coding->errors++;
1376     }
1377
1378  no_more_source:
1379   coding->consumed_char += consumed_chars_base;
1380   coding->consumed = src_base - coding->source;
1381   coding->charbuf_used = charbuf - coding->charbuf;
1382 }
1383
1384
1385 static bool
1386 encode_coding_utf_8 (struct coding_system *coding)
1387 {
1388   bool multibytep = coding->dst_multibyte;
1389   int *charbuf = coding->charbuf;
1390   int *charbuf_end = charbuf + coding->charbuf_used;
1391   unsigned char *dst = coding->destination + coding->produced;
1392   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1393   ptrdiff_t produced_chars = 0;
1394   int c;
1395
1396   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1397     {
1398       ASSURE_DESTINATION (3);
1399       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1400       CODING_UTF_8_BOM (coding) = utf_without_bom;
1401     }
1402
1403   if (multibytep)
1404     {
1405       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1406
1407       while (charbuf < charbuf_end)
1408         {
1409           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1410
1411           ASSURE_DESTINATION (safe_room);
1412           c = *charbuf++;
1413           if (CHAR_BYTE8_P (c))
1414             {
1415               c = CHAR_TO_BYTE8 (c);
1416               EMIT_ONE_BYTE (c);
1417             }
1418           else
1419             {
1420               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1421               for (p = str; p < pend; p++)
1422                 EMIT_ONE_BYTE (*p);
1423             }
1424         }
1425     }
1426   else
1427     {
1428       int safe_room = MAX_MULTIBYTE_LENGTH;
1429
1430       while (charbuf < charbuf_end)
1431         {
1432           ASSURE_DESTINATION (safe_room);
1433           c = *charbuf++;
1434           if (CHAR_BYTE8_P (c))
1435             *dst++ = CHAR_TO_BYTE8 (c);
1436           else
1437             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1438           produced_chars++;
1439         }
1440     }
1441   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1442   coding->produced_char += produced_chars;
1443   coding->produced = dst - coding->destination;
1444   return 0;
1445 }
1446
1447
1448 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1449    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1450
/* True if the 16-bit code unit VAL is in 0xD800..0xDBFF, the high
   (leading) half of a surrogate pair.  */
#define UTF_16_HIGH_SURROGATE_P(val) (((val) & 0xFC00) == 0xD800)

/* True if the 16-bit code unit VAL is in 0xDC00..0xDFFF, the low
   (trailing) half of a surrogate pair.  */
#define UTF_16_LOW_SURROGATE_P(val) (((val) & 0xFC00) == 0xDC00)
1456
1457
1458 static bool
1459 detect_coding_utf_16 (struct coding_system *coding,
1460                       struct coding_detection_info *detect_info)
1461 {
1462   const unsigned char *src = coding->source;
1463   const unsigned char *src_end = coding->source + coding->src_bytes;
1464   bool multibytep = coding->src_multibyte;
1465   int c1, c2;
1466
1467   detect_info->checked |= CATEGORY_MASK_UTF_16;
1468   if (coding->mode & CODING_MODE_LAST_BLOCK
1469       && (coding->src_chars & 1))
1470     {
1471       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1472       return 0;
1473     }
1474
1475   TWO_MORE_BYTES (c1, c2);
1476   if ((c1 == 0xFF) && (c2 == 0xFE))
1477     {
1478       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1479                              | CATEGORY_MASK_UTF_16_AUTO);
1480       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1481                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1482                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1483     }
1484   else if ((c1 == 0xFE) && (c2 == 0xFF))
1485     {
1486       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1487                              | CATEGORY_MASK_UTF_16_AUTO);
1488       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1489                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1490                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1491     }
1492   else if (c2 < 0)
1493     {
1494       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1495       return 0;
1496     }
1497   else
1498     {
1499       /* We check the dispersion of Eth and Oth bytes where E is even and
1500          O is odd.  If both are high, we assume binary data.*/
1501       unsigned char e[256], o[256];
1502       unsigned e_num = 1, o_num = 1;
1503
1504       memset (e, 0, 256);
1505       memset (o, 0, 256);
1506       e[c1] = 1;
1507       o[c2] = 1;
1508
1509       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1510                                 |CATEGORY_MASK_UTF_16_BE
1511                                 | CATEGORY_MASK_UTF_16_LE);
1512
1513       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1514              != CATEGORY_MASK_UTF_16)
1515         {
1516           TWO_MORE_BYTES (c1, c2);
1517           if (c2 < 0)
1518             break;
1519           if (! e[c1])
1520             {
1521               e[c1] = 1;
1522               e_num++;
1523               if (e_num >= 128)
1524                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1525             }
1526           if (! o[c2])
1527             {
1528               o[c2] = 1;
1529               o_num++;
1530               if (o_num >= 128)
1531                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1532             }
1533         }
1534       return 0;
1535     }
1536
1537  no_more_source:
1538   return 1;
1539 }
1540
/* Decode the not yet consumed part of CODING's source as UTF-16 and
   append the resulting code points to CODING->charbuf.  On return,
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  SRC_BASE and CONSUMED_CHARS_BASE are reset at the top
   of every loop iteration, so input read by an iteration that did not
   run to completion is not reported as consumed.

   NOTE(review): nothing visible here jumps to `no_more_source';
   presumably ONE_MORE_BYTE (defined elsewhere) does so when the
   source is exhausted -- confirm against the macro.  */
1541 static void
1542 decode_coding_utf_16 (struct coding_system *coding)
1543 {
1544   const unsigned char *src = coding->source + coding->consumed;
1545   const unsigned char *src_end = coding->source + coding->src_bytes;
1546   const unsigned char *src_base;
1547   int *charbuf = coding->charbuf + coding->charbuf_used;
1548   /* We may produce at most 3 chars in one loop.  */
1549   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1550   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1551   bool multibytep = coding->src_multibyte;
1552   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1553   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
       /* A pending high surrogate saved in CODING, or 0 if there is
          none.  */
1554   int surrogate = CODING_UTF_16_SURROGATE (coding);
1555   bool eol_dos
1556     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes read ahead after a CR when EOL_DOS is set; -1
          when nothing has been read ahead.  */
1557   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1558
1559   if (bom == utf_with_bom)
1560     {
1561       int c, c1, c2;
1562
1563       src_base = src;
1564       ONE_MORE_BYTE (c1);
1565       ONE_MORE_BYTE (c2);
           /* The two bytes are always combined first-byte-high here; the
              byte order is accounted for by the value compared against
              below.  */
1566       c = (c1 << 8) | c2;
1567
1568       if (endian == utf_16_big_endian
1569           ? c != 0xFEFF : c != 0xFFFE)
1570         {
1571           /* The first two bytes are not BOM.  Treat them as bytes
1572              for a normal character.  */
1573           src = src_base;
1574           coding->errors++;
1575         }
           /* Record in CODING that BOM handling is done.  */
1576       CODING_UTF_16_BOM (coding) = utf_without_bom;
1577     }
1578   else if (bom == utf_detect_bom)
1579     {
1580       /* We have already tried to detect BOM and failed in
1581          detect_coding.  */
1582       CODING_UTF_16_BOM (coding) = utf_without_bom;
1583     }
1584
1585   while (1)
1586     {
1587       int c, c1, c2;
1588
1589       src_base = src;
1590       consumed_chars_base = consumed_chars;
1591
1592       if (charbuf >= charbuf_end)
1593         {
               /* The two bytes read ahead after a CR have not been
                  decoded yet; back SRC_BASE up so that they are not
                  reported as consumed.  */
1594           if (byte_after_cr1 >= 0)
1595             src_base -= 2;
1596           break;
1597         }
1598
           /* Use the bytes read ahead after a CR, if any, before reading
              new ones.  */
1599       if (byte_after_cr1 >= 0)
1600         c1 = byte_after_cr1, byte_after_cr1 = -1;
1601       else
1602         ONE_MORE_BYTE (c1);
           /* NOTE(review): a negative value is presumably how
              ONE_MORE_BYTE reports a character of multibyte source that
              is not a raw byte; it is stored negated, bypassing UTF-16
              decoding -- confirm against the macro.  */
1603       if (c1 < 0)
1604         {
1605           *charbuf++ = -c1;
1606           continue;
1607         }
1608       if (byte_after_cr2 >= 0)
1609         c2 = byte_after_cr2, byte_after_cr2 = -1;
1610       else
1611         ONE_MORE_BYTE (c2);
1612       if (c2 < 0)
1613         {
               /* C1 has no partner byte: emit it alone (as a raw 8-bit
                  character unless it is ASCII), then the negated C2.  */
1614           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1615           *charbuf++ = -c2;
1616           continue;
1617         }
           /* Assemble the 16-bit code unit according to the byte order.  */
1618       c = (endian == utf_16_big_endian
1619            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1620
1621       if (surrogate)
1622         {
               /* A high surrogate is pending, so C should be the low
                  surrogate completing the pair.  */
1623           if (! UTF_16_LOW_SURROGATE_P (c))
1624             {
                   /* It is not: emit the two bytes of the pending
                      surrogate, in source byte order, as separate
                      elements and count an error.  */
1625               if (endian == utf_16_big_endian)
1626                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1627               else
1628                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1629               *charbuf++ = c1;
1630               *charbuf++ = c2;
1631               coding->errors++;
                   /* C either becomes the new pending high surrogate or
                      is emitted as is.  */
1632               if (UTF_16_HIGH_SURROGATE_P (c))
1633                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1634               else
1635                 *charbuf++ = c;
1636             }
1637           else
1638             {
                   /* Combine the pair into one code point of 0x10000 or
                      above and clear the pending surrogate.  */
1639               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1640               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1641               *charbuf++ = 0x10000 + c;
1642             }
1643         }
1644       else
1645         {
               /* Remember a high surrogate (also in CODING) until the
                  next code unit arrives.  */
1646           if (UTF_16_HIGH_SURROGATE_P (c))
1647             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1648           else
1649             {
1650               if (eol_dos && c == '\r')
1651                 {
                       /* Read the next two bytes ahead before the CR is
                          stored.  NOTE(review): if the source ends here,
                          ONE_MORE_BYTE presumably leaves the loop, so the
                          CR stays unconsumed and is decoded again together
                          with what follows it -- confirm.  */
1652                   ONE_MORE_BYTE (byte_after_cr1);
1653                   ONE_MORE_BYTE (byte_after_cr2);
1654                 }
1655               *charbuf++ = c;
1656             }
1657         }
1658     }
1659
1660  no_more_source:
       /* Report only what was consumed up to the start of the last
          (possibly incomplete) iteration.  */
1661   coding->consumed_char += consumed_chars_base;
1662   coding->consumed = src_base - coding->source;
1663   coding->charbuf_used = charbuf - coding->charbuf;
1664 }
1665
1666 static bool
1667 encode_coding_utf_16 (struct coding_system *coding)
1668 {
1669   bool multibytep = coding->dst_multibyte;
1670   int *charbuf = coding->charbuf;
1671   int *charbuf_end = charbuf + coding->charbuf_used;
1672   unsigned char *dst = coding->destination + coding->produced;
1673   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1674   int safe_room = 8;
1675   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1676   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1677   ptrdiff_t produced_chars = 0;
1678   int c;
1679
1680   if (bom != utf_without_bom)
1681     {
1682       ASSURE_DESTINATION (safe_room);
1683       if (big_endian)
1684         EMIT_TWO_BYTES (0xFE, 0xFF);
1685       else
1686         EMIT_TWO_BYTES (0xFF, 0xFE);
1687       CODING_UTF_16_BOM (coding) = utf_without_bom;
1688     }
1689
1690   while (charbuf < charbuf_end)
1691     {
1692       ASSURE_DESTINATION (safe_room);
1693       c = *charbuf++;
1694       if (c > MAX_UNICODE_CHAR)
1695         c = coding->default_char;
1696
1697       if (c < 0x10000)
1698         {
1699           if (big_endian)
1700             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1701           else
1702             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1703         }
1704       else
1705         {
1706           int c1, c2;
1707
1708           c -= 0x10000;
1709           c1 = (c >> 10) + 0xD800;
1710           c2 = (c & 0x3FF) + 0xDC00;
1711           if (big_endian)
1712             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1713           else
1714             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1715         }
1716     }
1717   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1718   coding->produced = dst - coding->destination;
1719   coding->produced_char += produced_chars;
1720   return 0;
1721 }
1722
1723 \f
1724 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1725
1726 /* Emacs' internal format for representation of multiple character
1727    sets is a kind of multi-byte encoding, i.e. characters are
1728    represented by variable-length sequences of one-byte codes.
1729
1730    ASCII characters and control characters (e.g. `tab', `newline') are
1731    represented by one-byte sequences which are their ASCII codes, in
1732    the range 0x00 through 0x7F.
1733
1734    8-bit characters of the range 0x80..0x9F are represented by
1735    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1736    code + 0x20).
1737
1738    8-bit characters of the range 0xA0..0xFF are represented by
1739    one-byte sequences which are their 8-bit code.
1740
1741    The other characters are represented by a sequence of `base
1742    leading-code', optional `extended leading-code', and one or two
1743    `position-code's.  The length of the sequence is determined by the
1744    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1745    whereas extended leading-code and position-code take the range 0xA0
1746    through 0xFF.  See `charset.h' for more details about leading-code
1747    and position-code.
1748
1749    --- CODE RANGE of Emacs' internal format ---
1750    character set        range
1751    -------------        -----
1752    ascii                0x00..0x7F
1753    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1754    eight-bit-graphic    0xA0..0xFF
1755    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1756    ---------------------------------------------
1757
1758    As this is the internal character representation, the format is
1759    usually not used externally (i.e. in a file or in a data sent to a
1760    process).  But, it is possible to have a text externally in this
1761    format (i.e. by encoding by the coding system `emacs-mule').
1762
1763    In that case, a sequence of one-byte codes has a slightly different
1764    form.
1765
1766    At first, all characters in eight-bit-control are represented by
1767    one-byte sequences which are their 8-bit code.
1768
1769    Next, character composition data are represented by the byte
1770    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1771    where,
1772         METHOD is 0xF2 plus one of composition method (enum
1773         composition_method),
1774
1775         BYTES is 0xA0 plus a byte length of this composition data,
1776
1777         CHARS is 0xA0 plus a number of characters composed by this
1778         data,
1779
1780         COMPONENTs are characters of multibyte form or composition
1781         rules encoded by two-byte of ASCII codes.
1782
1783    In addition, for backward compatibility, the following formats are
1784    also recognized as composition data on decoding.
1785
1786    0x80 MSEQ ...
1787    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1788
1789    Here,
1790         MSEQ is a multibyte form, but in one of these special formats:
1791           ASCII: 0xA0 ASCII_CODE+0x80,
1792           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1793         RULE is a one byte code of the range 0xA0..0xF0 that
1794         represents a composition rule.
1795   */
1796
1797 char emacs_mule_bytes[256];
1798
1799
1800 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1801    Return true if a text is encoded in 'emacs-mule'.  */
1802
1803 static bool
1804 detect_coding_emacs_mule (struct coding_system *coding,
1805                           struct coding_detection_info *detect_info)
1806 {
1807   const unsigned char *src = coding->source, *src_base;
1808   const unsigned char *src_end = coding->source + coding->src_bytes;
1809   bool multibytep = coding->src_multibyte;
1810   ptrdiff_t consumed_chars = 0;
1811   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once a well-formed multibyte
          sequence or a long enough composition has been seen.  */
1812   int found = 0;
1813
1814   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1815   /* A coding system of this category is always ASCII compatible.  */
1816   src += coding->head_ascii;
1817
       /* Every `break' out of this loop means rejection.  NOTE(review):
          the only other exit is `no_more_source', presumably reached
          from ONE_MORE_BYTE when the source is exhausted -- confirm.  */
1818   while (1)
1819     {
1820       src_base = src;
1821       ONE_MORE_BYTE (c);
           /* NOTE(review): a negative C presumably stands for a source
              character that is not a raw byte; it is skipped -- confirm
              against ONE_MORE_BYTE.  */
1822       if (c < 0)
1823         continue;
1824       if (c == 0x80)
1825         {
1826           /* Perhaps the start of composite character.  We simply skip
1827              it because analyzing it is too heavy for detecting.  But,
1828              at least, we check that the composite character
1829              consists of more than 4 bytes.  */
1830           const unsigned char *src_start;
1831
1832         repeat:
1833           src_start = src;
               /* Skip the run of bytes >= 0xA0; C ends up holding the
                  first byte below 0xA0.  */
1834           do
1835             {
1836               ONE_MORE_BYTE (c);
1837             }
1838           while (c >= 0xA0);
1839
1840           if (src - src_start <= 4)
1841             break;
1842           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition starts right away.  */
1843           if (c == 0x80)
1844             goto repeat;
1845         }
1846
1847       if (c < 0x80)
1848         {
               /* ESC, SI and SO cause rejection.  NOTE(review):
                  presumably because they indicate ISO-2022 text --
                  confirm.  */
1849           if (c < 0x20
1850               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1851             break;
1852         }
1853       else
1854         {
               /* emacs_mule_bytes[C] is the length of the sequence led by
                  C; each of the remaining bytes must be >= 0xA0.  */
1855           int more_bytes = emacs_mule_bytes[c] - 1;
1856
1857           while (more_bytes > 0)
1858             {
1859               ONE_MORE_BYTE (c);
1860               if (c < 0xA0)
1861                 {
1862                   src--;        /* Unread the last byte.  */
1863                   break;
1864                 }
1865               more_bytes--;
1866             }
               /* The sequence was cut short by an out-of-range byte.  */
1867           if (more_bytes != 0)
1868             break;
1869           found = CATEGORY_MASK_EMACS_MULE;
1870         }
1871     }
       /* Reached only by `break': the text is not emacs-mule.  */
1872   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1873   return 0;
1874
1875  no_more_source:
       /* SRC_BASE < SRC means that the source ended in the middle of a
          sequence; that is an error only if no more input will follow.  */
1876   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1877     {
1878       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1879       return 0;
1880     }
1881   detect_info->found |= found;
1882   return 1;
1883 }
1884
1885
1886 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1887    character.  If CMP_STATUS indicates that we must expect MSEQ or
1888    RULE described above, decode it and return the negative value of
1889    the decoded character or rule.  If an invalid byte is found, return
1890    -1.  If SRC is too short, return -2.  */
1891
/* On a return value other than -1 and -2, *NBYTES is set to the
   number of source bytes used (SRC - SRC_BASE) and *NCHARS to
   CONSUMED_CHARS (NOTE(review): presumably maintained by
   ONE_MORE_BYTE -- confirm).  *ID, if ID is non-null, receives the
   charset id of the character, except when a composition rule byte is
   returned early, in which case *ID is left untouched.  */
1892 static int
1893 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1894                  int *nbytes, int *nchars, int *id,
1895                  struct composition_status *cmp_status)
1896 {
1897   const unsigned char *src_end = coding->source + coding->src_bytes;
1898   const unsigned char *src_base = src;
1899   bool multibytep = coding->src_multibyte;
1900   int charset_ID;
1901   unsigned code;
1902   int c;
1903   int consumed_chars = 0;
       /* True if the character was read in the old-form MSEQ format; the
          result is then returned negated.  */
1904   bool mseq_found = 0;
1905
1906   ONE_MORE_BYTE (c);
       /* NOTE(review): a negative C presumably stands for a source
          character that is not a raw byte; it is returned as is with the
          charset emacs_mule_charset[0] -- confirm against ONE_MORE_BYTE.  */
1907   if (c < 0)
1908     {
1909       c = -c;
1910       charset_ID = emacs_mule_charset[0];
1911     }
1912   else
1913     {
           /* A byte >= 0xA0 may start a sequence only inside an old-form
              composition, as MSEQ or RULE; anywhere else it is invalid.  */
1914       if (c >= 0xA0)
1915         {
1916           if (cmp_status->state != COMPOSING_NO
1917               && cmp_status->old_form)
1918             {
1919               if (cmp_status->state == COMPOSING_CHAR)
1920                 {
                       /* MSEQ: undo the special form to get back the
                          ASCII code (0xA0 ASCII_CODE+0x80) or the leading
                          code (LEADING_CODE+0x20).  */
1921                   if (c == 0xA0)
1922                     {
1923                       ONE_MORE_BYTE (c);
1924                       c -= 0x80;
1925                       if (c < 0)
1926                         goto invalid_code;
1927                     }
1928                   else
1929                     c -= 0x20;
1930                   mseq_found = 1;
1931                 }
1932               else
1933                 {
                       /* Not expecting a character, so this byte is
                          handed back negated for the caller to treat as a
                          rule.  */
1934                   *nbytes = src - src_base;
1935                   *nchars = consumed_chars;
1936                   return -c;
1937                 }
1938             }
1939           else
1940             goto invalid_code;
1941         }
1942
           /* Dispatch on the total byte length of the sequence led by C.
              Every position code must be >= 0xA0 and contributes its low
              7 bits to CODE.  */
1943       switch (emacs_mule_bytes[c])
1944         {
1945         case 2:
               /* Leading code and one position code.  */
1946           if ((charset_ID = emacs_mule_charset[c]) < 0)
1947             goto invalid_code;
1948           ONE_MORE_BYTE (c);
1949           if (c < 0xA0)
1950             goto invalid_code;
1951           code = c & 0x7F;
1952           break;
1953
1954         case 3:
1955           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1956               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1957             {
                   /* Private leading code, a byte selecting the charset,
                      and one position code.  */
1958               ONE_MORE_BYTE (c);
1959               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1960                 goto invalid_code;
1961               ONE_MORE_BYTE (c);
1962               if (c < 0xA0)
1963                 goto invalid_code;
1964               code = c & 0x7F;
1965             }
1966           else
1967             {
                   /* Leading code and two position codes.  */
1968               if ((charset_ID = emacs_mule_charset[c]) < 0)
1969                 goto invalid_code;
1970               ONE_MORE_BYTE (c);
1971               if (c < 0xA0)
1972                 goto invalid_code;
1973               code = (c & 0x7F) << 8;
1974               ONE_MORE_BYTE (c);
1975               if (c < 0xA0)
1976                 goto invalid_code;
1977               code |= c & 0x7F;
1978             }
1979           break;
1980
1981         case 4:
               /* Leading code, a byte selecting the charset, and two
                  position codes.  */
1982           ONE_MORE_BYTE (c);
1983           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1984             goto invalid_code;
1985           ONE_MORE_BYTE (c);
1986           if (c < 0xA0)
1987             goto invalid_code;
1988           code = (c & 0x7F) << 8;
1989           ONE_MORE_BYTE (c);
1990           if (c < 0xA0)
1991             goto invalid_code;
1992           code |= c & 0x7F;
1993           break;
1994
1995         case 1:
               /* A single byte stands for itself.  */
1996           code = c;
1997           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
1998           break;
1999
2000         default:
2001           emacs_abort ();
2002         }
           /* Map CODE in the charset to a character, stored in C; a
              negative C means that the decoding failed.  */
2003       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2004                           CHARSET_FROM_ID (charset_ID), code, c);
2005       if (c < 0)
2006         goto invalid_code;
2007     }
2008   *nbytes = src - src_base;
2009   *nchars = consumed_chars;
2010   if (id)
2011     *id = charset_ID;
2012   return (mseq_found ? -c : c);
2013
2014  no_more_source:
2015   return -2;
2016
2017  invalid_code:
2018   return -1;
2019 }
2020
2021
2022 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2023
2024 /* Handle these composition sequences ('|': the end of header elements,
2025    BYTES and CHARS >= 0xA0):
2026
2027    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2028    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2029    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2030
2031    and these old forms:
2032
2033    (4) relative composition: 0x80 | MSEQ ... MSEQ
2034    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2035
2036    When the starter 0x80 and the following header elements are found,
2037    this annotation header is produced.
2038
2039         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2040
2041    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2042    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2043
2044    Then, upon reading the following elements, these codes are produced
2045    until the composition end is found:
2046
2047    (1) CHAR ... CHAR
2048    (2) ALT ... ALT CHAR ... CHAR
2049    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2050    (4) CHAR ... CHAR
2051    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2052
2053    When the composition end is found, LENGTH and NCHARS in the
2054    annotation header are updated as below:
2055
2056    (1) LENGTH: unchanged, NCHARS: unchanged
2057    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2058    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2059    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2060    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2061
2062    If an error is found while composing, the annotation header is
2063    changed to the original composition header (plus filler -1s) as
2064    below:
2065
2066    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2067    (5)          [ 0x80 0xFF -1 -1 -1 ]
2068
2069    and the sequence [ -2 DECODED-RULE ] is changed to the original
2070    byte sequence as below:
2071         o the original byte sequence is B: [ B -1 ]
2072         o the original byte sequence is B1 B2: [ B1 B2 ]
2073
2074    Most of the routines are implemented by macros because many
2075    variables and labels in the caller decode_coding_emacs_mule must be
2076    accessible, and they are usually called just once (thus doesn't
2077    increase the size of compiled object).  */
2078
2079 /* Decode a composition rule represented by C as a component of
2080    composition sequence of Emacs 20 style.  Set RULE to the decoded
2081    rule.  C is modified in place: it is reduced by 0xA0 and must then
        be in the range 0..80, otherwise control jumps to the
        `invalid_code' label of the caller.  The reduced value is split
        into C / 9 and C % 9, a 4 in either part is replaced by 10, and
        the two parts are combined with COMPOSITION_ENCODE_RULE.  */
2082
2083 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2084   do {                                                  \
2085     int gref, nref;                                     \
2086                                                         \
2087     c -= 0xA0;                                          \
2088     if (c < 0 || c >= 81)                               \
2089       goto invalid_code;                                \
2090     gref = c / 9, nref = c % 9;                         \
2091     if (gref == 4) gref = 10;                           \
2092     if (nref == 4) nref = 10;                           \
2093     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2094   } while (0)
2095
2096
2097 /* Decode a composition rule represented by C and the following byte
2098    at SRC as a component of composition sequence of Emacs 21 style.
2099    Set RULE to the decoded rule.  The following byte is read into C
        with ONE_MORE_BYTE.  Each of the two bytes, reduced by 0x20, must
        be in the range 0..80, otherwise control jumps to the
        `invalid_code' label of the caller.  The two reduced values are
        combined with COMPOSITION_ENCODE_RULE.  */
2100
2101 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2102   do {                                                  \
2103     int gref, nref;                                     \
2104                                                         \
2105     gref = c - 0x20;                                    \
2106     if (gref < 0 || gref >= 81)                         \
2107       goto invalid_code;                                \
2108     ONE_MORE_BYTE (c);                                  \
2109     nref = c - 0x20;                                    \
2110     if (nref < 0 || nref >= 81)                         \
2111       goto invalid_code;                                \
2112     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2113   } while (0)
2114
2115
2116 /* Start of Emacs 21 style format.  C holds the METHOD byte, which has
2117    already been read; the composition method is C - 0xF2.  The next
2118    two bytes at SRC are BYTES and CHARS, each offset by 0xA0, where
2119    BYTES is the byte length of this composition information and CHARS
        is the number of characters composed by this composition.

        Control jumps to the `invalid_code' label of the caller unless
        BYTES - 0xA0 is at least 3 (and exactly 4 for
        COMPOSITION_RELATIVE) and CHARS - 0xA0 is in the range
        1..MAX_COMPOSITION_COMPONENTS - 1.  Otherwise *CMP_STATUS is
        initialized (state COMPOSING_CHAR for a relative composition,
        COMPOSING_COMPONENT_CHAR for the other methods) and
        ADD_COMPOSITION_DATA is called with the decoded values.
        NOTE(review): ADD_COMPOSITION_DATA presumably appends the
        annotation header to CHARBUF -- confirm.  */
2120
2121 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2122   do {                                                                  \
2123     enum composition_method method = c - 0xF2;                          \
2124     int nbytes, nchars;                                                 \
2125                                                                         \
2126     ONE_MORE_BYTE (c);                                                  \
2127     if (c < 0)                                                          \
2128       goto invalid_code;                                                \
2129     nbytes = c - 0xA0;                                                  \
2130     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2131       goto invalid_code;                                                \
2132     ONE_MORE_BYTE (c);                                                  \
2133     nchars = c - 0xA0;                                                  \
2134     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2135       goto invalid_code;                                                \
2136     cmp_status->old_form = 0;                                           \
2137     cmp_status->method = method;                                        \
2138     if (method == COMPOSITION_RELATIVE)                                 \
2139       cmp_status->state = COMPOSING_CHAR;                               \
2140     else                                                                \
2141       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2142     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2143     cmp_status->nchars = nchars;                                        \
2144     cmp_status->ncomps = nbytes - 4;                                    \
2145     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2146   } while (0)
2147
2148
2149 /* Start of Emacs 20 style format for relative composition.  Mark
        *CMP_STATUS as an old-form COMPOSITION_RELATIVE composition that
        expects a character next and has no characters or components
        yet, and call ADD_COMPOSITION_DATA with 0 for both the character
        count and the byte count.  */
2150
2151 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2152   do {                                                          \
2153     cmp_status->old_form = 1;                                   \
2154     cmp_status->method = COMPOSITION_RELATIVE;                  \
2155     cmp_status->state = COMPOSING_CHAR;                         \
2156     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2157     cmp_status->nchars = cmp_status->ncomps = 0;                \
2158     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2159   } while (0)
2160
2161
2162 /* Start of Emacs 20 style format for rule-base composition.  Mark
        *CMP_STATUS as an old-form COMPOSITION_WITH_RULE composition that
        expects a character next and has no characters or components
        yet, and call ADD_COMPOSITION_DATA with 0 for both the character
        count and the byte count.  */
2163
2164 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2165   do {                                                          \
2166     cmp_status->old_form = 1;                                   \
2167     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2168     cmp_status->state = COMPOSING_CHAR;                         \
2169     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2170     cmp_status->nchars = cmp_status->ncomps = 0;                \
2171     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2172   } while (0)
2173
2174
/* Handle the byte that follows the composition starter 0x80.  The
   byte is read into C and selects the format:

     0xF2 + COMPOSITION_RELATIVE .. 0xF2 + COMPOSITION_WITH_RULE_ALTCHARS:
       Emacs 21 style header;
     0xA0..0xBF: Emacs 20 style relative composition; SRC is moved back
       so that the byte is read again as a composition component;
     0xFF: Emacs 20 style rule-base composition.

   For any other value, including a negative one, control jumps to the
   `invalid_code' label of the caller.  */
2175 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2176   do {                                                  \
2177     const unsigned char *current_src = src;             \
2178                                                         \
2179     ONE_MORE_BYTE (c);                                  \
2180     if (c < 0)                                          \
2181       goto invalid_code;                                \
2182     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2183         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2184       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2185     else if (c < 0xA0)                                  \
2186       goto invalid_code;                                \
2187     else if (c < 0xC0)                                  \
2188       {                                                 \
2189         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2190         /* Re-read C as a composition component.  */    \
2191         src = current_src;                              \
2192       }                                                 \
2193     else if (c == 0xFF)                                 \
2194       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2195     else                                                \
2196       goto invalid_code;                                \
2197   } while (0)
2198
/* Close the composition in progress.  The data of the composition
   start cmp_status->length elements before CHARBUF.  For an old-form
   composition, store cmp_status->nchars in the element at index 2 of
   that data (the NCHARS slot of the annotation header).  Otherwise,
   for a method greater than COMPOSITION_RELATIVE, set the first
   element to the element at index 2 minus cmp_status->length.  In
   all cases reset the state to COMPOSING_NO.  */
2199 #define EMACS_MULE_COMPOSITION_END()                            \
2200   do {                                                          \
2201     int idx = - cmp_status->length;                             \
2202                                                                 \
2203     if (cmp_status->old_form)                                   \
2204       charbuf[idx + 2] = cmp_status->nchars;                    \
2205     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2206       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2207     cmp_status->state = COMPOSING_NO;                           \
2208   } while (0)
2209
2210
2211 static int
2212 emacs_mule_finish_composition (int *charbuf,
2213                                struct composition_status *cmp_status)
2214 {
2215   int idx = - cmp_status->length;
2216   int new_chars;
2217
2218   if (cmp_status->old_form && cmp_status->nchars > 0)
2219     {
2220       charbuf[idx + 2] = cmp_status->nchars;
2221       new_chars = 0;
2222       if (cmp_status->method == COMPOSITION_WITH_RULE
2223           && cmp_status->state == COMPOSING_CHAR)
2224         {
2225           /* The last rule was invalid.  */
2226           int rule = charbuf[-1] + 0xA0;
2227
2228           charbuf[-2] = BYTE8_TO_CHAR (rule);
2229           charbuf[-1] = -1;
2230           new_chars = 1;
2231         }
2232     }
2233   else
2234     {
2235       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2236
2237       if (cmp_status->method == COMPOSITION_WITH_RULE)
2238         {
2239           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2240           charbuf[idx++] = -3;
2241           charbuf[idx++] = 0;
2242           new_chars = 1;
2243         }
2244       else
2245         {
2246           int nchars = charbuf[idx + 1] + 0xA0;
2247           int nbytes = charbuf[idx + 2] + 0xA0;
2248
2249           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2250           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2251           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2252           charbuf[idx++] = -1;
2253           new_chars = 4;
2254         }
2255     }
2256   cmp_status->state = COMPOSING_NO;
2257   return new_chars;
2258 }
2259
/* If a composition is in progress, end it with
   emacs_mule_finish_composition and add the count that function
   returns to CHAR_OFFSET.  Uses the caller's CHARBUF, CMP_STATUS and
   CHAR_OFFSET.  */
2260 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2261   do {                                                                    \
2262     if (cmp_status->state != COMPOSING_NO)                                \
2263       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2264   } while (0)
2265
2266
2267 static void
2268 decode_coding_emacs_mule (struct coding_system *coding)
2269 {
2270   const unsigned char *src = coding->source + coding->consumed;
2271   const unsigned char *src_end = coding->source + coding->src_bytes;
2272   const unsigned char *src_base;
2273   int *charbuf = coding->charbuf + coding->charbuf_used;
2274   /* We may produce two annotations (charset and composition) in one
2275      loop and one more charset annotation at the end.  */
2276   int *charbuf_end
2277     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2278       /* We can produce up to 2 characters in a loop.  */
2279       - 1;
2280   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2281   bool multibytep = coding->src_multibyte;
2282   ptrdiff_t char_offset = coding->produced_char;
2283   ptrdiff_t last_offset = char_offset;
2284   int last_id = charset_ascii;
2285   bool eol_dos
2286     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2287   int byte_after_cr = -1;
2288   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2289
2290   if (cmp_status->state != COMPOSING_NO)
2291     {
2292       int i;
2293
2294       if (charbuf_end - charbuf < cmp_status->length)
2295         emacs_abort ();
2296       for (i = 0; i < cmp_status->length; i++)
2297         *charbuf++ = cmp_status->carryover[i];
2298       coding->annotated = 1;
2299     }
2300
2301   while (1)
2302     {
2303       int c, id IF_LINT (= 0);
2304
2305       src_base = src;
2306       consumed_chars_base = consumed_chars;
2307
2308       if (charbuf >= charbuf_end)
2309         {
2310           if (byte_after_cr >= 0)
2311             src_base--;
2312           break;
2313         }
2314
2315       if (byte_after_cr >= 0)
2316         c = byte_after_cr, byte_after_cr = -1;
2317       else
2318         ONE_MORE_BYTE (c);
2319
2320       if (c < 0 || c == 0x80)
2321         {
2322           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2323           if (c < 0)
2324             {
2325               *charbuf++ = -c;
2326               char_offset++;
2327             }
2328           else
2329             DECODE_EMACS_MULE_COMPOSITION_START ();
2330           continue;
2331         }
2332
2333       if (c < 0x80)
2334         {
2335           if (eol_dos && c == '\r')
2336             ONE_MORE_BYTE (byte_after_cr);
2337           id = charset_ascii;
2338           if (cmp_status->state != COMPOSING_NO)
2339             {
2340               if (cmp_status->old_form)
2341                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2342               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2343                 cmp_status->ncomps--;
2344             }
2345         }
2346       else
2347         {
2348           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2349           /* emacs_mule_char can load a charset map from a file, which
2350              allocates a large structure and might cause buffer text
2351              to be relocated as result.  Thus, we need to remember the
2352              original pointer to buffer text, and fix up all related
2353              pointers after the call.  */
2354           const unsigned char *orig = coding->source;
2355           ptrdiff_t offset;
2356
2357           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2358                                cmp_status);
2359           offset = coding->source - orig;
2360           if (offset)
2361             {
2362               src += offset;
2363               src_base += offset;
2364               src_end += offset;
2365             }
2366           if (c < 0)
2367             {
2368               if (c == -1)
2369                 goto invalid_code;
2370               if (c == -2)
2371                 break;
2372             }
2373           src = src_base + nbytes;
2374           consumed_chars = consumed_chars_base + nchars;
2375           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2376             cmp_status->ncomps -= nchars;
2377         }
2378
2379       /* Now if C >= 0, we found a normally encoded character, if C <
2380          0, we found an old-style composition component character or
2381          rule.  */
2382
2383       if (cmp_status->state == COMPOSING_NO)
2384         {
2385           if (last_id != id)
2386             {
2387               if (last_id != charset_ascii)
2388                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2389                                   last_id);
2390               last_id = id;
2391               last_offset = char_offset;
2392             }
2393           *charbuf++ = c;
2394           char_offset++;
2395         }
2396       else if (cmp_status->state == COMPOSING_CHAR)
2397         {
2398           if (cmp_status->old_form)
2399             {
2400               if (c >= 0)
2401                 {
2402                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2403                   *charbuf++ = c;
2404                   char_offset++;
2405                 }
2406               else
2407                 {
2408                   *charbuf++ = -c;
2409                   cmp_status->nchars++;
2410                   cmp_status->length++;
2411                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2412                     EMACS_MULE_COMPOSITION_END ();
2413                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2414                     cmp_status->state = COMPOSING_RULE;
2415                 }
2416             }
2417           else
2418             {
2419               *charbuf++ = c;
2420               cmp_status->length++;
2421               cmp_status->nchars--;
2422               if (cmp_status->nchars == 0)
2423                 EMACS_MULE_COMPOSITION_END ();
2424             }
2425         }
2426       else if (cmp_status->state == COMPOSING_RULE)
2427         {
2428           int rule;
2429
2430           if (c >= 0)
2431             {
2432               EMACS_MULE_COMPOSITION_END ();
2433               *charbuf++ = c;
2434               char_offset++;
2435             }
2436           else
2437             {
2438               c = -c;
2439               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2440               if (rule < 0)
2441                 goto invalid_code;
2442               *charbuf++ = -2;
2443               *charbuf++ = rule;
2444               cmp_status->length += 2;
2445               cmp_status->state = COMPOSING_CHAR;
2446             }
2447         }
2448       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2449         {
2450           *charbuf++ = c;
2451           cmp_status->length++;
2452           if (cmp_status->ncomps == 0)
2453             cmp_status->state = COMPOSING_CHAR;
2454           else if (cmp_status->ncomps > 0)
2455             {
2456               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2457                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2458             }
2459           else
2460             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2461         }
2462       else                      /* COMPOSING_COMPONENT_RULE */
2463         {
2464           int rule;
2465
2466           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2467           if (rule < 0)
2468             goto invalid_code;
2469           *charbuf++ = -2;
2470           *charbuf++ = rule;
2471           cmp_status->length += 2;
2472           cmp_status->ncomps--;
2473           if (cmp_status->ncomps > 0)
2474             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2475           else
2476             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2477         }
2478       continue;
2479
2480     invalid_code:
2481       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2482       src = src_base;
2483       consumed_chars = consumed_chars_base;
2484       ONE_MORE_BYTE (c);
2485       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2486       char_offset++;
2487       coding->errors++;
2488     }
2489
2490  no_more_source:
2491   if (cmp_status->state != COMPOSING_NO)
2492     {
2493       if (coding->mode & CODING_MODE_LAST_BLOCK)
2494         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2495       else
2496         {
2497           int i;
2498
2499           charbuf -= cmp_status->length;
2500           for (i = 0; i < cmp_status->length; i++)
2501             cmp_status->carryover[i] = charbuf[i];
2502         }
2503     }
2504   if (last_id != charset_ascii)
2505     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2506   coding->consumed_char += consumed_chars_base;
2507   coding->consumed = src_base - coding->source;
2508   coding->charbuf_used = charbuf - coding->charbuf;
2509 }
2510
2511
/* Store in CODES (an array of at least two bytes) the leading code(s)
   for the charset whose emacs-mule id is ID.

   If ID is less than 0xA0, ID itself is the only leading code and
   CODES[1] is set to 0, which callers test to decide whether a second
   leading byte has to be emitted.  Otherwise CODES[0] is a prefix
   byte selected by the range ID falls in (0x9A for 0xA0..0xDF, 0x9B
   for 0xE0..0xEF, 0x9C for 0xF0..0xF4, 0x9D for anything larger) and
   CODES[1] is ID.

   ID and CODES are evaluated more than once, so they must not have
   side effects.  The expansion deliberately has no trailing
   semicolon: the caller supplies it, which keeps the macro usable as
   a single statement (e.g. in front of `else').  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2525
2526
/* Encode the contents of CODING->charbuf in emacs-mule format.

   The CODING->charbuf_used elements of CODING->charbuf are read in
   order; the encoded bytes are written to CODING->destination,
   starting at offset CODING->produced.  A negative element introduces
   an annotation, which is consumed but produces no output.  ASCII
   characters and raw 8-bit bytes are emitted as a single byte; any
   other character is emitted as the leading code(s) of its charset
   (see EMACS_MULE_LEADING_CODES) followed by one or two code bytes
   with the high bit set.

   On completion CODING_RESULT_SUCCESS is recorded, the number of
   emitted units is added to CODING->produced_char, CODING->produced is
   set to the new end offset, and 0 is returned (this is the only
   return statement in the function).

   As a side effect, the charset list stored in the coding system's
   attributes is replaced by Vemacs_mule_charset_list when it is not
   already that object.  */
2527 static bool
2528 encode_coding_emacs_mule (struct coding_system *coding)
2529 {
2530   bool multibytep = coding->dst_multibyte;
2531   int *charbuf = coding->charbuf;
2532   int *charbuf_end = charbuf + coding->charbuf_used;
2533   unsigned char *dst = coding->destination + coding->produced;
2534   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Headroom requested from ASSURE_DESTINATION before each element
          of charbuf is processed.  */
2535   int safe_room = 8;
2536   ptrdiff_t produced_chars = 0;
2537   Lisp_Object attrs, charset_list;
2538   int c;
       /* Charset id taken from the most recent charset annotation, or -1
          when there is none or it is not in CHARSET_LIST.  */
2539   int preferred_charset_id = -1;
2540
2541   CODING_GET_INFO (coding, attrs, charset_list);
2542   if (! EQ (charset_list, Vemacs_mule_charset_list))
2543     {
2544       charset_list = Vemacs_mule_charset_list;
2545       ASET (attrs, coding_attr_charset_list, charset_list);
2546     }
2547
2548   while (charbuf < charbuf_end)
2549     {
2550       ASSURE_DESTINATION (safe_room);
2551       c = *charbuf++;
2552
2553       if (c < 0)
2554         {
2555           /* Handle an annotation.  */
               /* CHARBUF now points just past the (negative) head element
                  C, i.e. at the annotation's mask.  */
2556           switch (*charbuf)
2557             {
2558             case CODING_ANNOTATE_COMPOSITION_MASK:
2559               /* Not yet implemented.  */
2560               break;
2561             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): the skip below (`-c - 1') implies that
                      for a head of -4 the annotation occupies only
                      charbuf[0..2] at this point, which would put
                      charbuf[3] past its end.  Confirm the layout
                      produced by ADD_CHARSET_DATA; the charset id may
                      actually live in charbuf[2].  */
2562               preferred_charset_id = charbuf[3];
2563               if (preferred_charset_id >= 0
2564                   && NILP (Fmemq (make_number (preferred_charset_id),
2565                                   charset_list)))
2566                 preferred_charset_id = -1;
2567               break;
2568             default:
2569               emacs_abort ();
2570             }
               /* -C is the total number of elements of the annotation,
                  including the head that was already consumed above, so
                  skip the remaining -C - 1 elements.  */
2571           charbuf += -c - 1;
2572           continue;
2573         }
2574
2575       if (ASCII_CHAR_P (c))
2576         EMIT_ONE_ASCII_BYTE (c);
2577       else if (CHAR_BYTE8_P (c))
2578         {
2579           c = CHAR_TO_BYTE8 (c);
2580           EMIT_ONE_BYTE (c);
2581         }
2582       else
2583         {
2584           struct charset *charset;
2585           unsigned code;
2586           int dimension;
2587           int emacs_mule_id;
2588           unsigned char leading_codes[2];
2589
               /* Try the preferred charset first; if C cannot be encoded
                  in it, fall back to searching CHARSET_LIST.  */
2590           if (preferred_charset_id >= 0)
2591             {
2592               bool result;
2593
2594               charset = CHARSET_FROM_ID (preferred_charset_id);
2595               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2596               if (result)
2597                 code = ENCODE_CHAR (charset, c);
2598               else
2599                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2600                                      &code, charset);
2601             }
2602           else
2603             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2604                                  &code, charset);
2605           if (! charset)
2606             {
                   /* No charset can encode C: substitute the coding
                      system's default character.  */
2607               c = coding->default_char;
2608               if (ASCII_CHAR_P (c))
2609                 {
2610                   EMIT_ONE_ASCII_BYTE (c);
2611                   continue;
2612                 }
                   /* NOTE(review): CHARSET is not re-checked after this
                      second lookup; presumably the default character is
                      always encodable -- confirm.  */
2613               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2614                                    &code, charset);
2615             }
2616           dimension = CHARSET_DIMENSION (charset);
2617           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2618           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2619           EMIT_ONE_BYTE (leading_codes[0]);
               /* leading_codes[1] is 0 when a single leading code
                  suffices.  */
2620           if (leading_codes[1])
2621             EMIT_ONE_BYTE (leading_codes[1]);
               /* Emit the code point with the high bit of each byte set:
                  one byte for a dimension-1 charset, otherwise the high
                  byte followed by the low byte.  */
2622           if (dimension == 1)
2623             EMIT_ONE_BYTE (code | 0x80);
2624           else
2625             {
2626               code |= 0x8080;
2627               EMIT_ONE_BYTE (code >> 8);
2628               EMIT_ONE_BYTE (code & 0xFF);
2629             }
2630         }
2631     }
2632   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2633   coding->produced_char += produced_chars;
2634   coding->produced = dst - coding->destination;
2635   return 0;
2636 }
2637
2638 \f
2639 /*** 7. ISO2022 handlers ***/
2640
2641 /* The following note describes the coding system ISO2022 briefly.
2642    Since the intention of this note is to help understand the
2643    functions in this file, some parts are NOT ACCURATE or are OVERLY
2644    SIMPLIFIED.  For thorough understanding, please refer to the
2645    original document of ISO2022.  This is equivalent to the standard
2646    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2647
2648    ISO2022 provides many mechanisms to encode several character sets
2649    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2650    is encoded using bytes less than 128.  This may make the encoded
2651    text a little bit longer, but the text passes more easily through
2652    several types of gateway, some of which strip off the MSB (Most
2653    Significant Bit).
2654
2655    There are two kinds of character sets: control character sets and
2656    graphic character sets.  The former contain control characters such
2657    as `newline' and `escape' to provide control functions (control
2658    functions are also provided by escape sequences).  The latter
2659    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2660    two control character sets and many graphic character sets.
2661
2662    Graphic character sets are classified into one of the following
2663    four classes, according to the number of bytes (DIMENSION) and
2664    number of characters in one dimension (CHARS) of the set:
2665    - DIMENSION1_CHARS94
2666    - DIMENSION1_CHARS96
2667    - DIMENSION2_CHARS94
2668    - DIMENSION2_CHARS96
2669
2670    In addition, each character set is assigned an identification tag,
2671    unique for each set, called the "final character" (denoted as <F>
2672    hereafter).  The <F> of each character set is decided by ECMA(*)
2673    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2674    (0x30..0x3F are for private use only).
2675
2676    Note (*): ECMA = European Computer Manufacturers Association
2677
2678    Here are examples of graphic character sets [NAME(<F>)]:
2679         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2680         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2681         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2682         o DIMENSION2_CHARS96 -- none for the moment
2683
2684    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2685         C0 [0x00..0x1F] -- control character plane 0
2686         GL [0x20..0x7F] -- graphic character plane 0
2687         C1 [0x80..0x9F] -- control character plane 1
2688         GR [0xA0..0xFF] -- graphic character plane 1
2689
2690    A control character set is directly designated and invoked to C0 or
2691    C1 by an escape sequence.  The most common case is that:
2692    - ISO646's  control character set is designated/invoked to C0, and
2693    - ISO6429's control character set is designated/invoked to C1,
2694    and usually these designations/invocations are omitted in encoded
2695    text.  In a 7-bit environment, only C0 can be used, and a control
2696    character for C1 is encoded by an appropriate escape sequence to
2697    fit into the environment.  All control characters for C1 are
2698    defined to have corresponding escape sequences.
2699
2700    A graphic character set is at first designated to one of four
2701    graphic registers (G0 through G3), then these graphic registers are
2702    invoked to GL or GR.  These designations and invocations can be
2703    done independently.  The most common case is that G0 is invoked to
2704    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2705    these invocations and designations are omitted in encoded text.
2706    In a 7-bit environment, only GL can be used.
2707
2708    When a graphic character set of CHARS94 is invoked to GL, codes
2709    0x20 and 0x7F of the GL area work as control characters SPACE and
2710    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2711    be used.
2712
2713    There are two ways of invocation: locking-shift and single-shift.
2714    With locking-shift, the invocation lasts until the next different
2715    invocation, whereas with single-shift, the invocation affects the
2716    following character only and doesn't affect the locking-shift
2717    state.  Invocations are done by the following control characters or
2718    escape sequences:
2719
2720    ----------------------------------------------------------------------
2721    abbrev  function                  cntrl escape seq   description
2722    ----------------------------------------------------------------------
2723    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2724    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2725    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2726    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2727    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2728    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2729    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2730    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2731    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2732    ----------------------------------------------------------------------
2733    (*) These are not used by any known coding system.
2734
2735    Control characters for these functions are defined by macros
2736    ISO_CODE_XXX in `coding.h'.
2737
2738    Designations are done by the following escape sequences:
2739    ----------------------------------------------------------------------
2740    escape sequence      description
2741    ----------------------------------------------------------------------
2742    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2743    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2744    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2745    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2746    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2747    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2748    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2749    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2750    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2751    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2752    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2753    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2754    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2755    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2756    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2757    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2758    ----------------------------------------------------------------------
2759
2760    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2761    of dimension 1, chars 94, and final character <F>, etc...
2762
2763    Note (*): Although these designations are not allowed in ISO2022,
2764    Emacs accepts them on decoding, and produces them on encoding
2765    CHARS96 character sets in a coding system which is characterized as
2766    7-bit environment, non-locking-shift, and non-single-shift.
2767
2768    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2769    '(' must be omitted.  We refer to this as "short-form" hereafter.
2770
2771    Now you may notice that there are a lot of ways of encoding the
2772    same multilingual text in ISO2022.  Actually, there exist many
2773    coding systems such as Compound Text (used in X11's inter-client
2774    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2775    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2776    localized platforms), and all of these are variants of ISO2022.
2777
2778    In addition to the above, Emacs handles two more kinds of escape
2779    sequences: ISO6429's direction specification and Emacs' private
2780    sequence for specifying character composition.
2781
2782    ISO6429's direction specification takes the following form:
2783         o CSI ']'      -- end of the current direction
2784         o CSI '0' ']'  -- end of the current direction
2785         o CSI '1' ']'  -- start of left-to-right text
2786         o CSI '2' ']'  -- start of right-to-left text
2787    The control character CSI (0x9B: control sequence introducer) is
2788    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2789
2790    Character composition specification takes the following form:
2791         o ESC '0' -- start relative composition
2792         o ESC '1' -- end composition
2793         o ESC '2' -- start rule-base composition (*)
2794         o ESC '3' -- start relative composition with alternate chars  (**)
2795         o ESC '4' -- start rule-base composition with alternate chars  (**)
2796   Since these are not standard escape sequences of any ISO standard,
2797   the use of them with these meanings is restricted to Emacs only.
2798
2799   (*) This form is used only in Emacs 20.7 and older versions,
2800   but newer versions can safely decode it.
2801   (**) This form is used only in Emacs 21.1 and newer versions,
2802   and older versions can't decode it.
2803
2804   Here's a list of example usages of these composition escape
2805   sequences (categorized by `enum composition_method').
2806
2807   COMPOSITION_RELATIVE:
2808         ESC 0 CHAR [ CHAR ] ESC 1
2809   COMPOSITION_WITH_RULE:
2810         ESC 2 CHAR [ RULE CHAR ] ESC 1
2811   COMPOSITION_WITH_ALTCHARS:
2812         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2813   COMPOSITION_WITH_RULE_ALTCHARS:
2814         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2815
/* NOTE(review): presumably maps each of the 256 possible byte values
   to its ISO code class.  The table is neither filled in nor read in
   this part of the file -- confirm against its users.  */
2816 static enum iso_code_class_type iso_code_class[256];
2817
/* Return nonzero if the charset whose id is ID is safe for CODING,
   i.e. ID lies within CODING's safe_charsets table
   (0 .. CODING->max_charset_id) and its entry there is not 255, the
   value used to mark a charset as unsupported.

   A negative ID (such as the value found in an unregistered slot of a
   charset lookup table) is never safe; it is rejected before the
   table is indexed so that no out-of-bounds read can happen.

   ID is evaluated more than once and must not have side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2821
2822 static void
2823 setup_iso_safe_charsets (Lisp_Object attrs)
2824 {
2825   Lisp_Object charset_list, safe_charsets;
2826   Lisp_Object request;
2827   Lisp_Object reg_usage;
2828   Lisp_Object tail;
2829   EMACS_INT reg94, reg96;
2830   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2831   int max_charset_id;
2832
2833   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2834   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2835       && ! EQ (charset_list, Viso_2022_charset_list))
2836     {
2837       charset_list = Viso_2022_charset_list;
2838       ASET (attrs, coding_attr_charset_list, charset_list);
2839       ASET (attrs, coding_attr_safe_charsets, Qnil);
2840     }
2841
2842   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2843     return;
2844
2845   max_charset_id = 0;
2846   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2847     {
2848       int id = XINT (XCAR (tail));
2849       if (max_charset_id < id)
2850         max_charset_id = id;
2851     }
2852
2853   safe_charsets = make_uninit_string (max_charset_id + 1);
2854   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2855   request = AREF (attrs, coding_attr_iso_request);
2856   reg_usage = AREF (attrs, coding_attr_iso_usage);
2857   reg94 = XINT (XCAR (reg_usage));
2858   reg96 = XINT (XCDR (reg_usage));
2859
2860   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2861     {
2862       Lisp_Object id;
2863       Lisp_Object reg;
2864       struct charset *charset;
2865
2866       id = XCAR (tail);
2867       charset = CHARSET_FROM_ID (XINT (id));
2868       reg = Fcdr (Fassq (id, request));
2869       if (! NILP (reg))
2870         SSET (safe_charsets, XINT (id), XINT (reg));
2871       else if (charset->iso_chars_96)
2872         {
2873           if (reg96 < 4)
2874             SSET (safe_charsets, XINT (id), reg96);
2875         }
2876       else
2877         {
2878           if (reg94 < 4)
2879             SSET (safe_charsets, XINT (id), reg94);
2880         }
2881     }
2882   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2883 }
2884
2885
2886 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2887    Return true if a text is encoded in one of ISO-2022 based coding
2888    systems.

        The bytes of CODING->source are scanned, starting after the first
        CODING->head_ascii bytes, while category masks are accumulated in
        the local variables `found' and `rejected'.

        When control reaches the `no_more_source' label, `rejected' is
        merged into DETECT_INFO->rejected, `found & ~rejected' is merged
        into DETECT_INFO->found, and 1 is returned.  If instead all ISO
        categories become rejected during the scan, CATEGORY_MASK_ISO is
        added to DETECT_INFO->rejected and 0 is returned.

        NOTE(review): nothing in this function jumps to `no_more_source'
        explicitly; presumably ONE_MORE_BYTE does so when the source is
        exhausted -- confirm against the macro definition.  */
2889
2890 static bool
2891 detect_coding_iso_2022 (struct coding_system *coding,
2892                         struct coding_detection_info *detect_info)
2893 {
2894   const unsigned char *src = coding->source, *src_base = src;
2895   const unsigned char *src_end = coding->source + coding->src_bytes;
2896   bool multibytep = coding->src_multibyte;
2897   bool single_shifting = 0;
2898   int id;
2899   int c, c1;
2900   ptrdiff_t consumed_chars = 0;
2901   int i;
2902   int rejected = 0;
2903   int found = 0;
       /* -1 while outside a composition sequence; otherwise the number of
          components counted since the composition start sequence.  */
2904   int composition_count = -1;
2905
2906   detect_info->checked |= CATEGORY_MASK_ISO;
2907
       /* Refresh max_charset_id and safe_charsets of each ISO category's
          coding system from its attributes; SAFE_CHARSET_P below relies
          on them.  Categories whose id is negative are skipped.  */
2908   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2909     {
2910       struct coding_system *this = &(coding_categories[i]);
2911       Lisp_Object attrs, val;
2912
2913       if (this->id < 0)
2914         continue;
2915       attrs = CODING_ID_ATTRS (this->id);
2916       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2917           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2918         setup_iso_safe_charsets (attrs);
2919       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2920       this->max_charset_id = SCHARS (val) - 1;
2921       this->safe_charsets = SDATA (val);
2922     }
2923
2924   /* A coding system of this category is always ASCII compatible.  */
2925   src += coding->head_ascii;
2926
2927   while (rejected != CATEGORY_MASK_ISO)
2928     {
2929       src_base = src;
2930       ONE_MORE_BYTE (c);
2931       switch (c)
2932         {
2933         case ISO_CODE_ESC:
2934           if (inhibit_iso_escape_detection)
2935             break;
2936           single_shifting = 0;
2937           ONE_MORE_BYTE (c);
2938           if (c == 'N' || c == 'O')
2939             {
2940               /* ESC <Fe> for SS2 or SS3.  */
2941               single_shifting = 1;
2942               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2943             }
2944           else if (c == '1')
2945             {
2946               /* End of composition.  */
2947               if (composition_count < 0
2948                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2949                 /* Invalid */
2950                 break;
2951               composition_count = -1;
2952               found |= CATEGORY_MASK_ISO;
2953             }
2954           else if (c >= '0' && c <= '4')
2955             {
2956               /* ESC <Fp> for start/end composition.  */
2957               composition_count = 0;
2958             }
2959           else
2960             {
2961               if (c >= '(' && c <= '/')
2962                 {
2963                   /* Designation sequence for a charset of dimension 1.  */
2964                   ONE_MORE_BYTE (c1);
2965                   if (c1 < ' ' || c1 >= 0x80
2966                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2967                     /* Invalid designation sequence.  Just ignore.  */
2968                     break;
2969                 }
2970               else if (c == '$')
2971                 {
2972                   /* Designation sequence for a charset of dimension 2.  */
2973                   ONE_MORE_BYTE (c);
2974                   if (c >= '@' && c <= 'B')
2975                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other designation paths,
                            ID is not checked for a negative value here
                            before it is handed to SAFE_CHARSET_P below.  */
2976                     id = iso_charset_table[1][0][c];
2977                   else if (c >= '(' && c <= '/')
2978                     {
2979                       ONE_MORE_BYTE (c1);
2980                       if (c1 < ' ' || c1 >= 0x80
2981                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2982                         /* Invalid designation sequence.  Just ignore.  */
2983                         break;
2984                     }
2985                   else
2986                     /* Invalid designation sequence.  Just ignore it.  */
2987                     break;
2988                 }
2989               else
2990                 {
2991                   /* Invalid escape sequence.  Just ignore it.  */
2992                   break;
2993                 }
2994
2995               /* We found a valid designation sequence for CHARSET.  */
                   /* Each 7-bit category and iso_8_else is marked found or
                      rejected according to whether the designated charset
                      is safe for that category's coding system.  */
2996               rejected |= CATEGORY_MASK_ISO_8BIT;
2997               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2998                                   id))
2999                 found |= CATEGORY_MASK_ISO_7;
3000               else
3001                 rejected |= CATEGORY_MASK_ISO_7;
3002               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3003                                   id))
3004                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3005               else
3006                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3007               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3008                                   id))
3009                 found |= CATEGORY_MASK_ISO_7_ELSE;
3010               else
3011                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3012               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3013                                   id))
3014                 found |= CATEGORY_MASK_ISO_8_ELSE;
3015               else
3016                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3017             }
3018           break;
3019
3020         case ISO_CODE_SO:
3021         case ISO_CODE_SI:
3022           /* Locking shift out/in.  */
3023           if (inhibit_iso_escape_detection)
3024             break;
3025           single_shifting = 0;
3026           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3027           break;
3028
3029         case ISO_CODE_CSI:
3030           /* Control sequence introducer.  */
3031           single_shifting = 0;
3032           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3033           found |= CATEGORY_MASK_ISO_8_ELSE;
3034           goto check_extra_latin;
3035
3036         case ISO_CODE_SS2:
3037         case ISO_CODE_SS3:
3038           /* Single shift.   */
3039           if (inhibit_iso_escape_detection)
3040             break;
3041           single_shifting = 0;
3042           rejected |= CATEGORY_MASK_ISO_7BIT;
               /* The byte counts as a single shift only for those of the
                  iso_8_1 and iso_8_2 categories whose coding system has
                  the single-shift flag set.  */
3043           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3044               & CODING_ISO_FLAG_SINGLE_SHIFT)
3045             {
3046               found |= CATEGORY_MASK_ISO_8_1;
3047               single_shifting = 1;
3048             }
3049           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3050               & CODING_ISO_FLAG_SINGLE_SHIFT)
3051             {
3052               found |= CATEGORY_MASK_ISO_8_2;
3053               single_shifting = 1;
3054             }
3055           if (single_shifting)
3056             break;
3057           goto check_extra_latin;
3058
3059         default:
3060           if (c < 0)
3061             continue;
3062           if (c < 0x80)
3063             {
3064               if (composition_count >= 0)
3065                 composition_count++;
3066               single_shifting = 0;
3067               break;
3068             }
3069           if (c >= 0xA0)
3070             {
3071               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3072               found |= CATEGORY_MASK_ISO_8_1;
3073               /* Check the length of succeeding codes of the range
3074                  0xA0..0xFF.  If the byte length is even, we include
3075                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3076                  only when we are not single shifting.  */
3077               if (! single_shifting
3078                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3079                 {
3080                   int len = 1;
3081                   while (src < src_end)
3082                     {
3083                       src_base = src;
3084                       ONE_MORE_BYTE (c);
3085                       if (c < 0xA0)
3086                         {
                               /* Un-read the byte that ended the run so
                                  that the outer loop processes it.  */
3087                           src = src_base;
3088                           break;
3089                         }
3090                       len++;
3091                     }
3092
                       /* An odd-length run rejects ISO_8_2 only when it was
                          ended by a byte below 0xA0; a run cut off by the
                          end of the source is not held against it.  */
3093                   if (len & 1 && src < src_end)
3094                     {
3095                       rejected |= CATEGORY_MASK_ISO_8_2;
3096                       if (composition_count >= 0)
3097                         composition_count += len;
3098                     }
3099                   else
3100                     {
3101                       found |= CATEGORY_MASK_ISO_8_2;
3102                       if (composition_count >= 0)
3103                         composition_count += len / 2;
3104                     }
3105                 }
3106               break;
3107             }
               /* Reached by falling through for bytes 0x80..0x9F, and by
                  `goto' from the CSI and SS2/SS3 cases above: unless the
                  byte has an entry in Vlatin_extra_code_table, every ISO
                  category is rejected.  */
3108         check_extra_latin:
3109           if (! VECTORP (Vlatin_extra_code_table)
3110               || NILP (AREF (Vlatin_extra_code_table, c)))
3111             {
3112               rejected = CATEGORY_MASK_ISO;
3113               break;
3114             }
3115           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3116               & CODING_ISO_FLAG_LATIN_EXTRA)
3117             found |= CATEGORY_MASK_ISO_8_1;
3118           else
3119             rejected |= CATEGORY_MASK_ISO_8_1;
3120           rejected |= CATEGORY_MASK_ISO_8_2;
3121           break;
3122         }
3123     }
       /* The loop ended because every ISO category has been rejected.  */
3124   detect_info->rejected |= CATEGORY_MASK_ISO;
3125   return 0;
3126
3127  no_more_source:
3128   detect_info->rejected |= rejected;
3129   detect_info->found |= (found & ~rejected);
3130   return 1;
3131 }
3132
3133
3134 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3135    escape sequence should be kept.

        The charset is looked up with ISO_CHARSET_TABLE (DIM, CHARS_96,
        FINAL).  If FINAL is outside the range '0'..127, if no charset is
        registered for it (negative id), or if the charset is not safe for
        `coding' (see SAFE_CHARSET_P), then -2 is recorded as the
        designation of register REG, CHARS_96 is set to -1, and the macro
        stops there (the `break' leaves the enclosing do-while).

        Otherwise the charset id is recorded as the designation of REG,
        after replacing charset_jisx0201_roman by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        The macro refers to a variable named `coding' at the expansion
        site.  CHARS_96 must be an lvalue.  FINAL is evaluated more than
        once and is not parenthesized in the expansion, so pass a plain
        variable.  */
3136 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3137   do {                                                                  \
3138     int id, prev;                                                       \
3139                                                                         \
3140     if (final < '0' || final >= 128                                     \
3141         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3142         || !SAFE_CHARSET_P (coding, id))                                \
3143       {                                                                 \
3144         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3145         chars_96 = -1;                                                  \
3146         break;                                                          \
3147       }                                                                 \
3148     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3149     if (id == charset_jisx0201_roman)                                   \
3150       {                                                                 \
3151         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3152           id = charset_ascii;                                           \
3153       }                                                                 \
3154     else if (id == charset_jisx0208_1978)                               \
3155       {                                                                 \
3156         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3157           id = charset_jisx0208;                                        \
3158       }                                                                 \
3159     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3160     /* If there was an invalid designation to REG previously, and this  \
3161        designation is ASCII to REG, we should keep this designation     \
3162        sequence.  */                                                    \
3163     if (prev == -2 && id == charset_ascii)                              \
3164       chars_96 = -1;                                                    \
3165   } while (0)
3166
3167
3168 /* Handle these composition sequences (ALT: alternate char):
3169
3170    (1) relative composition: ESC 0 CHAR ... ESC 1
3171    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3172    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3173    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3174
3175    When the start sequence (ESC 0/2/3/4) is found, this annotation
3176    header is produced.
3177
3178         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3179
3180    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3181    produced until the end sequence (ESC 1) is found:
3182
3183    (1) CHAR ... CHAR
3184    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3185    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3186    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3187
3188    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3189    annotation header are updated as below:
3190
3191    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3192    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3193    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3194    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3195
3196    If an error is found while composing, the annotation header is
3197    changed to:
3198
3199         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3200
3201    and the sequence [ -2 DECODED-RULE ] is changed to the original
3202    byte sequence as below:
3203         o the original byte sequence is B: [ B -1 ]
3204         o the original byte sequence is B1 B2: [ B1 B2 ]
3205    and the sequence [ -1 -1 ] is changed to the original byte
3206    sequence:
3207         [ ESC '0' ]
3208 */
3209
3210 /* Decode a composition rule C1 and maybe one more byte from the
3211    source, and set RULE to the encoded composition rule.  If the rule
3212    is invalid, goto invalid_code.

        Two formats are recognized from the value C1 - 32:

        o below 81 (old format): the value is split into GREF (value / 9)
          and NREF (value % 9); a reference value of 4 is replaced by 10
          before the pair is packed with COMPOSITION_ENCODE_RULE.

        o 81 or above (new format): one more byte B is read with
          ONE_MORE_BYTE; the pair (value - 81, B - 32) is checked with
          COMPOSITION_ENCODE_RULE_VALID, packed with
          COMPOSITION_ENCODE_RULE, and 0x100 is added to the result.

        A negative value is rejected.  The macro relies on the variable
        `c1', on ONE_MORE_BYTE, and on the label `invalid_code' of the
        expanding function.  RULE must be an assignable int.  */
3213
3214 #define DECODE_COMPOSITION_RULE(rule)                                   \
3215   do {                                                                  \
3216     rule = c1 - 32;                                                     \
3217     if (rule < 0)                                                       \
3218       goto invalid_code;                                                \
3219     if (rule < 81)              /* old format (before ver.21) */        \
3220       {                                                                 \
3221         int gref = (rule) / 9;                                          \
3222         int nref = (rule) % 9;                                          \
3223         if (gref == 4) gref = 10;                                       \
3224         if (nref == 4) nref = 10;                                       \
3225         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3226       }                                                                 \
3227     else                        /* new format (after ver.21) */         \
3228       {                                                                 \
3229         int b;                                                          \
3230                                                                         \
3231         ONE_MORE_BYTE (b);                                              \
3232         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3233           goto invalid_code;                                            \
3234         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3235         rule += 0x100;   /* Distinguish it from the old format.  */     \
3236       }                                                                 \
3237   } while (0)
3238 /* Turn the decoded composition rule RULE back into source byte(s),
        storing them in charbuf[idx] and charbuf[idx + 1].

        GREF and NREF are recovered as (RULE % 0x100) / 12 and
        (RULE % 0x100) % 12.  (NOTE(review): this presumes
        COMPOSITION_ENCODE_RULE packs the pair as GREF * 12 + NREF; its
        definition is not visible here -- confirm.)

        o RULE below 0x100 (old format): reference values of 10 are
          mapped back to 4, the single byte 32 + GREF * 9 + NREF is
          stored, the second slot is set to -1, and new_chars is
          incremented by one.

        o otherwise (new format): the two bytes 32 + 81 + GREF and
          32 + NREF are stored and new_chars is incremented by two.

        The macro uses the variables `charbuf', `idx' and `new_chars' of
        the expanding function.  RULE is expanded several times without
        parentheses, so it should be a simple expression; it is last
        read before charbuf[idx + 1] is overwritten.  */
3239 #define ENCODE_COMPOSITION_RULE(rule)                           \
3240   do {                                                          \
3241     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3242                                                                 \
3243     if (rule < 0x100)           /* old format */                \
3244       {                                                         \
3245         if (gref == 10) gref = 4;                               \
3246         if (nref == 10) nref = 4;                               \
3247         charbuf[idx] = 32 + gref * 9 + nref;                    \
3248         charbuf[idx + 1] = -1;                                  \
3249         new_chars++;                                            \
3250       }                                                         \
3251     else                                /* new format */        \
3252       {                                                         \
3253         charbuf[idx] = 32 + 81 + gref;                          \
3254         charbuf[idx + 1] = 32 + nref;                           \
3255         new_chars += 2;                                         \
3256       }                                                         \
3257   } while (0)
3258
3259 /* Finish the current composition as invalid.

        CHARBUF points just past the data stored so far for the
        composition described by CMP_STATUS; that data starts
        CMP_STATUS->length elements before CHARBUF.  The first five
        elements (the annotation header) are overwritten with

             [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]

        where the second element is chosen from CMP_STATUS->method.
        For methods at or above COMPOSITION_WITH_RULE the remaining
        elements are scanned: each [ -2 RULE ] pair is replaced by
        ENCODE_COMPOSITION_RULE, and each [ -1 -1 ] pair by [ ESC '0' ].

        CMP_STATUS->state is reset to COMPOSING_NO.  The return value
        is CMP_STATUS->nchars plus the counts added during the scan;
        MAYBE_FINISH_COMPOSITION adds it to char_offset.  */
3260
3261 static int
3262 finish_composition (int *charbuf, struct composition_status *cmp_status)
3263 {
       /* IDX is negative: it indexes backward from CHARBUF.  The names
          `idx' and `new_chars' are used implicitly by
          ENCODE_COMPOSITION_RULE and must not be changed.  */
3264   int idx = - cmp_status->length;
3265   int new_chars;
3266
3267   /* Recover the original ESC sequence */
3268   charbuf[idx++] = ISO_CODE_ESC;
3269   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3270                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3271                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3272                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3273                     : '4');
       /* Overwrite the remaining three header slots.  */
3274   charbuf[idx++] = -2;
3275   charbuf[idx++] = 0;
3276   charbuf[idx++] = -1;
3277   new_chars = cmp_status->nchars;
       /* Only these methods can have stored rule or [ -1 -1 ] markers.  */
3278   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3279     for (; idx < 0; idx++)
3280       {
3281         int elt = charbuf[idx];
3282
3283         if (elt == -2)
3284           {
               /* [ -2 RULE ]: rewrite both slots, then step over the
                  second one.  */
3285             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3286             idx++;
3287           }
3288         else if (elt == -1)
3289           {
               /* First element of a [ -1 -1 ] pair: rewrite the pair as
                  ESC '0'; the loop increment steps over the '0'.  */
3290             charbuf[idx++] = ISO_CODE_ESC;
3291             charbuf[idx] = '0';
3292             new_chars += 2;
3293           }
3294       }
3295   cmp_status->state = COMPOSING_NO;
3296   return new_chars;
3297 }
3298
3299 /* If characters are under composition, finish the composition.
        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition and add its return value to char_offset.
        The macro uses the variables `cmp_status', `charbuf' and
        `char_offset' of the expanding function.  */
3300 #define MAYBE_FINISH_COMPOSITION()                              \
3301   do {                                                          \
3302     if (cmp_status->state != COMPOSING_NO)                      \
3303       char_offset += finish_composition (charbuf, cmp_status);  \
3304   } while (0)
3305
3306 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3307
3308    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3309    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3310    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3311    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3312
3313    Produce this annotation sequence now:
3314
3315    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        C1 is the byte following ESC.

        If C1 is '0' while the alternate characters of an ESC 3 or ESC 4
        composition are being read (state COMPOSING_COMPONENT_CHAR with
        method COMPOSITION_WITH_ALTCHARS, or state
        COMPOSING_COMPONENT_RULE with method
        COMPOSITION_WITH_RULE_ALTCHARS), no new composition is started.
        Instead the pair [ -1 -1 ] is stored, the state becomes
        COMPOSING_CHAR, and cmp_status->length grows by two.

        Otherwise any composition in progress is finished as invalid,
        the method is chosen from C1, the state is set to COMPOSING_CHAR
        for ESC 0 and ESC 2 or to COMPOSING_COMPONENT_CHAR for ESC 3 and
        ESC 4, the annotation header is stored with
        ADD_COMPOSITION_DATA, cmp_status->length is set to
        MAX_ANNOTATION_LENGTH, and the counters nchars and ncomps are
        cleared.

        The macro uses the variables `cmp_status', `charbuf',
        `char_offset' and `coding' of the expanding function.
3316 */
3317
3318 #define DECODE_COMPOSITION_START(c1)                                       \
3319   do {                                                                     \
3320     if (c1 == '0'                                                          \
3321         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3322              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3323             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3324                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3325       {                                                                    \
3326         *charbuf++ = -1;                                                   \
3327         *charbuf++= -1;                                                    \
3328         cmp_status->state = COMPOSING_CHAR;                                \
3329         cmp_status->length += 2;                                           \
3330       }                                                                    \
3331     else                                                                   \
3332       {                                                                    \
3333         MAYBE_FINISH_COMPOSITION ();                                       \
3334         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3335                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3336                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3337                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3338         cmp_status->state                                                  \
3339           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3340         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3341         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3342         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3343         coding->annotated = 1;                                             \
3344       }                                                                    \
3345   } while (0)
3346
3347
3348 /* Handle composition end sequence ESC 1.

        The end sequence is rejected when no composed character has
        been stored (nchars is zero), or when
        (state == COMPOSING_CHAR) equals
        (method == COMPOSITION_WITH_RULE): that is, for
        COMPOSITION_WITH_RULE the state must not be COMPOSING_CHAR, and
        for every other method it must be COMPOSING_CHAR.  On rejection
        the composition is finished as invalid and control jumps to the
        label `invalid_code'.

        Otherwise the annotation header, which starts
        cmp_status->length elements before charbuf, is patched:

        o for COMPOSITION_WITH_ALTCHARS the first header element (the
          negated length) is decreased by ncomps + 2;
        o for COMPOSITION_WITH_RULE_ALTCHARS it is decreased by
          ncomps * 3;
        o the third header element is set to nchars.

        Finally nchars is added to char_offset and the state is reset
        to COMPOSING_NO.  The macro uses the variables `cmp_status',
        `charbuf' and `char_offset' of the expanding function.  */
3349
3350 #define DECODE_COMPOSITION_END()                                        \
3351   do {                                                                  \
3352     if (cmp_status->nchars == 0                                         \
3353         || ((cmp_status->state == COMPOSING_CHAR)                       \
3354             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3355       {                                                                 \
3356         MAYBE_FINISH_COMPOSITION ();                                    \
3357         goto invalid_code;                                              \
3358       }                                                                 \
3359     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3360       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3361     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3362       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3363     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3364     char_offset += cmp_status->nchars;                                  \
3365     cmp_status->state = COMPOSING_NO;                                   \
3366   } while (0)
3367
3368 /* Store a composition rule RULE in charbuf, and update cmp_status.

        The pair [ -2 RULE ] is appended, cmp_status->length grows by
        two, and cmp_status->state is decremented.  (NOTE(review): the
        decrement presumably returns from a rule-expecting state to the
        matching character-expecting state; this depends on the order
        of the state enumeration, which is not visible here.)  The
        macro uses the variables `charbuf' and `cmp_status' of the
        expanding function.  */
3369
3370 #define STORE_COMPOSITION_RULE(rule)    \
3371   do {                                  \
3372     *charbuf++ = -2;                    \
3373     *charbuf++ = rule;                  \
3374     cmp_status->length += 2;            \
3375     cmp_status->state--;                \
3376   } while (0)
3377
3378 /* Store a composed char or a component char C in charbuf, and update
3379    cmp_status.

        C is appended and cmp_status->length grows by one.  The
        character is counted in nchars when the state is
        COMPOSING_CHAR, and in ncomps otherwise.  The state is then
        incremented when the method is COMPOSITION_WITH_RULE, or when
        the method is COMPOSITION_WITH_RULE_ALTCHARS and the state is
        COMPOSING_COMPONENT_CHAR.  (NOTE(review): the increment
        presumably selects the matching rule-expecting state, which
        STORE_COMPOSITION_RULE undoes; this depends on the order of the
        state enumeration, which is not visible here.)  The macro uses
        the variables `charbuf' and `cmp_status' of the expanding
        function.  */
3380
3381 #define STORE_COMPOSITION_CHAR(c)                                       \
3382   do {                                                                  \
3383     *charbuf++ = (c);                                                   \
3384     cmp_status->length++;                                               \
3385     if (cmp_status->state == COMPOSING_CHAR)                            \
3386       cmp_status->nchars++;                                             \
3387     else                                                                \
3388       cmp_status->ncomps++;                                             \
3389     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3390         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3391             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3392       cmp_status->state++;                                              \
3393   } while (0)
3394
3395
3396 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3397
3398 static void
3399 decode_coding_iso_2022 (struct coding_system *coding)
3400 {
3401   const unsigned char *src = coding->source + coding->consumed;
3402   const unsigned char *src_end = coding->source + coding->src_bytes;
3403   const unsigned char *src_base;
3404   int *charbuf = coding->charbuf + coding->charbuf_used;
3405   /* We may produce two annotations (charset and composition) in one
3406      loop and one more charset annotation at the end.  */
3407   int *charbuf_end
3408     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3409   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3410   bool multibytep = coding->src_multibyte;
3411   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3412   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3413   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3414   int charset_id_2, charset_id_3;
3415   struct charset *charset;
3416   int c;
3417   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3418   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3419   ptrdiff_t char_offset = coding->produced_char;
3420   ptrdiff_t last_offset = char_offset;
3421   int last_id = charset_ascii;
3422   bool eol_dos
3423     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3424   int byte_after_cr = -1;
3425   int i;
3426
3427   setup_iso_safe_charsets (attrs);
3428   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3429
3430   if (cmp_status->state != COMPOSING_NO)
3431     {
3432       if (charbuf_end - charbuf < cmp_status->length)
3433         emacs_abort ();
3434       for (i = 0; i < cmp_status->length; i++)
3435         *charbuf++ = cmp_status->carryover[i];
3436       coding->annotated = 1;
3437     }
3438
3439   while (1)
3440     {
3441       int c1, c2, c3;
3442
3443       src_base = src;
3444       consumed_chars_base = consumed_chars;
3445
3446       if (charbuf >= charbuf_end)
3447         {
3448           if (byte_after_cr >= 0)
3449             src_base--;
3450           break;
3451         }
3452
3453       if (byte_after_cr >= 0)
3454         c1 = byte_after_cr, byte_after_cr = -1;
3455       else
3456         ONE_MORE_BYTE (c1);
3457       if (c1 < 0)
3458         goto invalid_code;
3459
3460       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3461         {
3462           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3463           char_offset++;
3464           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3465           continue;
3466         }
3467
3468       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3469         {
3470           if (c1 == ISO_CODE_ESC)
3471             {
3472               if (src + 1 >= src_end)
3473                 goto no_more_source;
3474               *charbuf++ = ISO_CODE_ESC;
3475               char_offset++;
3476               if (src[0] == '%' && src[1] == '@')
3477                 {
3478                   src += 2;
3479                   consumed_chars += 2;
3480                   char_offset += 2;
3481                   /* We are sure charbuf can contain two more chars. */
3482                   *charbuf++ = '%';
3483                   *charbuf++ = '@';
3484                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3485                 }
3486             }
3487           else
3488             {
3489               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3490               char_offset++;
3491             }
3492           continue;
3493         }
3494
3495       if ((cmp_status->state == COMPOSING_RULE
3496            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3497           && c1 != ISO_CODE_ESC)
3498         {
3499           int rule;
3500
3501           DECODE_COMPOSITION_RULE (rule);
3502           STORE_COMPOSITION_RULE (rule);
3503           continue;
3504         }
3505
3506       /* We produce at most one character.  */
3507       switch (iso_code_class [c1])
3508         {
3509         case ISO_0x20_or_0x7F:
3510           if (charset_id_0 < 0
3511               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3512             /* This is SPACE or DEL.  */
3513             charset = CHARSET_FROM_ID (charset_ascii);
3514           else
3515             charset = CHARSET_FROM_ID (charset_id_0);
3516           break;
3517
3518         case ISO_graphic_plane_0:
3519           if (charset_id_0 < 0)
3520             charset = CHARSET_FROM_ID (charset_ascii);
3521           else
3522             charset = CHARSET_FROM_ID (charset_id_0);
3523           break;
3524
3525         case ISO_0xA0_or_0xFF:
3526           if (charset_id_1 < 0
3527               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3528               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3529             goto invalid_code;
3530           /* This is a graphic character, we fall down ... */
3531
3532         case ISO_graphic_plane_1:
3533           if (charset_id_1 < 0)
3534             goto invalid_code;
3535           charset = CHARSET_FROM_ID (charset_id_1);
3536           break;
3537
3538         case ISO_control_0:
3539           if (eol_dos && c1 == '\r')
3540             ONE_MORE_BYTE (byte_after_cr);
3541           MAYBE_FINISH_COMPOSITION ();
3542           charset = CHARSET_FROM_ID (charset_ascii);
3543           break;
3544
3545         case ISO_control_1:
3546           goto invalid_code;
3547
3548         case ISO_shift_out:
3549           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3550               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3551             goto invalid_code;
3552           CODING_ISO_INVOCATION (coding, 0) = 1;
3553           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3554           continue;
3555
3556         case ISO_shift_in:
3557           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3558             goto invalid_code;
3559           CODING_ISO_INVOCATION (coding, 0) = 0;
3560           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3561           continue;
3562
3563         case ISO_single_shift_2_7:
3564           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3565             goto invalid_code;
3566         case ISO_single_shift_2:
3567           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3568             goto invalid_code;
3569           /* SS2 is handled as an escape sequence of ESC 'N' */
3570           c1 = 'N';
3571           goto label_escape_sequence;
3572
3573         case ISO_single_shift_3:
3574           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3575             goto invalid_code;
3576           /* SS2 is handled as an escape sequence of ESC 'O' */
3577           c1 = 'O';
3578           goto label_escape_sequence;
3579
3580         case ISO_control_sequence_introducer:
3581           /* CSI is handled as an escape sequence of ESC '[' ...  */
3582           c1 = '[';
3583           goto label_escape_sequence;
3584
3585         case ISO_escape:
3586           ONE_MORE_BYTE (c1);
3587         label_escape_sequence:
3588           /* Escape sequences handled here are invocation,
3589              designation, direction specification, and character
3590              composition specification.  */
3591           switch (c1)
3592             {
3593             case '&':           /* revision of following character set */
3594               ONE_MORE_BYTE (c1);
3595               if (!(c1 >= '@' && c1 <= '~'))
3596                 goto invalid_code;
3597               ONE_MORE_BYTE (c1);
3598               if (c1 != ISO_CODE_ESC)
3599                 goto invalid_code;
3600               ONE_MORE_BYTE (c1);
3601               goto label_escape_sequence;
3602
3603             case '$':           /* designation of 2-byte character set */
3604               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3605                 goto invalid_code;
3606               {
3607                 int reg, chars96;
3608
3609                 ONE_MORE_BYTE (c1);
3610                 if (c1 >= '@' && c1 <= 'B')
3611                   {     /* designation of JISX0208.1978, GB2312.1980,
3612                            or JISX0208.1980 */
3613                     reg = 0, chars96 = 0;
3614                   }
3615                 else if (c1 >= 0x28 && c1 <= 0x2B)
3616                   { /* designation of DIMENSION2_CHARS94 character set */
3617                     reg = c1 - 0x28, chars96 = 0;
3618                     ONE_MORE_BYTE (c1);
3619                   }
3620                 else if (c1 >= 0x2C && c1 <= 0x2F)
3621                   { /* designation of DIMENSION2_CHARS96 character set */
3622                     reg = c1 - 0x2C, chars96 = 1;
3623                     ONE_MORE_BYTE (c1);
3624                   }
3625                 else
3626                   goto invalid_code;
3627                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3628                 /* We must update these variables now.  */
3629                 if (reg == 0)
3630                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3631                 else if (reg == 1)
3632                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3633                 if (chars96 < 0)
3634                   goto invalid_code;
3635               }
3636               continue;
3637
3638             case 'n':           /* invocation of locking-shift-2 */
3639               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3640                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3641                 goto invalid_code;
3642               CODING_ISO_INVOCATION (coding, 0) = 2;
3643               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644               continue;
3645
3646             case 'o':           /* invocation of locking-shift-3 */
3647               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3648                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3649                 goto invalid_code;
3650               CODING_ISO_INVOCATION (coding, 0) = 3;
3651               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3652               continue;
3653
3654             case 'N':           /* invocation of single-shift-2 */
3655               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3656                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3657                 goto invalid_code;
3658               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3659               if (charset_id_2 < 0)
3660                 charset = CHARSET_FROM_ID (charset_ascii);
3661               else
3662                 charset = CHARSET_FROM_ID (charset_id_2);
3663               ONE_MORE_BYTE (c1);
3664               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3665                 goto invalid_code;
3666               break;
3667
3668             case 'O':           /* invocation of single-shift-3 */
3669               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3670                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3671                 goto invalid_code;
3672               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3673               if (charset_id_3 < 0)
3674                 charset = CHARSET_FROM_ID (charset_ascii);
3675               else
3676                 charset = CHARSET_FROM_ID (charset_id_3);
3677               ONE_MORE_BYTE (c1);
3678               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3679                 goto invalid_code;
3680               break;
3681
3682             case '0': case '2': case '3': case '4': /* start composition */
3683               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3684                 goto invalid_code;
3685               if (last_id != charset_ascii)
3686                 {
3687                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3688                   last_id = charset_ascii;
3689                   last_offset = char_offset;
3690                 }
3691               DECODE_COMPOSITION_START (c1);
3692               continue;
3693
3694             case '1':           /* end composition */
3695               if (cmp_status->state == COMPOSING_NO)
3696                 goto invalid_code;
3697               DECODE_COMPOSITION_END ();
3698               continue;
3699
3700             case '[':           /* specification of direction */
3701               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3702                 goto invalid_code;
3703               /* For the moment, nested direction is not supported.
3704                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3705                  left-to-right, and nonzero means right-to-left.  */
3706               ONE_MORE_BYTE (c1);
3707               switch (c1)
3708                 {
3709                 case ']':       /* end of the current direction */
3710                   coding->mode &= ~CODING_MODE_DIRECTION;
3711
3712                 case '0':       /* end of the current direction */
3713                 case '1':       /* start of left-to-right direction */
3714                   ONE_MORE_BYTE (c1);
3715                   if (c1 == ']')
3716                     coding->mode &= ~CODING_MODE_DIRECTION;
3717                   else
3718                     goto invalid_code;
3719                   break;
3720
3721                 case '2':       /* start of right-to-left direction */
3722                   ONE_MORE_BYTE (c1);
3723                   if (c1 == ']')
3724                     coding->mode |= CODING_MODE_DIRECTION;
3725                   else
3726                     goto invalid_code;
3727                   break;
3728
3729                 default:
3730                   goto invalid_code;
3731                 }
3732               continue;
3733
3734             case '%':
3735               ONE_MORE_BYTE (c1);
3736               if (c1 == '/')
3737                 {
3738                   /* CTEXT extended segment:
3739                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3740                      We keep these bytes as is for the moment.
3741                      They may be decoded by post-read-conversion.  */
3742                   int dim, M, L;
3743                   int size;
3744
3745                   ONE_MORE_BYTE (dim);
3746                   if (dim < '0' || dim > '4')
3747                     goto invalid_code;
3748                   ONE_MORE_BYTE (M);
3749                   if (M < 128)
3750                     goto invalid_code;
3751                   ONE_MORE_BYTE (L);
3752                   if (L < 128)
3753                     goto invalid_code;
3754                   size = ((M - 128) * 128) + (L - 128);
3755                   if (charbuf + 6 > charbuf_end)
3756                     goto break_loop;
3757                   *charbuf++ = ISO_CODE_ESC;
3758                   *charbuf++ = '%';
3759                   *charbuf++ = '/';
3760                   *charbuf++ = dim;
3761                   *charbuf++ = BYTE8_TO_CHAR (M);
3762                   *charbuf++ = BYTE8_TO_CHAR (L);
3763                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3764                 }
3765               else if (c1 == 'G')
3766                 {
3767                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3768                      ESC % G --UTF-8-BYTES-- ESC % @
3769                      We keep these bytes as is for the moment.
3770                      They may be decoded by post-read-conversion.  */
3771                   if (charbuf + 3 > charbuf_end)
3772                     goto break_loop;
3773                   *charbuf++ = ISO_CODE_ESC;
3774                   *charbuf++ = '%';
3775                   *charbuf++ = 'G';
3776                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3777                 }
3778               else
3779                 goto invalid_code;
3780               continue;
3781               break;
3782
3783             default:
3784               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3785                 goto invalid_code;
3786               {
3787                 int reg, chars96;
3788
3789                 if (c1 >= 0x28 && c1 <= 0x2B)
3790                   { /* designation of DIMENSION1_CHARS94 character set */
3791                     reg = c1 - 0x28, chars96 = 0;
3792                     ONE_MORE_BYTE (c1);
3793                   }
3794                 else if (c1 >= 0x2C && c1 <= 0x2F)
3795                   { /* designation of DIMENSION1_CHARS96 character set */
3796                     reg = c1 - 0x2C, chars96 = 1;
3797                     ONE_MORE_BYTE (c1);
3798                   }
3799                 else
3800                   goto invalid_code;
3801                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3802                 /* We must update these variables now.  */
3803                 if (reg == 0)
3804                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3805                 else if (reg == 1)
3806                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3807                 if (chars96 < 0)
3808                   goto invalid_code;
3809               }
3810               continue;
3811             }
3812           break;
3813
3814         default:
3815           emacs_abort ();
3816         }
3817
3818       if (cmp_status->state == COMPOSING_NO
3819           && charset->id != charset_ascii
3820           && last_id != charset->id)
3821         {
3822           if (last_id != charset_ascii)
3823             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3824           last_id = charset->id;
3825           last_offset = char_offset;
3826         }
3827
3828       /* Now we know CHARSET and 1st position code C1 of a character.
3829          Produce a decoded character while getting 2nd and 3rd
3830          position codes C2, C3 if necessary.  */
3831       if (CHARSET_DIMENSION (charset) > 1)
3832         {
3833           ONE_MORE_BYTE (c2);
3834           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3835               || ((c1 & 0x80) != (c2 & 0x80)))
3836             /* C2 is not in a valid range.  */
3837             goto invalid_code;
3838           if (CHARSET_DIMENSION (charset) == 2)
3839             c1 = (c1 << 8) | c2;
3840           else
3841             {
3842               ONE_MORE_BYTE (c3);
3843               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3844                   || ((c1 & 0x80) != (c3 & 0x80)))
3845                 /* C3 is not in a valid range.  */
3846                 goto invalid_code;
3847               c1 = (c1 << 16) | (c2 << 8) | c2;
3848             }
3849         }
3850       c1 &= 0x7F7F7F;
3851       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3852       if (c < 0)
3853         {
3854           MAYBE_FINISH_COMPOSITION ();
3855           for (; src_base < src; src_base++, char_offset++)
3856             {
3857               if (ASCII_BYTE_P (*src_base))
3858                 *charbuf++ = *src_base;
3859               else
3860                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3861             }
3862         }
3863       else if (cmp_status->state == COMPOSING_NO)
3864         {
3865           *charbuf++ = c;
3866           char_offset++;
3867         }
3868       else if ((cmp_status->state == COMPOSING_CHAR
3869                 ? cmp_status->nchars
3870                 : cmp_status->ncomps)
3871                >= MAX_COMPOSITION_COMPONENTS)
3872         {
3873           /* Too long composition.  */
3874           MAYBE_FINISH_COMPOSITION ();
3875           *charbuf++ = c;
3876           char_offset++;
3877         }
3878       else
3879         STORE_COMPOSITION_CHAR (c);
3880       continue;
3881
3882     invalid_code:
3883       MAYBE_FINISH_COMPOSITION ();
3884       src = src_base;
3885       consumed_chars = consumed_chars_base;
3886       ONE_MORE_BYTE (c);
3887       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3888       char_offset++;
3889       coding->errors++;
3890       continue;
3891
3892     break_loop:
3893       break;
3894     }
3895
3896  no_more_source:
3897   if (cmp_status->state != COMPOSING_NO)
3898     {
3899       if (coding->mode & CODING_MODE_LAST_BLOCK)
3900         MAYBE_FINISH_COMPOSITION ();
3901       else
3902         {
3903           charbuf -= cmp_status->length;
3904           for (i = 0; i < cmp_status->length; i++)
3905             cmp_status->carryover[i] = charbuf[i];
3906         }
3907     }
3908   else if (last_id != charset_ascii)
3909     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3910   coding->consumed_char += consumed_chars_base;
3911   coding->consumed = src_base - coding->source;
3912   coding->charbuf_used = charbuf - coding->charbuf;
3913 }
3914
3915
3916 /* ISO2022 encoding stuff.  */
3917
3918 /*
3919    It is not enough to say just "ISO2022" on encoding, we have to
3920    specify more details.  In Emacs, each coding system of ISO2022
3921    variant has the following specifications:
3922         1. Initial designation to G0 thru G3.
3923         2. Allows short-form designation?
3924         3. ASCII should be designated to G0 before control characters?
3925         4. ASCII should be designated to G0 at end of line?
3926         5. 7-bit environment or 8-bit environment?
3927         6. Use locking-shift?
3928         7. Use Single-shift?
3929    And the following two are only for Japanese:
3930         8. Use ASCII in place of JIS0201-1976-Roman?
3931         9. Use JISX0208-1983 in place of JISX0208-1978?
3932    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3933    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3934    details.
3935 */
3936
3937 /* Produce codes (escape sequence) for designating CHARSET to graphic
3938    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3939    '@', 'A', or 'B' and the coding system CODING allows, produce
3940    designation sequence of short-form.  */
3941
#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate bytes for 94-character and 96-character sets,       \
       indexed by graphic register number REG.  */                      \
    const char *intermediates_94 = "()*+";                              \
    const char *intermediates_96 = ",-./";                              \
    unsigned char designation_final = CHARSET_ISO_FINAL (charset);      \
    /* A revision number is announced only when CODING asks for it.  */ \
    int designation_revision                                            \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset)                               \
         : -1);                                                         \
                                                                        \
    if (designation_revision >= 0)                                      \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + designation_revision);                     \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* Multi-byte charset: ESC '$' [intermediate] final.  The       \
           intermediate byte of a 94-character set is dropped           \
           (short form) only for register 0 with a final byte in        \
           '@'..'B' when long form is not requested.  */                \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (intermediates_96[reg]);                  \
        else if ((CODING_ISO_FLAGS (coding)                             \
                  & CODING_ISO_FLAG_LONG_FORM)                          \
                 || reg != 0                                            \
                 || designation_final < '@'                             \
                 || designation_final > 'B')                            \
          EMIT_ONE_ASCII_BYTE (intermediates_94[reg]);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Single-byte charset: ESC intermediate final.  */             \
        int intermediate = (CHARSET_ISO_CHARS_96 (charset)              \
                            ? intermediates_96[reg]                     \
                            : intermediates_94[reg]);                   \
        EMIT_ONE_ASCII_BYTE (intermediate);                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (designation_final);                            \
                                                                        \
    /* Remember that CHARSET now occupies register REG.  */             \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3984
3985
3986 /* The following two macros produce codes (control character or escape
3987    sequence) for ISO2022 single-shift functions (single-shift-2 and
3988    single-shift-3).  */
3989
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    /* CODING is not a macro parameter; it is taken from the scope      \
       of the expansion.  With the seven-bits flag set, SS2 is          \
       written as the two bytes ESC 'N'; otherwise as the single        \
       byte ISO_CODE_SS2.  */                                           \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    /* Tell the character encoders that the next character is           \
       single-shifted; they clear this flag after emitting it.  */      \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3998
3999
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    /* CODING is not a macro parameter; it is taken from the scope      \
       of the expansion.  With the seven-bits flag set, SS3 is          \
       written as the two bytes ESC 'O'; otherwise as the single        \
       byte ISO_CODE_SS3.  */                                           \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    /* Tell the character encoders that the next character is           \
       single-shifted; they clear this flag after emitting it.  */      \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4008
4009
4010 /* The following four macros produce codes (control character or
4011    escape sequence) for ISO2022 locking-shift functions (shift-in,
4012    shift-out, locking-shift-2, and locking-shift-3).  */
4013
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    /* Emit SI, then record that graphic register 0     \
       is the one invoked on graphic plane 0.  */       \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)
4019
4020
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    /* Emit SO, then record that graphic register 1     \
       is the one invoked on graphic plane 0.  */       \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)
4026
4027
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    /* Emit ESC 'n' (locking-shift-2), then record      \
       that graphic register 2 is the one invoked on    \
       graphic plane 0.  */                             \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)
4033
4034
/* Locking-shift-3 (LS3): invoke graphic register 3 on graphic plane 0.
   The escape sequence for LS3 is ESC 'o'.  ESC 'n' is LS2, so emitting
   it here would make a decoder invoke register 2 while this encoder
   believes register 3 is invoked.  CODING is taken from the scope of
   the expansion.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4040
4041
4042 /* Produce codes for a DIMENSION1 character whose character set is
4043    CHARSET and whose position-code is C1.  Designation and invocation
4044    sequences are also produced in advance if necessary.  */
4045
/* CHARSET must be an lvalue: it is reassigned when JISX0201-Roman is
   substituted for ASCII.  C1 may be any integer expression; it is
   parenthesized at every use so that an argument such as `a | b' is
   masked as a whole.  CODING, DST and PRODUCED_CHARS are taken from
   the scope of the expansion.  */
#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    /* Emit ASCII as JISX0201-Roman when CODING requests it.  */        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift was just emitted; it covers exactly this      \
           one character, so clear the flag afterwards.  */             \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked on graphic plane 0: high bit clear.  */              \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked on graphic plane 1: high bit set.  */                \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4084
4085
4086 /* Produce codes for a DIMENSION2 character whose character set is
4087    CHARSET and whose position-codes are C1 and C2.  Designation and
4088    invocation codes are also produced in advance if necessary.  */
4089
#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    /* Emit JISX0208 as JISX0208-1978 when CODING requests it.  */      \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding)                           \
        && id != CODING_ISO_INVOKED_CHARSET (coding, 0)                 \
        && id != CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        /* CHARSET is not reachable through either graphic plane        \
           yet.  Produce the designation and/or invocation it           \
           needs, then go around again to emit the character.  */       \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A pending single-shift covers exactly this character.  */    \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    else                                                                \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    break;                                                              \
  } while (1)
4128
4129
/* Encode character C of CHARSET: compute its code with
   CODING_ENCODE_CHAR, then emit it as a one-byte code or, for any
   other dimension, as the two bytes `code >> 8' and `code & 0xFF'.
   CODING, DST and DST_END are taken from the scope of the
   expansion.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4140
4141
4142 /* Produce designation and invocation codes at a place pointed by DST
4143    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4144    Return new DST.  */
4145
4146 static unsigned char *
4147 encode_invocation_designation (struct charset *charset,
4148                                struct coding_system *coding,
4149                                unsigned char *dst, ptrdiff_t *p_nchars)
4150 {
4151   bool multibytep = coding->dst_multibyte;
4152   ptrdiff_t produced_chars = *p_nchars;
4153   int reg;                      /* graphic register number */
4154   int id = CHARSET_ID (charset);
4155
4156   /* At first, check designations.  */
4157   for (reg = 0; reg < 4; reg++)
4158     if (id == CODING_ISO_DESIGNATION (coding, reg))
4159       break;
4160
4161   if (reg >= 4)
4162     {
4163       /* CHARSET is not yet designated to any graphic registers.  */
4164       /* At first check the requested designation.  */
4165       reg = CODING_ISO_REQUEST (coding, id);
4166       if (reg < 0)
4167         /* Since CHARSET requests no special designation, designate it
4168            to graphic register 0.  */
4169         reg = 0;
4170
4171       ENCODE_DESIGNATION (charset, reg, coding);
4172     }
4173
4174   if (CODING_ISO_INVOCATION (coding, 0) != reg
4175       && CODING_ISO_INVOCATION (coding, 1) != reg)
4176     {
4177       /* Since the graphic register REG is not invoked to any graphic
4178          planes, invoke it to graphic plane 0.  */
4179       switch (reg)
4180         {
4181         case 0:                 /* graphic register 0 */
4182           ENCODE_SHIFT_IN;
4183           break;
4184
4185         case 1:                 /* graphic register 1 */
4186           ENCODE_SHIFT_OUT;
4187           break;
4188
4189         case 2:                 /* graphic register 2 */
4190           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4191             ENCODE_SINGLE_SHIFT_2;
4192           else
4193             ENCODE_LOCKING_SHIFT_2;
4194           break;
4195
4196         case 3:                 /* graphic register 3 */
4197           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4198             ENCODE_SINGLE_SHIFT_3;
4199           else
4200             ENCODE_LOCKING_SHIFT_3;
4201           break;
4202         }
4203     }
4204
4205   *p_nchars = produced_chars;
4206   return dst;
4207 }
4208
4209
4210 /* Produce codes for designation and invocation to reset the graphic
4211    planes and registers to initial state.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    /* Bring graphic register 0 back onto graphic plane 0.  */          \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
                                                                        \
    /* Re-designate every register whose current charset differs        \
       from its initial one.  Registers with no initial charset         \
       (negative id) are left alone.  */                                \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        if (CODING_ISO_INITIAL (coding, reg) < 0)                       \
          continue;                                                     \
        if (CODING_ISO_DESIGNATION (coding, reg)                        \
            == CODING_ISO_INITIAL (coding, reg))                        \
          continue;                                                     \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4228
4229
/* Produce designation sequences for the charsets in the line starting
   at CHARBUF, store them at the place pointed to by DST, and return
   the number of produced bytes.  DST should not point directly into a
   buffer text area, because that area may be relocated by a
   char_charset call.

   If the current block ends before any end-of-line, we may fail to
   find all the necessary designations.  */
4237
4238 static ptrdiff_t
4239 encode_designation_at_bol (struct coding_system *coding,
4240                            int *charbuf, int *charbuf_end,
4241                            unsigned char *dst)
4242 {
4243   unsigned char *orig = dst;
4244   struct charset *charset;
4245   /* Table of charsets to be designated to each graphic register.  */
4246   int r[4];
4247   int c, found = 0, reg;
4248   ptrdiff_t produced_chars = 0;
4249   bool multibytep = coding->dst_multibyte;
4250   Lisp_Object attrs;
4251   Lisp_Object charset_list;
4252
4253   attrs = CODING_ID_ATTRS (coding->id);
4254   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4255   if (EQ (charset_list, Qiso_2022))
4256     charset_list = Viso_2022_charset_list;
4257
4258   for (reg = 0; reg < 4; reg++)
4259     r[reg] = -1;
4260
4261   while (charbuf < charbuf_end && found < 4)
4262     {
4263       int id;
4264
4265       c = *charbuf++;
4266       if (c == '\n')
4267         break;
4268       charset = char_charset (c, charset_list, NULL);
4269       id = CHARSET_ID (charset);
4270       reg = CODING_ISO_REQUEST (coding, id);
4271       if (reg >= 0 && r[reg] < 0)
4272         {
4273           found++;
4274           r[reg] = id;
4275         }
4276     }
4277
4278   if (found)
4279     {
4280       for (reg = 0; reg < 4; reg++)
4281         if (r[reg] >= 0
4282             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4283           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4284     }
4285
4286   return dst - orig;
4287 }
4288
4289 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4290
/* Encode the characters in CODING->charbuf (CODING->charbuf_used
   elements) as ISO 2022 bytes, writing them at CODING->destination
   starting at offset CODING->produced.  On return CODING->produced,
   CODING->produced_char and the beginning-of-line designation state
   are updated.  Always returns 0.

   Several locals (MULTIBYTEP, CHARBUF, CHARBUF_END, DST, DST_END,
   PRODUCED_CHARS) are used by macros defined elsewhere in this file;
   do not rename them without checking those macros.  */
static bool
encode_coding_iso_2022 (struct coding_system *coding)
{
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Room requested from ASSURE_DESTINATION before each character.  */
  int safe_room = 16;
  /* True when designation sequences must be produced before the next
     character, i.e. we are at a beginning of line and the coding
     system designates at BOL.  */
  bool bol_designation
    = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
       && CODING_ISO_BOL (coding));
  ptrdiff_t produced_chars = 0;
  Lisp_Object attrs, eol_type, charset_list;
  bool ascii_compatible;
  int c;
  /* Charset id taken from the most recent charset annotation, or -1
     when there is none or it is not in CHARSET_LIST.  */
  int preferred_charset_id = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  /* A vector EOL type is treated as Unix here.  */
  if (VECTORP (eol_type))
    eol_type = Qunix;

  setup_iso_safe_charsets (attrs);
  /* Charset list may have been changed.  */
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));

  /* ASCII may be emitted verbatim only when the coding system is
     declared ASCII compatible and uses neither designation nor
     locking shift.  */
  ascii_compatible
    = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
       && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
                                          | CODING_ISO_FLAG_LOCKING_SHIFT)));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);

      if (bol_designation)
        {
          /* We have to produce designation sequences if any now.  */
          /* NOTE(review): ENCODE_DESIGNATION can emit up to seven bytes
             per register when a revision number is announced, and
             encode_designation_at_bol may designate four registers, so
             16 bytes (here and in SAFE_ROOM) looks too small for that
             combination -- confirm whether it can occur.  */
          unsigned char desig_buf[16];
          int nbytes;
          ptrdiff_t offset;

          /* The sequences go to a local buffer first because
             encode_designation_at_bol may load a charset map; when it
             does, re-base DST and DST_END by the offset reported by
             coding_change_destination before copying.  */
          charset_map_loaded = 0;
          nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
                                              desig_buf);
          if (charset_map_loaded
              && (offset = coding_change_destination (coding)))
            {
              dst += offset;
              dst_end += offset;
            }
          memcpy (dst, desig_buf, nbytes);
          dst += nbytes;
          /* We are sure that designation sequences are all ASCII bytes.  */
          produced_chars += nbytes;
          bol_designation = 0;
          ASSURE_DESTINATION (safe_room);
        }

      c = *charbuf++;

      if (c < 0)
        {
          /* Handle an annotation.  */
          /* C is the negated annotation length; CHARBUF now points at
             the element after it, which holds the annotation kind.  */
          switch (*charbuf)
            {
            case CODING_ANNOTATE_COMPOSITION_MASK:
              /* Not yet implemented.  */
              break;
            case CODING_ANNOTATE_CHARSET_MASK:
              preferred_charset_id = charbuf[2];
              /* Ignore a preferred charset this coding system does not
                 list.  */
              if (preferred_charset_id >= 0
                  && NILP (Fmemq (make_number (preferred_charset_id),
                                  charset_list)))
                preferred_charset_id = -1;
              break;
            default:
              emacs_abort ();
            }
          /* Skip the rest of the annotation (-C elements in total,
             one of which was already consumed above).  */
          charbuf += -c - 1;
          continue;
        }

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          /* Control character.  */
          if (c == '\n'
              || (c == '\r' && EQ (eol_type, Qmac)))
            {
              /* End of line.  */
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER ();
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
                {
                  int i;

                  /* Reset the recorded designations to the initial
                     ones without emitting anything.  */
                  for (i = 0; i < 4; i++)
                    CODING_ISO_DESIGNATION (coding, i)
                      = CODING_ISO_INITIAL (coding, i);
                }
              bol_designation = ((CODING_ISO_FLAGS (coding)
                                  & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
                                 != 0);
            }
          else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER ();
          EMIT_ONE_ASCII_BYTE (c);
        }
      else if (ASCII_CHAR_P (c))
        {
          if (ascii_compatible)
            EMIT_ONE_ASCII_BYTE (c);
          else
            {
              struct charset *charset = CHARSET_FROM_ID (charset_ascii);
              ENCODE_ISO_CHARACTER (charset, c);
            }
        }
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is written through unchanged.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          struct charset *charset;

          /* Try the preferred charset first; fall back to searching
             CHARSET_LIST when it does not contain C.  */
          if (preferred_charset_id >= 0)
            {
              bool result;

              charset = CHARSET_FROM_ID (preferred_charset_id);
              CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
              if (! result)
                CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                                     NULL, charset);
            }
          else
            CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                                 NULL, charset);
          if (!charset)
            {
              /* No listed charset contains C: substitute a fixed
                 marker character in safe-encoding mode, otherwise the
                 coding system's default character.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  CODING_CHAR_CHARSET (coding, dst, dst_end, c,
                                       charset_list, NULL, charset);
                }
            }
          ENCODE_ISO_CHARACTER (charset, c);
        }
    }

  /* At the end of the last block, return to the initial state if the
     coding system resets at end of line.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
    {
      ASSURE_DESTINATION (safe_room);
      ENCODE_RESET_PLANE_AND_REGISTER ();
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  /* Carry the pending-BOL-designation state over to the next call.  */
  CODING_ISO_BOL (coding) = bol_designation;
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
4462
4463 \f
/*** 8. SJIS and BIG5 handlers ***/
4465
4466 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4467    quite widely.  So, for the moment, Emacs supports them in the bare
4468    C code.  But, in the future, they may be supported only by CCL.  */
4469
4470 /* SJIS is a coding system encoding three character sets: ASCII, right
4471    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4472    as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but its two position-codes are divided and
   shifted so that they fit in the ranges below.
4476
4477    --- CODE RANGE of SJIS ---
4478    (character set)      (range)
4479    ASCII                0x00 .. 0x7F
4480    KATAKANA-JISX0201    0xA0 .. 0xDF
4481    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4482             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4483    -------------------------------
4484
4485 */
4486
4487 /* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
4490
4491    --- CODE RANGE of BIG5 ---
4492    (character set)      (range)
4493    ASCII                0x00 .. 0x7F
4494    Big5 (1st byte)      0xA1 .. 0xFE
4495         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4496    --------------------------
4497
4498   */
4499
4500 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4501    Return true if a text is encoded in SJIS.  */
4502
4503 static bool
4504 detect_coding_sjis (struct coding_system *coding,
4505                     struct coding_detection_info *detect_info)
4506 {
4507   const unsigned char *src = coding->source, *src_base;
4508   const unsigned char *src_end = coding->source + coding->src_bytes;
4509   bool multibytep = coding->src_multibyte;
4510   ptrdiff_t consumed_chars = 0;
4511   int found = 0;
4512   int c;
4513   Lisp_Object attrs, charset_list;
4514   int max_first_byte_of_2_byte_code;
4515
4516   CODING_GET_INFO (coding, attrs, charset_list);
4517   max_first_byte_of_2_byte_code
4518     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4519
4520   detect_info->checked |= CATEGORY_MASK_SJIS;
4521   /* A coding system of this category is always ASCII compatible.  */
4522   src += coding->head_ascii;
4523
4524   while (1)
4525     {
4526       src_base = src;
4527       ONE_MORE_BYTE (c);
4528       if (c < 0x80)
4529         continue;
4530       if ((c >= 0x81 && c <= 0x9F)
4531           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4532         {
4533           ONE_MORE_BYTE (c);
4534           if (c < 0x40 || c == 0x7F || c > 0xFC)
4535             break;
4536           found = CATEGORY_MASK_SJIS;
4537         }
4538       else if (c >= 0xA0 && c < 0xE0)
4539         found = CATEGORY_MASK_SJIS;
4540       else
4541         break;
4542     }
4543   detect_info->rejected |= CATEGORY_MASK_SJIS;
4544   return 0;
4545
4546  no_more_source:
4547   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4548     {
4549       detect_info->rejected |= CATEGORY_MASK_SJIS;
4550       return 0;
4551     }
4552   detect_info->found |= found;
4553   return 1;
4554 }
4555
4556 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4557    Return true if a text is encoded in BIG5.  */
4558
4559 static bool
4560 detect_coding_big5 (struct coding_system *coding,
4561                     struct coding_detection_info *detect_info)
4562 {
4563   const unsigned char *src = coding->source, *src_base;
4564   const unsigned char *src_end = coding->source + coding->src_bytes;
4565   bool multibytep = coding->src_multibyte;
4566   ptrdiff_t consumed_chars = 0;
4567   int found = 0;
4568   int c;
4569
4570   detect_info->checked |= CATEGORY_MASK_BIG5;
4571   /* A coding system of this category is always ASCII compatible.  */
4572   src += coding->head_ascii;
4573
4574   while (1)
4575     {
4576       src_base = src;
4577       ONE_MORE_BYTE (c);
4578       if (c < 0x80)
4579         continue;
4580       if (c >= 0xA1)
4581         {
4582           ONE_MORE_BYTE (c);
4583           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4584             return 0;
4585           found = CATEGORY_MASK_BIG5;
4586         }
4587       else
4588         break;
4589     }
4590   detect_info->rejected |= CATEGORY_MASK_BIG5;
4591   return 0;
4592
4593  no_more_source:
4594   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4595     {
4596       detect_info->rejected |= CATEGORY_MASK_BIG5;
4597       return 0;
4598     }
4599   detect_info->found |= found;
4600   return 1;
4601 }
4602
4603 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4604
4605 static void
4606 decode_coding_sjis (struct coding_system *coding)
4607 {
4608   const unsigned char *src = coding->source + coding->consumed;
4609   const unsigned char *src_end = coding->source + coding->src_bytes;
4610   const unsigned char *src_base;
4611   int *charbuf = coding->charbuf + coding->charbuf_used;
4612   /* We may produce one charset annotation in one loop and one more at
4613      the end.  */
4614   int *charbuf_end
4615     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4616   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4617   bool multibytep = coding->src_multibyte;
4618   struct charset *charset_roman, *charset_kanji, *charset_kana;
4619   struct charset *charset_kanji2;
4620   Lisp_Object attrs, charset_list, val;
4621   ptrdiff_t char_offset = coding->produced_char;
4622   ptrdiff_t last_offset = char_offset;
4623   int last_id = charset_ascii;
4624   bool eol_dos
4625     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4626   int byte_after_cr = -1;
4627
4628   CODING_GET_INFO (coding, attrs, charset_list);
4629
4630   val = charset_list;
4631   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4632   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4633   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4634   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4635
4636   while (1)
4637     {
4638       int c, c1;
4639       struct charset *charset;
4640
4641       src_base = src;
4642       consumed_chars_base = consumed_chars;
4643
4644       if (charbuf >= charbuf_end)
4645         {
4646           if (byte_after_cr >= 0)
4647             src_base--;
4648           break;
4649         }
4650
4651       if (byte_after_cr >= 0)
4652         c = byte_after_cr, byte_after_cr = -1;
4653       else
4654         ONE_MORE_BYTE (c);
4655       if (c < 0)
4656         goto invalid_code;
4657       if (c < 0x80)
4658         {
4659           if (eol_dos && c == '\r')
4660             ONE_MORE_BYTE (byte_after_cr);
4661           charset = charset_roman;
4662         }
4663       else if (c == 0x80 || c == 0xA0)
4664         goto invalid_code;
4665       else if (c >= 0xA1 && c <= 0xDF)
4666         {
4667           /* SJIS -> JISX0201-Kana */
4668           c &= 0x7F;
4669           charset = charset_kana;
4670         }
4671       else if (c <= 0xEF)
4672         {
4673           /* SJIS -> JISX0208 */
4674           ONE_MORE_BYTE (c1);
4675           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4676             goto invalid_code;
4677           c = (c << 8) | c1;
4678           SJIS_TO_JIS (c);
4679           charset = charset_kanji;
4680         }
4681       else if (c <= 0xFC && charset_kanji2)
4682         {
4683           /* SJIS -> JISX0213-2 */
4684           ONE_MORE_BYTE (c1);
4685           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4686             goto invalid_code;
4687           c = (c << 8) | c1;
4688           SJIS_TO_JIS2 (c);
4689           charset = charset_kanji2;
4690         }
4691       else
4692         goto invalid_code;
4693       if (charset->id != charset_ascii
4694           && last_id != charset->id)
4695         {
4696           if (last_id != charset_ascii)
4697             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4698           last_id = charset->id;
4699           last_offset = char_offset;
4700         }
4701       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4702       *charbuf++ = c;
4703       char_offset++;
4704       continue;
4705
4706     invalid_code:
4707       src = src_base;
4708       consumed_chars = consumed_chars_base;
4709       ONE_MORE_BYTE (c);
4710       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4711       char_offset++;
4712       coding->errors++;
4713     }
4714
4715  no_more_source:
4716   if (last_id != charset_ascii)
4717     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4718   coding->consumed_char += consumed_chars_base;
4719   coding->consumed = src_base - coding->source;
4720   coding->charbuf_used = charbuf - coding->charbuf;
4721 }
4722
4723 static void
4724 decode_coding_big5 (struct coding_system *coding)
4725 {
4726   const unsigned char *src = coding->source + coding->consumed;
4727   const unsigned char *src_end = coding->source + coding->src_bytes;
4728   const unsigned char *src_base;
4729   int *charbuf = coding->charbuf + coding->charbuf_used;
4730   /* We may produce one charset annotation in one loop and one more at
4731      the end.  */
4732   int *charbuf_end
4733     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4734   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4735   bool multibytep = coding->src_multibyte;
4736   struct charset *charset_roman, *charset_big5;
4737   Lisp_Object attrs, charset_list, val;
4738   ptrdiff_t char_offset = coding->produced_char;
4739   ptrdiff_t last_offset = char_offset;
4740   int last_id = charset_ascii;
4741   bool eol_dos
4742     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4743   int byte_after_cr = -1;
4744
4745   CODING_GET_INFO (coding, attrs, charset_list);
4746   val = charset_list;
4747   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4748   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4749
4750   while (1)
4751     {
4752       int c, c1;
4753       struct charset *charset;
4754
4755       src_base = src;
4756       consumed_chars_base = consumed_chars;
4757
4758       if (charbuf >= charbuf_end)
4759         {
4760           if (byte_after_cr >= 0)
4761             src_base--;
4762           break;
4763         }
4764
4765       if (byte_after_cr >= 0)
4766         c = byte_after_cr, byte_after_cr = -1;
4767       else
4768         ONE_MORE_BYTE (c);
4769
4770       if (c < 0)
4771         goto invalid_code;
4772       if (c < 0x80)
4773         {
4774           if (eol_dos && c == '\r')
4775             ONE_MORE_BYTE (byte_after_cr);
4776           charset = charset_roman;
4777         }
4778       else
4779         {
4780           /* BIG5 -> Big5 */
4781           if (c < 0xA1 || c > 0xFE)
4782             goto invalid_code;
4783           ONE_MORE_BYTE (c1);
4784           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4785             goto invalid_code;
4786           c = c << 8 | c1;
4787           charset = charset_big5;
4788         }
4789       if (charset->id != charset_ascii
4790           && last_id != charset->id)
4791         {
4792           if (last_id != charset_ascii)
4793             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4794           last_id = charset->id;
4795           last_offset = char_offset;
4796         }
4797       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4798       *charbuf++ = c;
4799       char_offset++;
4800       continue;
4801
4802     invalid_code:
4803       src = src_base;
4804       consumed_chars = consumed_chars_base;
4805       ONE_MORE_BYTE (c);
4806       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4807       char_offset++;
4808       coding->errors++;
4809     }
4810
4811  no_more_source:
4812   if (last_id != charset_ascii)
4813     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4814   coding->consumed_char += consumed_chars_base;
4815   coding->consumed = src_base - coding->source;
4816   coding->charbuf_used = charbuf - coding->charbuf;
4817 }
4818
4819 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4820    This function can encode charsets `ascii', `katakana-jisx0201',
4821    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4822    are sure that all these charsets are registered as official charset
4823    (i.e. do not have extended leading-codes).  Characters of other
4824    charsets are produced without any encoding.  */
4825
4826 static bool
4827 encode_coding_sjis (struct coding_system *coding)
4828 {
4829   bool multibytep = coding->dst_multibyte;
4830   int *charbuf = coding->charbuf;
4831   int *charbuf_end = charbuf + coding->charbuf_used;
4832   unsigned char *dst = coding->destination + coding->produced;
4833   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4834   int safe_room = 4;
4835   ptrdiff_t produced_chars = 0;
4836   Lisp_Object attrs, charset_list, val;
4837   bool ascii_compatible;
4838   struct charset *charset_kanji, *charset_kana;
4839   struct charset *charset_kanji2;
4840   int c;
4841
4842   CODING_GET_INFO (coding, attrs, charset_list);
4843   val = XCDR (charset_list);
4844   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4845   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4846   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4847
4848   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4849
4850   while (charbuf < charbuf_end)
4851     {
4852       ASSURE_DESTINATION (safe_room);
4853       c = *charbuf++;
4854       /* Now encode the character C.  */
4855       if (ASCII_CHAR_P (c) && ascii_compatible)
4856         EMIT_ONE_ASCII_BYTE (c);
4857       else if (CHAR_BYTE8_P (c))
4858         {
4859           c = CHAR_TO_BYTE8 (c);
4860           EMIT_ONE_BYTE (c);
4861         }
4862       else
4863         {
4864           unsigned code;
4865           struct charset *charset;
4866           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4867                                &code, charset);
4868
4869           if (!charset)
4870             {
4871               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4872                 {
4873                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4874                   charset = CHARSET_FROM_ID (charset_ascii);
4875                 }
4876               else
4877                 {
4878                   c = coding->default_char;
4879                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4880                                        charset_list, &code, charset);
4881                 }
4882             }
4883           if (code == CHARSET_INVALID_CODE (charset))
4884             emacs_abort ();
4885           if (charset == charset_kanji)
4886             {
4887               int c1, c2;
4888               JIS_TO_SJIS (code);
4889               c1 = code >> 8, c2 = code & 0xFF;
4890               EMIT_TWO_BYTES (c1, c2);
4891             }
4892           else if (charset == charset_kana)
4893             EMIT_ONE_BYTE (code | 0x80);
4894           else if (charset_kanji2 && charset == charset_kanji2)
4895             {
4896               int c1, c2;
4897
4898               c1 = code >> 8;
4899               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4900                   || c1 == 0x28
4901                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4902                 {
4903                   JIS_TO_SJIS2 (code);
4904                   c1 = code >> 8, c2 = code & 0xFF;
4905                   EMIT_TWO_BYTES (c1, c2);
4906                 }
4907               else
4908                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4909             }
4910           else
4911             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4912         }
4913     }
4914   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4915   coding->produced_char += produced_chars;
4916   coding->produced = dst - coding->destination;
4917   return 0;
4918 }
4919
4920 static bool
4921 encode_coding_big5 (struct coding_system *coding)
4922 {
4923   bool multibytep = coding->dst_multibyte;
4924   int *charbuf = coding->charbuf;
4925   int *charbuf_end = charbuf + coding->charbuf_used;
4926   unsigned char *dst = coding->destination + coding->produced;
4927   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4928   int safe_room = 4;
4929   ptrdiff_t produced_chars = 0;
4930   Lisp_Object attrs, charset_list, val;
4931   bool ascii_compatible;
4932   struct charset *charset_big5;
4933   int c;
4934
4935   CODING_GET_INFO (coding, attrs, charset_list);
4936   val = XCDR (charset_list);
4937   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4938   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4939
4940   while (charbuf < charbuf_end)
4941     {
4942       ASSURE_DESTINATION (safe_room);
4943       c = *charbuf++;
4944       /* Now encode the character C.  */
4945       if (ASCII_CHAR_P (c) && ascii_compatible)
4946         EMIT_ONE_ASCII_BYTE (c);
4947       else if (CHAR_BYTE8_P (c))
4948         {
4949           c = CHAR_TO_BYTE8 (c);
4950           EMIT_ONE_BYTE (c);
4951         }
4952       else
4953         {
4954           unsigned code;
4955           struct charset *charset;
4956           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4957                                &code, charset);
4958
4959           if (! charset)
4960             {
4961               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4962                 {
4963                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4964                   charset = CHARSET_FROM_ID (charset_ascii);
4965                 }
4966               else
4967                 {
4968                   c = coding->default_char;
4969                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4970                                        charset_list, &code, charset);
4971                 }
4972             }
4973           if (code == CHARSET_INVALID_CODE (charset))
4974             emacs_abort ();
4975           if (charset == charset_big5)
4976             {
4977               int c1, c2;
4978
4979               c1 = code >> 8, c2 = code & 0xFF;
4980               EMIT_TWO_BYTES (c1, c2);
4981             }
4982           else
4983             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4984         }
4985     }
4986   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4987   coding->produced_char += produced_chars;
4988   coding->produced = dst - coding->destination;
4989   return 0;
4990 }
4991
4992 \f
4993 /*** 10. CCL handlers ***/
4994
4995 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4996    Return true if a text is encoded in a coding system of which
4997    encoder/decoder are written in CCL program.  */
4998
4999 static bool
5000 detect_coding_ccl (struct coding_system *coding,
5001                    struct coding_detection_info *detect_info)
5002 {
5003   const unsigned char *src = coding->source, *src_base;
5004   const unsigned char *src_end = coding->source + coding->src_bytes;
5005   bool multibytep = coding->src_multibyte;
5006   ptrdiff_t consumed_chars = 0;
5007   int found = 0;
5008   unsigned char *valids;
5009   ptrdiff_t head_ascii = coding->head_ascii;
5010   Lisp_Object attrs;
5011
5012   detect_info->checked |= CATEGORY_MASK_CCL;
5013
5014   coding = &coding_categories[coding_category_ccl];
5015   valids = CODING_CCL_VALIDS (coding);
5016   attrs = CODING_ID_ATTRS (coding->id);
5017   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5018     src += head_ascii;
5019
5020   while (1)
5021     {
5022       int c;
5023
5024       src_base = src;
5025       ONE_MORE_BYTE (c);
5026       if (c < 0 || ! valids[c])
5027         break;
5028       if ((valids[c] > 1))
5029         found = CATEGORY_MASK_CCL;
5030     }
5031   detect_info->rejected |= CATEGORY_MASK_CCL;
5032   return 0;
5033
5034  no_more_source:
5035   detect_info->found |= found;
5036   return 1;
5037 }
5038
5039 static void
5040 decode_coding_ccl (struct coding_system *coding)
5041 {
5042   const unsigned char *src = coding->source + coding->consumed;
5043   const unsigned char *src_end = coding->source + coding->src_bytes;
5044   int *charbuf = coding->charbuf + coding->charbuf_used;
5045   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5046   ptrdiff_t consumed_chars = 0;
5047   bool multibytep = coding->src_multibyte;
5048   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5049   int source_charbuf[1024];
5050   int source_byteidx[1025];
5051   Lisp_Object attrs, charset_list;
5052
5053   CODING_GET_INFO (coding, attrs, charset_list);
5054
5055   while (1)
5056     {
5057       const unsigned char *p = src;
5058       ptrdiff_t offset;
5059       int i = 0;
5060
5061       if (multibytep)
5062         {
5063           while (i < 1024 && p < src_end)
5064             {
5065               source_byteidx[i] = p - src;
5066               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5067             }
5068           source_byteidx[i] = p - src;
5069         }
5070       else
5071         while (i < 1024 && p < src_end)
5072           source_charbuf[i++] = *p++;
5073
5074       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5075         ccl->last_block = 1;
5076       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5077       charset_map_loaded = 0;
5078       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5079                   charset_list);
5080       if (charset_map_loaded
5081           && (offset = coding_change_source (coding)))
5082         {
5083           p += offset;
5084           src += offset;
5085           src_end += offset;
5086         }
5087       charbuf += ccl->produced;
5088       if (multibytep)
5089         src += source_byteidx[ccl->consumed];
5090       else
5091         src += ccl->consumed;
5092       consumed_chars += ccl->consumed;
5093       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5094         break;
5095     }
5096
5097   switch (ccl->status)
5098     {
5099     case CCL_STAT_SUSPEND_BY_SRC:
5100       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5101       break;
5102     case CCL_STAT_SUSPEND_BY_DST:
5103       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5104       break;
5105     case CCL_STAT_QUIT:
5106     case CCL_STAT_INVALID_CMD:
5107       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5108       break;
5109     default:
5110       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5111       break;
5112     }
5113   coding->consumed_char += consumed_chars;
5114   coding->consumed = src - coding->source;
5115   coding->charbuf_used = charbuf - coding->charbuf;
5116 }
5117
5118 static bool
5119 encode_coding_ccl (struct coding_system *coding)
5120 {
5121   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5122   bool multibytep = coding->dst_multibyte;
5123   int *charbuf = coding->charbuf;
5124   int *charbuf_end = charbuf + coding->charbuf_used;
5125   unsigned char *dst = coding->destination + coding->produced;
5126   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5127   int destination_charbuf[1024];
5128   ptrdiff_t produced_chars = 0;
5129   int i;
5130   Lisp_Object attrs, charset_list;
5131
5132   CODING_GET_INFO (coding, attrs, charset_list);
5133   if (coding->consumed_char == coding->src_chars
5134       && coding->mode & CODING_MODE_LAST_BLOCK)
5135     ccl->last_block = 1;
5136
5137   do
5138     {
5139       ptrdiff_t offset;
5140
5141       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5142       charset_map_loaded = 0;
5143       ccl_driver (ccl, charbuf, destination_charbuf,
5144                   charbuf_end - charbuf, 1024, charset_list);
5145       if (charset_map_loaded
5146           && (offset = coding_change_destination (coding)))
5147         dst += offset;
5148       if (multibytep)
5149         {
5150           ASSURE_DESTINATION (ccl->produced * 2);
5151           for (i = 0; i < ccl->produced; i++)
5152             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5153         }
5154       else
5155         {
5156           ASSURE_DESTINATION (ccl->produced);
5157           for (i = 0; i < ccl->produced; i++)
5158             *dst++ = destination_charbuf[i] & 0xFF;
5159           produced_chars += ccl->produced;
5160         }
5161       charbuf += ccl->consumed;
5162       if (ccl->status == CCL_STAT_QUIT
5163           || ccl->status == CCL_STAT_INVALID_CMD)
5164         break;
5165     }
5166   while (charbuf < charbuf_end);
5167
5168   switch (ccl->status)
5169     {
5170     case CCL_STAT_SUSPEND_BY_SRC:
5171       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5172       break;
5173     case CCL_STAT_SUSPEND_BY_DST:
5174       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5175       break;
5176     case CCL_STAT_QUIT:
5177     case CCL_STAT_INVALID_CMD:
5178       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5179       break;
5180     default:
5181       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5182       break;
5183     }
5184
5185   coding->produced_char += produced_chars;
5186   coding->produced = dst - coding->destination;
5187   return 0;
5188 }
5189
5190 \f
5191 /*** 10, 11. no-conversion handlers ***/
5192
5193 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5194
5195 static void
5196 decode_coding_raw_text (struct coding_system *coding)
5197 {
5198   bool eol_dos
5199     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5200
5201   coding->chars_at_source = 1;
5202   coding->consumed_char = coding->src_chars;
5203   coding->consumed = coding->src_bytes;
5204   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5205     {
5206       coding->consumed_char--;
5207       coding->consumed--;
5208       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5209     }
5210   else
5211     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5212 }
5213
5214 static bool
5215 encode_coding_raw_text (struct coding_system *coding)
5216 {
5217   bool multibytep = coding->dst_multibyte;
5218   int *charbuf = coding->charbuf;
5219   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5220   unsigned char *dst = coding->destination + coding->produced;
5221   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5222   ptrdiff_t produced_chars = 0;
5223   int c;
5224
5225   if (multibytep)
5226     {
5227       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5228
5229       if (coding->src_multibyte)
5230         while (charbuf < charbuf_end)
5231           {
5232             ASSURE_DESTINATION (safe_room);
5233             c = *charbuf++;
5234             if (ASCII_CHAR_P (c))
5235               EMIT_ONE_ASCII_BYTE (c);
5236             else if (CHAR_BYTE8_P (c))
5237               {
5238                 c = CHAR_TO_BYTE8 (c);
5239                 EMIT_ONE_BYTE (c);
5240               }
5241             else
5242               {
5243                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5244
5245                 CHAR_STRING_ADVANCE (c, p1);
5246                 do
5247                   {
5248                     EMIT_ONE_BYTE (*p0);
5249                     p0++;
5250                   }
5251                 while (p0 < p1);
5252               }
5253           }
5254       else
5255         while (charbuf < charbuf_end)
5256           {
5257             ASSURE_DESTINATION (safe_room);
5258             c = *charbuf++;
5259             EMIT_ONE_BYTE (c);
5260           }
5261     }
5262   else
5263     {
5264       if (coding->src_multibyte)
5265         {
5266           int safe_room = MAX_MULTIBYTE_LENGTH;
5267
5268           while (charbuf < charbuf_end)
5269             {
5270               ASSURE_DESTINATION (safe_room);
5271               c = *charbuf++;
5272               if (ASCII_CHAR_P (c))
5273                 *dst++ = c;
5274               else if (CHAR_BYTE8_P (c))
5275                 *dst++ = CHAR_TO_BYTE8 (c);
5276               else
5277                 CHAR_STRING_ADVANCE (c, dst);
5278             }
5279         }
5280       else
5281         {
5282           ASSURE_DESTINATION (charbuf_end - charbuf);
5283           while (charbuf < charbuf_end && dst < dst_end)
5284             *dst++ = *charbuf++;
5285         }
5286       produced_chars = dst - (coding->destination + coding->produced);
5287     }
5288   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5289   coding->produced_char += produced_chars;
5290   coding->produced = dst - coding->destination;
5291   return 0;
5292 }
5293
5294 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5295    Return true if a text is encoded in a charset-based coding system.  */
5296
5297 static bool
5298 detect_coding_charset (struct coding_system *coding,
5299                        struct coding_detection_info *detect_info)
5300 {
5301   const unsigned char *src = coding->source, *src_base;
5302   const unsigned char *src_end = coding->source + coding->src_bytes;
5303   bool multibytep = coding->src_multibyte;
5304   ptrdiff_t consumed_chars = 0;
5305   Lisp_Object attrs, valids, name;
5306   int found = 0;
5307   ptrdiff_t head_ascii = coding->head_ascii;
5308   bool check_latin_extra = 0;
5309
5310   detect_info->checked |= CATEGORY_MASK_CHARSET;
5311
5312   coding = &coding_categories[coding_category_charset];
5313   attrs = CODING_ID_ATTRS (coding->id);
5314   valids = AREF (attrs, coding_attr_charset_valids);
5315   name = CODING_ID_NAME (coding->id);
5316   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5317                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5318       || strncmp (SSDATA (SYMBOL_NAME (name)),
5319                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5320     check_latin_extra = 1;
5321
5322   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5323     src += head_ascii;
5324
5325   while (1)
5326     {
5327       int c;
5328       Lisp_Object val;
5329       struct charset *charset;
5330       int dim, idx;
5331
5332       src_base = src;
5333       ONE_MORE_BYTE (c);
5334       if (c < 0)
5335         continue;
5336       val = AREF (valids, c);
5337       if (NILP (val))
5338         break;
5339       if (c >= 0x80)
5340         {
5341           if (c < 0xA0
5342               && check_latin_extra
5343               && (!VECTORP (Vlatin_extra_code_table)
5344                   || NILP (AREF (Vlatin_extra_code_table, c))))
5345             break;
5346           found = CATEGORY_MASK_CHARSET;
5347         }
5348       if (INTEGERP (val))
5349         {
5350           charset = CHARSET_FROM_ID (XFASTINT (val));
5351           dim = CHARSET_DIMENSION (charset);
5352           for (idx = 1; idx < dim; idx++)
5353             {
5354               if (src == src_end)
5355                 goto too_short;
5356               ONE_MORE_BYTE (c);
5357               if (c < charset->code_space[(dim - 1 - idx) * 4]
5358                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5359                 break;
5360             }
5361           if (idx < dim)
5362             break;
5363         }
5364       else
5365         {
5366           idx = 1;
5367           for (; CONSP (val); val = XCDR (val))
5368             {
5369               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5370               dim = CHARSET_DIMENSION (charset);
5371               while (idx < dim)
5372                 {
5373                   if (src == src_end)
5374                     goto too_short;
5375                   ONE_MORE_BYTE (c);
5376                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5377                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5378                     break;
5379                   idx++;
5380                 }
5381               if (idx == dim)
5382                 {
5383                   val = Qnil;
5384                   break;
5385                 }
5386             }
5387           if (CONSP (val))
5388             break;
5389         }
5390     }
5391  too_short:
5392   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5393   return 0;
5394
5395  no_more_source:
5396   detect_info->found |= found;
5397   return 1;
5398 }
5399
5400 static void
5401 decode_coding_charset (struct coding_system *coding)
5402 {
5403   const unsigned char *src = coding->source + coding->consumed;
5404   const unsigned char *src_end = coding->source + coding->src_bytes;
5405   const unsigned char *src_base;
5406   int *charbuf = coding->charbuf + coding->charbuf_used;
5407   /* We may produce one charset annotation in one loop and one more at
5408      the end.  */
5409   int *charbuf_end
5410     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5411   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5412   bool multibytep = coding->src_multibyte;
5413   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5414   Lisp_Object valids;
5415   ptrdiff_t char_offset = coding->produced_char;
5416   ptrdiff_t last_offset = char_offset;
5417   int last_id = charset_ascii;
5418   bool eol_dos
5419     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5420   int byte_after_cr = -1;
5421
5422   valids = AREF (attrs, coding_attr_charset_valids);
5423
5424   while (1)
5425     {
5426       int c;
5427       Lisp_Object val;
5428       struct charset *charset;
5429       int dim;
5430       int len = 1;
5431       unsigned code;
5432
5433       src_base = src;
5434       consumed_chars_base = consumed_chars;
5435
5436       if (charbuf >= charbuf_end)
5437         {
5438           if (byte_after_cr >= 0)
5439             src_base--;
5440           break;
5441         }
5442
5443       if (byte_after_cr >= 0)
5444         {
5445           c = byte_after_cr;
5446           byte_after_cr = -1;
5447         }
5448       else
5449         {
5450           ONE_MORE_BYTE (c);
5451           if (eol_dos && c == '\r')
5452             ONE_MORE_BYTE (byte_after_cr);
5453         }
5454       if (c < 0)
5455         goto invalid_code;
5456       code = c;
5457
5458       val = AREF (valids, c);
5459       if (! INTEGERP (val) && ! CONSP (val))
5460         goto invalid_code;
5461       if (INTEGERP (val))
5462         {
5463           charset = CHARSET_FROM_ID (XFASTINT (val));
5464           dim = CHARSET_DIMENSION (charset);
5465           while (len < dim)
5466             {
5467               ONE_MORE_BYTE (c);
5468               code = (code << 8) | c;
5469               len++;
5470             }
5471           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5472                               charset, code, c);
5473         }
5474       else
5475         {
5476           /* VAL is a list of charset IDs.  It is assured that the
5477              list is sorted by charset dimensions (smaller one
5478              comes first).  */
5479           while (CONSP (val))
5480             {
5481               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5482               dim = CHARSET_DIMENSION (charset);
5483               while (len < dim)
5484                 {
5485                   ONE_MORE_BYTE (c);
5486                   code = (code << 8) | c;
5487                   len++;
5488                 }
5489               CODING_DECODE_CHAR (coding, src, src_base,
5490                                   src_end, charset, code, c);
5491               if (c >= 0)
5492                 break;
5493               val = XCDR (val);
5494             }
5495         }
5496       if (c < 0)
5497         goto invalid_code;
5498       if (charset->id != charset_ascii
5499           && last_id != charset->id)
5500         {
5501           if (last_id != charset_ascii)
5502             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5503           last_id = charset->id;
5504           last_offset = char_offset;
5505         }
5506
5507       *charbuf++ = c;
5508       char_offset++;
5509       continue;
5510
5511     invalid_code:
5512       src = src_base;
5513       consumed_chars = consumed_chars_base;
5514       ONE_MORE_BYTE (c);
5515       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5516       char_offset++;
5517       coding->errors++;
5518     }
5519
5520  no_more_source:
5521   if (last_id != charset_ascii)
5522     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5523   coding->consumed_char += consumed_chars_base;
5524   coding->consumed = src_base - coding->source;
5525   coding->charbuf_used = charbuf - coding->charbuf;
5526 }
5527
5528 static bool
5529 encode_coding_charset (struct coding_system *coding)
5530 {
5531   bool multibytep = coding->dst_multibyte;
5532   int *charbuf = coding->charbuf;
5533   int *charbuf_end = charbuf + coding->charbuf_used;
5534   unsigned char *dst = coding->destination + coding->produced;
5535   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5536   int safe_room = MAX_MULTIBYTE_LENGTH;
5537   ptrdiff_t produced_chars = 0;
5538   Lisp_Object attrs, charset_list;
5539   bool ascii_compatible;
5540   int c;
5541
5542   CODING_GET_INFO (coding, attrs, charset_list);
5543   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5544
5545   while (charbuf < charbuf_end)
5546     {
5547       struct charset *charset;
5548       unsigned code;
5549
5550       ASSURE_DESTINATION (safe_room);
5551       c = *charbuf++;
5552       if (ascii_compatible && ASCII_CHAR_P (c))
5553         EMIT_ONE_ASCII_BYTE (c);
5554       else if (CHAR_BYTE8_P (c))
5555         {
5556           c = CHAR_TO_BYTE8 (c);
5557           EMIT_ONE_BYTE (c);
5558         }
5559       else
5560         {
5561           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5562                                &code, charset);
5563
5564           if (charset)
5565             {
5566               if (CHARSET_DIMENSION (charset) == 1)
5567                 EMIT_ONE_BYTE (code);
5568               else if (CHARSET_DIMENSION (charset) == 2)
5569                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5570               else if (CHARSET_DIMENSION (charset) == 3)
5571                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5572               else
5573                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5574                                  (code >> 8) & 0xFF, code & 0xFF);
5575             }
5576           else
5577             {
5578               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5579                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5580               else
5581                 c = coding->default_char;
5582               EMIT_ONE_BYTE (c);
5583             }
5584         }
5585     }
5586
5587   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5588   coding->produced_char += produced_chars;
5589   coding->produced = dst - coding->destination;
5590   return 0;
5591 }
5592
5593 \f
/*** 10. C library functions ***/
5595
/* Setup coding context CODING from information about CODING_SYSTEM.
   If CODING_SYSTEM is nil, `undecided' is assumed.  If
   CODING_SYSTEM is invalid, signal an error.

   This resets CODING->mode, head_ascii, common_flags,
   max_charset_id, safe_charsets, default_char and carryover_bytes,
   and installs the detector, decoder and encoder functions that
   match the coding system's type, together with the type-specific
   initial state.  */

void
setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
{
  Lisp_Object attrs;
  Lisp_Object eol_type;
  Lisp_Object coding_type;
  Lisp_Object val;

  if (NILP (coding_system))
    coding_system = Qundecided;

  CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);

  attrs = CODING_ID_ATTRS (coding->id);
  /* With eol conversion inhibited, behave as if the eol type were
     unix.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);

  coding->mode = 0;
  coding->head_ascii = -1;
  /* Base flags derived from the eol type: a vector eol type (eol
     format not yet fixed) needs decoding and detection; a fixed
     non-unix eol type needs both decoding and encoding.  */
  if (VECTORP (eol_type))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_DETECTION_MASK);
  else if (! EQ (eol_type, Qunix))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_ENCODING_MASK);
  else
    coding->common_flags = 0;
  /* Post-read and pre-write attributes force the corresponding
     conversion direction to run.  */
  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
    coding->common_flags |= CODING_FOR_UNIBYTE_MASK;

  /* The safe-charsets attribute is a string; its length determines
     the maximum charset id and its data is referenced directly.  */
  val = CODING_ATTR_SAFE_CHARSETS (attrs);
  coding->max_charset_id = SCHARS (val) - 1;
  coding->safe_charsets = SDATA (val);
  coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
  coding->carryover_bytes = 0;

  /* Install the type-specific handlers and initial state.  */
  coding_type = CODING_ATTR_TYPE (attrs);
  if (EQ (coding_type, Qundecided))
    {
      /* No detector of its own: use the raw-text handlers and request
         detection.  */
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      int i;
      int flags = XINT (AREF (attrs, coding_attr_iso_flags));

      /* Invoke graphic register 0 to plane 0.  */
      CODING_ISO_INVOCATION (coding, 0) = 0;
      /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
      CODING_ISO_INVOCATION (coding, 1)
        = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
      /* Setup the initial status of designation.  */
      for (i = 0; i < 4; i++)
        CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
      /* Not single shifting initially.  */
      CODING_ISO_SINGLE_SHIFTING (coding) = 0;
      /* Beginning of buffer should also be regarded as bol. */
      CODING_ISO_BOL (coding) = 1;
      coding->detector = detect_coding_iso_2022;
      coding->decoder = decode_coding_iso_2022;
      coding->encoder = encode_coding_iso_2022;
      if (flags & CODING_ISO_FLAG_SAFE)
        coding->mode |= CODING_MODE_SAFE_ENCODING;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
      if (flags & CODING_ISO_FLAG_COMPOSITION)
        coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
      if (flags & CODING_ISO_FLAG_DESIGNATION)
        coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
      if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
        {
          /* Re-read the safe charsets after setup_iso_safe_charsets
             has run on ATTRS.  */
          setup_iso_safe_charsets (attrs);
          val = CODING_ATTR_SAFE_CHARSETS (attrs);
          coding->max_charset_id = SCHARS (val) - 1;
          coding->safe_charsets = SDATA (val);
        }
      CODING_ISO_FLAGS (coding) = flags;
      CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
      CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
      CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
      CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
    }
  else if (EQ (coding_type, Qcharset))
    {
      coding->detector = detect_coding_charset;
      coding->decoder = decode_coding_charset;
      coding->encoder = encode_coding_charset;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qutf_8))
    {
      /* BOM attribute: a cons selects BOM detection, t selects "with
         BOM", anything else "without BOM".  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                   : EQ (val, Qt) ? utf_with_bom
                                   : utf_without_bom);
      coding->detector = detect_coding_utf_8;
      coding->decoder = decode_coding_utf_8;
      coding->encoder = encode_coding_utf_8;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qutf_16))
    {
      /* Same BOM interpretation as for utf-8 above.  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                    : EQ (val, Qt) ? utf_with_bom
                                    : utf_without_bom);
      /* Any endian attribute other than `big' means little endian.  */
      val = AREF (attrs, coding_attr_utf_16_endian);
      CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
                                       : utf_16_little_endian);
      CODING_UTF_16_SURROGATE (coding) = 0;
      coding->detector = detect_coding_utf_16;
      coding->decoder = decode_coding_utf_16;
      coding->encoder = encode_coding_utf_16;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qccl))
    {
      coding->detector = detect_coding_ccl;
      coding->decoder = decode_coding_ccl;
      coding->encoder = encode_coding_ccl;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      coding->detector = detect_coding_emacs_mule;
      coding->decoder = decode_coding_emacs_mule;
      coding->encoder = encode_coding_emacs_mule;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
          && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
        {
          /* Build a fresh safe-charsets string covering every charset
             in Vemacs_mule_charset_list: all bytes are 255 except the
             entries indexed by those charset ids, which are 0.  */
          Lisp_Object tail, safe_charsets;
          int max_charset_id = 0;

          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            if (max_charset_id < XFASTINT (XCAR (tail)))
              max_charset_id = XFASTINT (XCAR (tail));
          safe_charsets = make_uninit_string (max_charset_id + 1);
          memset (SDATA (safe_charsets), 255, max_charset_id + 1);
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
          coding->max_charset_id = max_charset_id;
          coding->safe_charsets = SDATA (safe_charsets);
        }
      coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
      coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
    }
  else if (EQ (coding_type, Qshift_jis))
    {
      coding->detector = detect_coding_sjis;
      coding->decoder = decode_coding_sjis;
      coding->encoder = encode_coding_sjis;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qbig5))
    {
      coding->detector = detect_coding_big5;
      coding->decoder = decode_coding_big5;
      coding->encoder = encode_coding_big5;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else                          /* EQ (coding_type, Qraw_text) */
    {
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      /* Conversion is needed only for a non-unix eol type; encoding
         only when that eol type is already fixed (not a vector).  */
      if (! EQ (eol_type, Qunix))
        {
          coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
          if (! VECTORP (eol_type))
            coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
        }

    }

  return;
}
5798
5799 /* Return a list of charsets supported by CODING.  */
5800
5801 Lisp_Object
5802 coding_charset_list (struct coding_system *coding)
5803 {
5804   Lisp_Object attrs, charset_list;
5805
5806   CODING_GET_INFO (coding, attrs, charset_list);
5807   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5808     {
5809       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5810
5811       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5812         charset_list = Viso_2022_charset_list;
5813     }
5814   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5815     {
5816       charset_list = Vemacs_mule_charset_list;
5817     }
5818   return charset_list;
5819 }
5820
5821
5822 /* Return a list of charsets supported by CODING-SYSTEM.  */
5823
5824 Lisp_Object
5825 coding_system_charset_list (Lisp_Object coding_system)
5826 {
5827   ptrdiff_t id;
5828   Lisp_Object attrs, charset_list;
5829
5830   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5831   attrs = CODING_ID_ATTRS (id);
5832
5833   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5834     {
5835       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5836
5837       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5838         charset_list = Viso_2022_charset_list;
5839       else
5840         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5841     }
5842   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5843     {
5844       charset_list = Vemacs_mule_charset_list;
5845     }
5846   else
5847     {
5848       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5849     }
5850   return charset_list;
5851 }
5852
5853
5854 /* Return raw-text or one of its subsidiaries that has the same
5855    eol_type as CODING-SYSTEM.  */
5856
5857 Lisp_Object
5858 raw_text_coding_system (Lisp_Object coding_system)
5859 {
5860   Lisp_Object spec, attrs;
5861   Lisp_Object eol_type, raw_text_eol_type;
5862
5863   if (NILP (coding_system))
5864     return Qraw_text;
5865   spec = CODING_SYSTEM_SPEC (coding_system);
5866   attrs = AREF (spec, 0);
5867
5868   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5869     return coding_system;
5870
5871   eol_type = AREF (spec, 2);
5872   if (VECTORP (eol_type))
5873     return Qraw_text;
5874   spec = CODING_SYSTEM_SPEC (Qraw_text);
5875   raw_text_eol_type = AREF (spec, 2);
5876   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5877           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5878           : AREF (raw_text_eol_type, 2));
5879 }
5880
5881
5882 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5883    the subsidiary that has the same eol-spec as PARENT (if it is not
5884    nil and specifies end-of-line format) or the system's setting
5885    (system_eol_type).  */
5886
5887 Lisp_Object
5888 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5889 {
5890   Lisp_Object spec, eol_type;
5891
5892   if (NILP (coding_system))
5893     coding_system = Qraw_text;
5894   spec = CODING_SYSTEM_SPEC (coding_system);
5895   eol_type = AREF (spec, 2);
5896   if (VECTORP (eol_type))
5897     {
5898       Lisp_Object parent_eol_type;
5899
5900       if (! NILP (parent))
5901         {
5902           Lisp_Object parent_spec;
5903
5904           parent_spec = CODING_SYSTEM_SPEC (parent);
5905           parent_eol_type = AREF (parent_spec, 2);
5906           if (VECTORP (parent_eol_type))
5907             parent_eol_type = system_eol_type;
5908         }
5909       else
5910         parent_eol_type = system_eol_type;
5911       if (EQ (parent_eol_type, Qunix))
5912         coding_system = AREF (eol_type, 0);
5913       else if (EQ (parent_eol_type, Qdos))
5914         coding_system = AREF (eol_type, 1);
5915       else if (EQ (parent_eol_type, Qmac))
5916         coding_system = AREF (eol_type, 2);
5917     }
5918   return coding_system;
5919 }
5920
5921
5922 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5923    decided for writing to a process.  If not, complement them, and
5924    return a new coding system.  */
5925
5926 Lisp_Object
5927 complement_process_encoding_system (Lisp_Object coding_system)
5928 {
5929   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5930   Lisp_Object spec, attrs;
5931   int i;
5932
5933   for (i = 0; i < 3; i++)
5934     {
5935       if (i == 1)
5936         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5937       else if (i == 2)
5938         coding_system = preferred_coding_system ();
5939       spec = CODING_SYSTEM_SPEC (coding_system);
5940       if (NILP (spec))
5941         continue;
5942       attrs = AREF (spec, 0);
5943       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5944         coding_base = CODING_ATTR_BASE_NAME (attrs);
5945       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5946         eol_base = coding_system;
5947       if (! NILP (coding_base) && ! NILP (eol_base))
5948         break;
5949     }
5950
5951   if (i > 0)
5952     /* The original CODING_SYSTEM didn't specify text-conversion or
5953        eol-conversion.  Be sure that we return a fully complemented
5954        coding system.  */
5955     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5956   return coding_system;
5957 }
5958
5959
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are classified into the following categories:
5965
5966    o coding-category-emacs-mule
5967
5968         The category for a coding system which has the same code range
5969         as Emacs' internal format.  Assigned the coding-system (Lisp
5970         symbol) `emacs-mule' by default.
5971
5972    o coding-category-sjis
5973
5974         The category for a coding system which has the same code range
5975         as SJIS.  Assigned the coding-system (Lisp
5976         symbol) `japanese-shift-jis' by default.
5977
5978    o coding-category-iso-7
5979
5980         The category for a coding system which has the same code range
5981         as ISO2022 of 7-bit environment.  This doesn't use any locking
5982         shift and single shift functions.  This can encode/decode all
5983         charsets.  Assigned the coding-system (Lisp symbol)
5984         `iso-2022-7bit' by default.
5985
5986    o coding-category-iso-7-tight
5987
5988         Same as coding-category-iso-7 except that this can
5989         encode/decode only the specified charsets.
5990
5991    o coding-category-iso-8-1
5992
5993         The category for a coding system which has the same code range
5994         as ISO2022 of 8-bit environment and graphic plane 1 used only
5995         for DIMENSION1 charset.  This doesn't use any locking shift
5996         and single shift functions.  Assigned the coding-system (Lisp
5997         symbol) `iso-latin-1' by default.
5998
5999    o coding-category-iso-8-2
6000
6001         The category for a coding system which has the same code range
6002         as ISO2022 of 8-bit environment and graphic plane 1 used only
6003         for DIMENSION2 charset.  This doesn't use any locking shift
6004         and single shift functions.  Assigned the coding-system (Lisp
6005         symbol) `japanese-iso-8bit' by default.
6006
6007    o coding-category-iso-7-else
6008
6009         The category for a coding system which has the same code range
6010         as ISO2022 of 7-bit environment but uses locking shift or
6011         single shift functions.  Assigned the coding-system (Lisp
6012         symbol) `iso-2022-7bit-lock' by default.
6013
6014    o coding-category-iso-8-else
6015
6016         The category for a coding system which has the same code range
6017         as ISO2022 of 8-bit environment but uses locking shift or
6018         single shift functions.  Assigned the coding-system (Lisp
6019         symbol) `iso-2022-8bit-ss2' by default.
6020
6021    o coding-category-big5
6022
6023         The category for a coding system which has the same code range
6024         as BIG5.  Assigned the coding-system (Lisp symbol)
6025         `cn-big5' by default.
6026
6027    o coding-category-utf-8
6028
6029         The category for a coding system which has the same code range
6030         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6031         symbol) `utf-8' by default.
6032
6033    o coding-category-utf-16-be
6034
6035         The category for a coding system in which a text has an
6036         Unicode signature (cf. Unicode Standard) in the order of BIG
6037         endian at the head.  Assigned the coding-system (Lisp symbol)
6038         `utf-16-be' by default.
6039
6040    o coding-category-utf-16-le
6041
6042         The category for a coding system in which a text has an
6043         Unicode signature (cf. Unicode Standard) in the order of
6044         LITTLE endian at the head.  Assigned the coding-system (Lisp
6045         symbol) `utf-16-le' by default.
6046
6047    o coding-category-ccl
6048
6049         The category for a coding system of which encoder/decoder is
6050         written in CCL programs.  The default value is nil, i.e., no
6051         coding system is assigned.
6052
6053    o coding-category-binary
6054
6055         The category for a coding system not categorized in any of the
6056         above.  Assigned the coding-system (Lisp symbol)
6057         `no-conversion' by default.
6058
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (which is also a Lisp symbol) assigned by a user.
   What Emacs actually does is to detect a category of coding system.
   Then, it uses the `coding-system' assigned to that category.  If
   Emacs can't narrow the choice down to a single category, it selects
   the category of the highest priority.  Priorities of categories are
   also specified by a user in a Lisp variable `coding-category-list'.
6066
6067 */
6068
/* Bit flags recording which end-of-line formats have been seen in a
   text.  EOL_SEEN_LF, EOL_SEEN_CR and EOL_SEEN_CRLF may be or'ed
   together; EOL_SEEN_NONE means no end-of-line has been seen.  */
#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     1
#define EOL_SEEN_CR     2
#define EOL_SEEN_CRLF   4


/* Forward declaration; detect_ascii calls this before its
   definition.  */
static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen);
6076
6077
6078 /* Return true iff all the source bytes are ASCII.
6079    By side effects, set coding->head_ascii and coding->eol_seen.  The
6080    value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
6081    EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
6082    all the source bytes are ASCII.  */
6083
6084 static bool
6085 detect_ascii (struct coding_system *coding)
6086 {
6087   const unsigned char *src, *end;
6088   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6089   int eol_seen;
6090
6091   eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
6092               : EQ (eol_type, Qunix) ? EOL_SEEN_LF
6093               : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6094               : EOL_SEEN_CR);
6095   coding_set_source (coding);
6096   src = coding->source;
6097   end = src + coding->src_bytes;
6098
6099   if (inhibit_eol_conversion)
6100     {
6101       /* We don't have to check EOL format.  */
6102       while (src < end && !( *src & 0x80)) src++;
6103       eol_seen = EOL_SEEN_LF;
6104       adjust_coding_eol_type (coding, eol_seen);
6105     }
6106   else if (eol_seen != EOL_SEEN_NONE)
6107     {
6108       /* We don't have to check EOL format either.  */
6109       while (src < end && !(*src & 0x80)) src++;
6110     }
6111   else
6112     {
6113       end--;                    /* We look ahead one byte.  */
6114       while (src < end)
6115         {
6116           int c = *src;
6117
6118           if (c & 0x80)
6119             break;
6120           src++;
6121           if (c < 0x20)
6122             {
6123               if (c == '\r')
6124                 {
6125                   if (*src == '\n')
6126                     {
6127                       eol_seen |= EOL_SEEN_CRLF;
6128                       src++;
6129                     }
6130                   else
6131                     eol_seen |= EOL_SEEN_CR;
6132                 }
6133               else if (c == '\n')
6134                 eol_seen |= EOL_SEEN_LF;
6135             }
6136         }
6137       if (src > end)
6138         /* The last two bytes are CR LF, which means that we have
6139            scanned all bytes. */
6140         end++;
6141       else if (src == end)
6142         {
6143           end++;
6144           if (! (*src & 0x80))
6145             {
6146               if (*src == '\r')
6147                 eol_seen |= EOL_SEEN_CR;
6148               else if (*src  == '\n')
6149                 eol_seen |= EOL_SEEN_LF;
6150               src++;
6151             }
6152         }
6153       adjust_coding_eol_type (coding, eol_seen);
6154     }
6155   coding->head_ascii = src - coding->source;
6156   coding->eol_seen = eol_seen;
6157   return (src == end);
6158 }
6159
6160
6161 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6162    SOURCE is encoded.  If CATEGORY is one of
6163    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6164    two-byte, else they are encoded by one-byte.
6165
6166    Return one of EOL_SEEN_XXX.  */
6167
6168 #define MAX_EOL_CHECK_COUNT 3
6169
6170 static int
6171 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6172             enum coding_category category)
6173 {
6174   const unsigned char *src = source, *src_end = src + src_bytes;
6175   unsigned char c;
6176   int total  = 0;
6177   int eol_seen = EOL_SEEN_NONE;
6178
6179   if ((1 << category) & CATEGORY_MASK_UTF_16)
6180     {
6181       bool msb = category == (coding_category_utf_16_le
6182                               | coding_category_utf_16_le_nosig);
6183       bool lsb = !msb;
6184
6185       while (src + 1 < src_end)
6186         {
6187           c = src[lsb];
6188           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6189             {
6190               int this_eol;
6191
6192               if (c == '\n')
6193                 this_eol = EOL_SEEN_LF;
6194               else if (src + 3 >= src_end
6195                        || src[msb + 2] != 0
6196                        || src[lsb + 2] != '\n')
6197                 this_eol = EOL_SEEN_CR;
6198               else
6199                 {
6200                   this_eol = EOL_SEEN_CRLF;
6201                   src += 2;
6202                 }
6203
6204               if (eol_seen == EOL_SEEN_NONE)
6205                 /* This is the first end-of-line.  */
6206                 eol_seen = this_eol;
6207               else if (eol_seen != this_eol)
6208                 {
6209                   /* The found type is different from what found before.
6210                      Allow for stray ^M characters in DOS EOL files.  */
6211                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6212                       || (eol_seen == EOL_SEEN_CRLF
6213                           && this_eol == EOL_SEEN_CR))
6214                     eol_seen = EOL_SEEN_CRLF;
6215                   else
6216                     {
6217                       eol_seen = EOL_SEEN_LF;
6218                       break;
6219                     }
6220                 }
6221               if (++total == MAX_EOL_CHECK_COUNT)
6222                 break;
6223             }
6224           src += 2;
6225         }
6226     }
6227   else
6228     while (src < src_end)
6229       {
6230         c = *src++;
6231         if (c == '\n' || c == '\r')
6232           {
6233             int this_eol;
6234
6235             if (c == '\n')
6236               this_eol = EOL_SEEN_LF;
6237             else if (src >= src_end || *src != '\n')
6238               this_eol = EOL_SEEN_CR;
6239             else
6240               this_eol = EOL_SEEN_CRLF, src++;
6241
6242             if (eol_seen == EOL_SEEN_NONE)
6243               /* This is the first end-of-line.  */
6244               eol_seen = this_eol;
6245             else if (eol_seen != this_eol)
6246               {
6247                 /* The found type is different from what found before.
6248                    Allow for stray ^M characters in DOS EOL files.  */
6249                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6250                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6251                   eol_seen = EOL_SEEN_CRLF;
6252                 else
6253                   {
6254                     eol_seen = EOL_SEEN_LF;
6255                     break;
6256                   }
6257               }
6258             if (++total == MAX_EOL_CHECK_COUNT)
6259               break;
6260           }
6261       }
6262   return eol_seen;
6263 }
6264
6265
6266 static Lisp_Object
6267 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6268 {
6269   Lisp_Object eol_type;
6270
6271   eol_type = CODING_ID_EOL_TYPE (coding->id);
6272   if (eol_seen & EOL_SEEN_LF)
6273     {
6274       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6275       eol_type = Qunix;
6276     }
6277   else if (eol_seen & EOL_SEEN_CRLF)
6278     {
6279       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6280       eol_type = Qdos;
6281     }
6282   else if (eol_seen & EOL_SEEN_CR)
6283     {
6284       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6285       eol_type = Qmac;
6286     }
6287   return eol_type;
6288 }
6289
6290 /* Detect how a text specified in CODING is encoded.  If a coding
6291    system is detected, update fields of CODING by the detected coding
6292    system.  */
6293
6294 static void
6295 detect_coding (struct coding_system *coding)
6296 {
6297   const unsigned char *src, *src_end;
6298   unsigned int saved_mode = coding->mode;
6299
6300   coding->consumed = coding->consumed_char = 0;
6301   coding->produced = coding->produced_char = 0;
6302   coding_set_source (coding);
6303
6304   src_end = coding->source + coding->src_bytes;
6305
6306   /* If we have not yet decided the text encoding type, detect it
6307      now.  */
6308   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6309     {
6310       int c, i;
6311       struct coding_detection_info detect_info;
6312       bool null_byte_found = 0, eight_bit_found = 0;
6313
6314       coding->head_ascii = 0;
6315       coding->eol_seen = EOL_SEEN_NONE;
6316       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6317       for (src = coding->source; src < src_end; src++)
6318         {
6319           c = *src;
6320           if (c & 0x80)
6321             {
6322               eight_bit_found = 1;
6323               if (null_byte_found)
6324                 break;
6325             }
6326           else if (c < 0x20)
6327             {
6328               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6329                   && ! inhibit_iso_escape_detection
6330                   && ! detect_info.checked)
6331                 {
6332                   if (detect_coding_iso_2022 (coding, &detect_info))
6333                     {
6334                       /* We have scanned the whole data.  */
6335                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6336                         {
6337                           /* We didn't find an 8-bit code.  We may
6338                              have found a null-byte, but it's very
6339                              rare that a binary file conforms to
6340                              ISO-2022.  */
6341                           src = src_end;
6342                           coding->head_ascii = src - coding->source;
6343                         }
6344                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6345                       break;
6346                     }
6347                 }
6348               else if (! c && !inhibit_null_byte_detection)
6349                 {
6350                   null_byte_found = 1;
6351                   if (eight_bit_found)
6352                     break;
6353                 }
6354               else if (! disable_ascii_optimization
6355                        && ! inhibit_eol_conversion)
6356                 {
6357                   if (c == '\r')
6358                     {
6359                       if (src < src_end && src[1] == '\n')
6360                         {
6361                           coding->eol_seen |= EOL_SEEN_CRLF;
6362                           src++;
6363                           coding->head_ascii++;
6364                         }
6365                       else
6366                         coding->eol_seen |= EOL_SEEN_CR;
6367                     }
6368                   else if (c == '\n')
6369                     {
6370                       coding->eol_seen |= EOL_SEEN_LF;
6371                     }
6372                 }
6373
6374               if (! eight_bit_found)
6375                 coding->head_ascii++;
6376             }
6377           else if (! eight_bit_found)
6378             coding->head_ascii++;
6379         }
6380
6381       if (null_byte_found || eight_bit_found
6382           || coding->head_ascii < coding->src_bytes
6383           || detect_info.found)
6384         {
6385           enum coding_category category;
6386           struct coding_system *this;
6387
6388           if (coding->head_ascii == coding->src_bytes)
6389             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6390             for (i = 0; i < coding_category_raw_text; i++)
6391               {
6392                 category = coding_priorities[i];
6393                 this = coding_categories + category;
6394                 if (detect_info.found & (1 << category))
6395                   break;
6396               }
6397           else
6398             {
6399               if (null_byte_found)
6400                 {
6401                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6402                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6403                 }
6404               for (i = 0; i < coding_category_raw_text; i++)
6405                 {
6406                   category = coding_priorities[i];
6407                   this = coding_categories + category;
6408                   /* Some of this->detector (e.g. detect_coding_sjis)
6409                      require this information.  */
6410                   coding->id = this->id;
6411                   if (this->id < 0)
6412                     {
6413                       /* No coding system of this category is defined.  */
6414                       detect_info.rejected |= (1 << category);
6415                     }
6416                   else if (category >= coding_category_raw_text)
6417                     continue;
6418                   else if (detect_info.checked & (1 << category))
6419                     {
6420                       if (detect_info.found & (1 << category))
6421                         break;
6422                     }
6423                   else if ((*(this->detector)) (coding, &detect_info)
6424                            && detect_info.found & (1 << category))
6425                     {
6426                       if (category == coding_category_utf_16_auto)
6427                         {
6428                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6429                             category = coding_category_utf_16_le;
6430                           else
6431                             category = coding_category_utf_16_be;
6432                         }
6433                       break;
6434                     }
6435                 }
6436             }
6437
6438           if (i < coding_category_raw_text)
6439             setup_coding_system (CODING_ID_NAME (this->id), coding);
6440           else if (null_byte_found)
6441             setup_coding_system (Qno_conversion, coding);
6442           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6443                    == CATEGORY_MASK_ANY)
6444             setup_coding_system (Qraw_text, coding);
6445           else if (detect_info.rejected)
6446             for (i = 0; i < coding_category_raw_text; i++)
6447               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6448                 {
6449                   this = coding_categories + coding_priorities[i];
6450                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6451                   break;
6452                 }
6453         }
6454     }
6455   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6456            == coding_category_utf_8_auto)
6457     {
6458       Lisp_Object coding_systems;
6459       struct coding_detection_info detect_info;
6460
6461       coding_systems
6462         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6463       detect_info.found = detect_info.rejected = 0;
6464       if (detect_ascii (coding))
6465         {
6466           setup_coding_system (XCDR (coding_systems), coding);
6467         }
6468       else
6469         {
6470           if (CONSP (coding_systems)
6471               && detect_coding_utf_8 (coding, &detect_info))
6472             {
6473               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6474                 setup_coding_system (XCAR (coding_systems), coding);
6475               else
6476                 setup_coding_system (XCDR (coding_systems), coding);
6477             }
6478         }
6479     }
6480   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6481            == coding_category_utf_16_auto)
6482     {
6483       Lisp_Object coding_systems;
6484       struct coding_detection_info detect_info;
6485
6486       coding_systems
6487         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6488       detect_info.found = detect_info.rejected = 0;
6489       coding->head_ascii = 0;
6490       coding->eol_seen = EOL_SEEN_NONE;
6491       if (CONSP (coding_systems)
6492           && detect_coding_utf_16 (coding, &detect_info))
6493         {
6494           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6495             setup_coding_system (XCAR (coding_systems), coding);
6496           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6497             setup_coding_system (XCDR (coding_systems), coding);
6498         }
6499     }
6500   coding->mode = saved_mode;
6501 }
6502
6503
6504 static void
6505 decode_eol (struct coding_system *coding)
6506 {
6507   Lisp_Object eol_type;
6508   unsigned char *p, *pbeg, *pend;
6509
6510   eol_type = CODING_ID_EOL_TYPE (coding->id);
6511   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6512     return;
6513
6514   if (NILP (coding->dst_object))
6515     pbeg = coding->destination;
6516   else
6517     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6518   pend = pbeg + coding->produced;
6519
6520   if (VECTORP (eol_type))
6521     {
6522       int eol_seen = EOL_SEEN_NONE;
6523
6524       for (p = pbeg; p < pend; p++)
6525         {
6526           if (*p == '\n')
6527             eol_seen |= EOL_SEEN_LF;
6528           else if (*p == '\r')
6529             {
6530               if (p + 1 < pend && *(p + 1) == '\n')
6531                 {
6532                   eol_seen |= EOL_SEEN_CRLF;
6533                   p++;
6534                 }
6535               else
6536                 eol_seen |= EOL_SEEN_CR;
6537             }
6538         }
6539       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6540       if ((eol_seen & EOL_SEEN_CRLF) != 0
6541           && (eol_seen & EOL_SEEN_CR) != 0
6542           && (eol_seen & EOL_SEEN_LF) == 0)
6543         eol_seen = EOL_SEEN_CRLF;
6544       else if (eol_seen != EOL_SEEN_NONE
6545           && eol_seen != EOL_SEEN_LF
6546           && eol_seen != EOL_SEEN_CRLF
6547           && eol_seen != EOL_SEEN_CR)
6548         eol_seen = EOL_SEEN_LF;
6549       if (eol_seen != EOL_SEEN_NONE)
6550         eol_type = adjust_coding_eol_type (coding, eol_seen);
6551     }
6552
6553   if (EQ (eol_type, Qmac))
6554     {
6555       for (p = pbeg; p < pend; p++)
6556         if (*p == '\r')
6557           *p = '\n';
6558     }
6559   else if (EQ (eol_type, Qdos))
6560     {
6561       ptrdiff_t n = 0;
6562
6563       if (NILP (coding->dst_object))
6564         {
6565           /* Start deleting '\r' from the tail to minimize the memory
6566              movement.  */
6567           for (p = pend - 2; p >= pbeg; p--)
6568             if (*p == '\r')
6569               {
6570                 memmove (p, p + 1, pend-- - p - 1);
6571                 n++;
6572               }
6573         }
6574       else
6575         {
6576           ptrdiff_t pos_byte = coding->dst_pos_byte;
6577           ptrdiff_t pos = coding->dst_pos;
6578           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6579
6580           while (pos < pos_end)
6581             {
6582               p = BYTE_POS_ADDR (pos_byte);
6583               if (*p == '\r' && p[1] == '\n')
6584                 {
6585                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6586                   n++;
6587                   pos_end--;
6588                 }
6589               pos++;
6590               if (coding->dst_multibyte)
6591                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6592               else
6593                 pos_byte++;
6594             }
6595         }
6596       coding->produced -= n;
6597       coding->produced_char -= n;
6598     }
6599 }
6600
6601
6602 /* Return a translation table (or list of them) from coding system
6603    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6604    not ENCODEP). */
6605
6606 static Lisp_Object
6607 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6608 {
6609   Lisp_Object standard, translation_table;
6610   Lisp_Object val;
6611
6612   if (NILP (Venable_character_translation))
6613     {
6614       if (max_lookup)
6615         *max_lookup = 0;
6616       return Qnil;
6617     }
6618   if (encodep)
6619     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6620       standard = Vstandard_translation_table_for_encode;
6621   else
6622     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6623       standard = Vstandard_translation_table_for_decode;
6624   if (NILP (translation_table))
6625     translation_table = standard;
6626   else
6627     {
6628       if (SYMBOLP (translation_table))
6629         translation_table = Fget (translation_table, Qtranslation_table);
6630       else if (CONSP (translation_table))
6631         {
6632           translation_table = Fcopy_sequence (translation_table);
6633           for (val = translation_table; CONSP (val); val = XCDR (val))
6634             if (SYMBOLP (XCAR (val)))
6635               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6636         }
6637       if (CHAR_TABLE_P (standard))
6638         {
6639           if (CONSP (translation_table))
6640             translation_table = nconc2 (translation_table,
6641                                         Fcons (standard, Qnil));
6642           else
6643             translation_table = Fcons (translation_table,
6644                                        Fcons (standard, Qnil));
6645         }
6646     }
6647
6648   if (max_lookup)
6649     {
6650       *max_lookup = 1;
6651       if (CHAR_TABLE_P (translation_table)
6652           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6653         {
6654           val = XCHAR_TABLE (translation_table)->extras[1];
6655           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6656             *max_lookup = XFASTINT (val);
6657         }
6658       else if (CONSP (translation_table))
6659         {
6660           Lisp_Object tail;
6661
6662           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6663             if (CHAR_TABLE_P (XCAR (tail))
6664                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6665               {
6666                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6667                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6668                   *max_lookup = XFASTINT (tailval);
6669               }
6670         }
6671     }
6672   return translation_table;
6673 }
6674
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables (any other value yields no translation).

   C and TRANS must be lvalues.  When a table maps C to a character,
   C is replaced by that character and TRANS is set to Qnil; with a
   list of tables, the lookup then continues in the following tables
   with the new C.  When a table maps C to some other non-nil value,
   TRANS is set to that value and the lookup stops.  Otherwise TRANS
   is Qnil.

   TABLE is evaluated exactly once.  The internal variables carry a
   trailing underscore so that they cannot shadow a caller's
   variable.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    Lisp_Object lookup_table_;                                          \
                                                                        \
    (trans) = Qnil;                                                     \
    lookup_table_ = (table);                                            \
    if (CHAR_TABLE_P (lookup_table_))                                   \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (lookup_table_, (c));                  \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (lookup_table_))                                     \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = lookup_table_; CONSP (lookup_tail_);        \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), (c));      \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6699
6700
6701 /* Return a translation of character(s) at BUF according to TRANS.
6702    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6703    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6704    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6705    translation is found, and Qnil if not found..
6706    If BUF is too short to lookup characters in FROM, return Qt.  */
6707
6708 static Lisp_Object
6709 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6710 {
6711
6712   if (INTEGERP (trans))
6713     return trans;
6714   for (; CONSP (trans); trans = XCDR (trans))
6715     {
6716       Lisp_Object val = XCAR (trans);
6717       Lisp_Object from = XCAR (val);
6718       ptrdiff_t len = ASIZE (from);
6719       ptrdiff_t i;
6720
6721       for (i = 0; i < len; i++)
6722         {
6723           if (buf + i == buf_end)
6724             return Qt;
6725           if (XINT (AREF (from, i)) != buf[i])
6726             break;
6727         }
6728       if (i == len)
6729         return val;
6730     }
6731   return Qnil;
6732 }
6733
6734
6735 static int
6736 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6737                bool last_block)
6738 {
6739   unsigned char *dst = coding->destination + coding->produced;
6740   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6741   ptrdiff_t produced;
6742   ptrdiff_t produced_chars = 0;
6743   int carryover = 0;
6744
6745   if (! coding->chars_at_source)
6746     {
6747       /* Source characters are in coding->charbuf.  */
6748       int *buf = coding->charbuf;
6749       int *buf_end = buf + coding->charbuf_used;
6750
6751       if (EQ (coding->src_object, coding->dst_object))
6752         {
6753           coding_set_source (coding);
6754           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6755         }
6756
6757       while (buf < buf_end)
6758         {
6759           int c = *buf;
6760           ptrdiff_t i;
6761
6762           if (c >= 0)
6763             {
6764               ptrdiff_t from_nchars = 1, to_nchars = 1;
6765               Lisp_Object trans = Qnil;
6766
6767               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6768               if (! NILP (trans))
6769                 {
6770                   trans = get_translation (trans, buf, buf_end);
6771                   if (INTEGERP (trans))
6772                     c = XINT (trans);
6773                   else if (CONSP (trans))
6774                     {
6775                       from_nchars = ASIZE (XCAR (trans));
6776                       trans = XCDR (trans);
6777                       if (INTEGERP (trans))
6778                         c = XINT (trans);
6779                       else
6780                         {
6781                           to_nchars = ASIZE (trans);
6782                           c = XINT (AREF (trans, 0));
6783                         }
6784                     }
6785                   else if (EQ (trans, Qt) && ! last_block)
6786                     break;
6787                 }
6788
6789               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6790                 {
6791                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6792                        / MAX_MULTIBYTE_LENGTH)
6793                       < to_nchars)
6794                     memory_full (SIZE_MAX);
6795                   dst = alloc_destination (coding,
6796                                            buf_end - buf
6797                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6798                                            dst);
6799                   if (EQ (coding->src_object, coding->dst_object))
6800                     {
6801                       coding_set_source (coding);
6802                       dst_end = (((unsigned char *) coding->source)
6803                                  + coding->consumed);
6804                     }
6805                   else
6806                     dst_end = coding->destination + coding->dst_bytes;
6807                 }
6808
6809               for (i = 0; i < to_nchars; i++)
6810                 {
6811                   if (i > 0)
6812                     c = XINT (AREF (trans, i));
6813                   if (coding->dst_multibyte
6814                       || ! CHAR_BYTE8_P (c))
6815                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6816                   else
6817                     *dst++ = CHAR_TO_BYTE8 (c);
6818                 }
6819               produced_chars += to_nchars;
6820               buf += from_nchars;
6821             }
6822           else
6823             /* This is an annotation datum.  (-C) is the length.  */
6824             buf += -c;
6825         }
6826       carryover = buf_end - buf;
6827     }
6828   else
6829     {
6830       /* Source characters are at coding->source.  */
6831       const unsigned char *src = coding->source;
6832       const unsigned char *src_end = src + coding->consumed;
6833
6834       if (EQ (coding->dst_object, coding->src_object))
6835         dst_end = (unsigned char *) src;
6836       if (coding->src_multibyte != coding->dst_multibyte)
6837         {
6838           if (coding->src_multibyte)
6839             {
6840               bool multibytep = 1;
6841               ptrdiff_t consumed_chars = 0;
6842
6843               while (1)
6844                 {
6845                   const unsigned char *src_base = src;
6846                   int c;
6847
6848                   ONE_MORE_BYTE (c);
6849                   if (dst == dst_end)
6850                     {
6851                       if (EQ (coding->src_object, coding->dst_object))
6852                         dst_end = (unsigned char *) src;
6853                       if (dst == dst_end)
6854                         {
6855                           ptrdiff_t offset = src - coding->source;
6856
6857                           dst = alloc_destination (coding, src_end - src + 1,
6858                                                    dst);
6859                           dst_end = coding->destination + coding->dst_bytes;
6860                           coding_set_source (coding);
6861                           src = coding->source + offset;
6862                           src_end = coding->source + coding->consumed;
6863                           if (EQ (coding->src_object, coding->dst_object))
6864                             dst_end = (unsigned char *) src;
6865                         }
6866                     }
6867                   *dst++ = c;
6868                   produced_chars++;
6869                 }
6870             no_more_source:
6871               ;
6872             }
6873           else
6874             while (src < src_end)
6875               {
6876                 bool multibytep = 1;
6877                 int c = *src++;
6878
6879                 if (dst >= dst_end - 1)
6880                   {
6881                     if (EQ (coding->src_object, coding->dst_object))
6882                       dst_end = (unsigned char *) src;
6883                     if (dst >= dst_end - 1)
6884                       {
6885                         ptrdiff_t offset = src - coding->source;
6886                         ptrdiff_t more_bytes;
6887
6888                         if (EQ (coding->src_object, coding->dst_object))
6889                           more_bytes = ((src_end - src) / 2) + 2;
6890                         else
6891                           more_bytes = src_end - src + 2;
6892                         dst = alloc_destination (coding, more_bytes, dst);
6893                         dst_end = coding->destination + coding->dst_bytes;
6894                         coding_set_source (coding);
6895                         src = coding->source + offset;
6896                         src_end = coding->source + coding->consumed;
6897                         if (EQ (coding->src_object, coding->dst_object))
6898                           dst_end = (unsigned char *) src;
6899                       }
6900                   }
6901                 EMIT_ONE_BYTE (c);
6902               }
6903         }
6904       else
6905         {
6906           if (!EQ (coding->src_object, coding->dst_object))
6907             {
6908               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6909
6910               if (require > 0)
6911                 {
6912                   ptrdiff_t offset = src - coding->source;
6913
6914                   dst = alloc_destination (coding, require, dst);
6915                   coding_set_source (coding);
6916                   src = coding->source + offset;
6917                   src_end = coding->source + coding->consumed;
6918                 }
6919             }
6920           produced_chars = coding->consumed_char;
6921           while (src < src_end)
6922             *dst++ = *src++;
6923         }
6924     }
6925
6926   produced = dst - (coding->destination + coding->produced);
6927   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6928     insert_from_gap (produced_chars, produced, 0);
6929   coding->produced += produced;
6930   coding->produced_char += produced_chars;
6931   return carryover;
6932 }
6933
6934 /* Compose text in CODING->object according to the annotation data at
6935    CHARBUF.  CHARBUF is an array:
6936      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6937  */
6938
6939 static void
6940 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6941 {
6942   int len;
6943   ptrdiff_t to;
6944   enum composition_method method;
6945   Lisp_Object components;
6946
6947   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6948   to = pos + charbuf[2];
6949   method = (enum composition_method) (charbuf[4]);
6950
6951   if (method == COMPOSITION_RELATIVE)
6952     components = Qnil;
6953   else
6954     {
6955       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6956       int i, j;
6957
6958       if (method == COMPOSITION_WITH_RULE)
6959         len = charbuf[2] * 3 - 2;
6960       charbuf += MAX_ANNOTATION_LENGTH;
6961       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6962       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6963         {
6964           if (charbuf[i] >= 0)
6965             args[j] = make_number (charbuf[i]);
6966           else
6967             {
6968               i++;
6969               args[j] = make_number (charbuf[i] % 0x100);
6970             }
6971         }
6972       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6973     }
6974   compose_text (pos, to, components, Qnil, coding->dst_object);
6975 }
6976
6977
6978 /* Put `charset' property on text in CODING->object according to
6979    the annotation data at CHARBUF.  CHARBUF is an array:
6980      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6981  */
6982
6983 static void
6984 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6985 {
6986   ptrdiff_t from = pos - charbuf[2];
6987   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6988
6989   Fput_text_property (make_number (from), make_number (pos),
6990                       Qcharset, CHARSET_NAME (charset),
6991                       coding->dst_object);
6992 }
6993
6994
/* Number of `int' elements in the conversion work buffer.  */
#define CHARBUF_SIZE 0x4000

/* Give CODING (a pointer to struct coding_system) a work buffer of
   CHARBUF_SIZE ints, obtained with SAFE_ALLOCA, and record its size.
   This stays a macro because SAFE_ALLOCA allocates on behalf of the
   calling function; the caller is expected to have declared
   USE_SAFE_ALLOCA first, as decode_coding does.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    (coding)->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));      \
    (coding)->charbuf_size = CHARBUF_SIZE;                              \
  } while (0)
7002
7003
7004 static void
7005 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7006 {
7007   int *charbuf = coding->charbuf;
7008   int *charbuf_end = charbuf + coding->charbuf_used;
7009
7010   if (NILP (coding->dst_object))
7011     return;
7012
7013   while (charbuf < charbuf_end)
7014     {
7015       if (*charbuf >= 0)
7016         pos++, charbuf++;
7017       else
7018         {
7019           int len = -*charbuf;
7020
7021           if (len > 2)
7022             switch (charbuf[1])
7023               {
7024               case CODING_ANNOTATE_COMPOSITION_MASK:
7025                 produce_composition (coding, charbuf, pos);
7026                 break;
7027               case CODING_ANNOTATE_CHARSET_MASK:
7028                 produce_charset (coding, charbuf, pos);
7029                 break;
7030               }
7031           charbuf += len;
7032         }
7033     }
7034 }
7035
7036 /* Decode the data at CODING->src_object into CODING->dst_object.
7037    CODING->src_object is a buffer, a string, or nil.
7038    CODING->dst_object is a buffer.
7039
7040    If CODING->src_object is a buffer, it must be the current buffer.
7041    In this case, if CODING->src_pos is positive, it is a position of
7042    the source text in the buffer, otherwise, the source text is in the
7043    gap area of the buffer, and CODING->src_pos specifies the offset of
7044    the text from GPT (which must be the same as PT).  If this is the
7045    same buffer as CODING->dst_object, CODING->src_pos must be
7046    negative.
7047
7048    If CODING->src_object is a string, CODING->src_pos is an index to
7049    that string.
7050
7051    If CODING->src_object is nil, CODING->source must already point to
7052    the non-relocatable memory area.  In this case, CODING->src_pos is
7053    an offset from CODING->source.
7054
7055    The decoded data is inserted at the current point of the buffer
7056    CODING->dst_object.
7057 */
7058
7059 static void
7060 decode_coding (struct coding_system *coding)
7061 {
7062   Lisp_Object attrs;
7063   Lisp_Object undo_list;
7064   Lisp_Object translation_table;
7065   struct ccl_spec cclspec;
7066   int carryover;
7067   int i;
7068
7069   USE_SAFE_ALLOCA;
7070
7071   if (BUFFERP (coding->src_object)
7072       && coding->src_pos > 0
7073       && coding->src_pos < GPT
7074       && coding->src_pos + coding->src_chars > GPT)
7075     move_gap_both (coding->src_pos, coding->src_pos_byte);
7076
7077   undo_list = Qt;
7078   if (BUFFERP (coding->dst_object))
7079     {
7080       set_buffer_internal (XBUFFER (coding->dst_object));
7081       if (GPT != PT)
7082         move_gap_both (PT, PT_BYTE);
7083
7084       /* We must disable undo_list in order to record the whole insert
7085          transaction via record_insert at the end.  But doing so also
7086          disables the recording of the first change to the undo_list.
7087          Therefore we check for first change here and record it via
7088          record_first_change if needed.  */
7089       if (MODIFF <= SAVE_MODIFF)
7090         record_first_change ();
7091
7092       undo_list = BVAR (current_buffer, undo_list);
7093       bset_undo_list (current_buffer, Qt);
7094     }
7095
7096   coding->consumed = coding->consumed_char = 0;
7097   coding->produced = coding->produced_char = 0;
7098   coding->chars_at_source = 0;
7099   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7100   coding->errors = 0;
7101
7102   ALLOC_CONVERSION_WORK_AREA (coding);
7103
7104   attrs = CODING_ID_ATTRS (coding->id);
7105   translation_table = get_translation_table (attrs, 0, NULL);
7106
7107   carryover = 0;
7108   if (coding->decoder == decode_coding_ccl)
7109     {
7110       coding->spec.ccl = &cclspec;
7111       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7112     }
7113   do
7114     {
7115       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7116
7117       coding_set_source (coding);
7118       coding->annotated = 0;
7119       coding->charbuf_used = carryover;
7120       (*(coding->decoder)) (coding);
7121       coding_set_destination (coding);
7122       carryover = produce_chars (coding, translation_table, 0);
7123       if (coding->annotated)
7124         produce_annotation (coding, pos);
7125       for (i = 0; i < carryover; i++)
7126         coding->charbuf[i]
7127           = coding->charbuf[coding->charbuf_used - carryover + i];
7128     }
7129   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7130          || (coding->consumed < coding->src_bytes
7131              && (coding->result == CODING_RESULT_SUCCESS
7132                  || coding->result == CODING_RESULT_INVALID_SRC)));
7133
7134   if (carryover > 0)
7135     {
7136       coding_set_destination (coding);
7137       coding->charbuf_used = carryover;
7138       produce_chars (coding, translation_table, 1);
7139     }
7140
7141   coding->carryover_bytes = 0;
7142   if (coding->consumed < coding->src_bytes)
7143     {
7144       int nbytes = coding->src_bytes - coding->consumed;
7145       const unsigned char *src;
7146
7147       coding_set_source (coding);
7148       coding_set_destination (coding);
7149       src = coding->source + coding->consumed;
7150
7151       if (coding->mode & CODING_MODE_LAST_BLOCK)
7152         {
7153           /* Flush out unprocessed data as binary chars.  We are sure
7154              that the number of data is less than the size of
7155              coding->charbuf.  */
7156           coding->charbuf_used = 0;
7157           coding->chars_at_source = 0;
7158
7159           while (nbytes-- > 0)
7160             {
7161               int c = *src++;
7162
7163               if (c & 0x80)
7164                 c = BYTE8_TO_CHAR (c);
7165               coding->charbuf[coding->charbuf_used++] = c;
7166             }
7167           produce_chars (coding, Qnil, 1);
7168         }
7169       else
7170         {
7171           /* Record unprocessed bytes in coding->carryover.  We are
7172              sure that the number of data is less than the size of
7173              coding->carryover.  */
7174           unsigned char *p = coding->carryover;
7175
7176           if (nbytes > sizeof coding->carryover)
7177             nbytes = sizeof coding->carryover;
7178           coding->carryover_bytes = nbytes;
7179           while (nbytes-- > 0)
7180             *p++ = *src++;
7181         }
7182       coding->consumed = coding->src_bytes;
7183     }
7184
7185   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7186       && !inhibit_eol_conversion)
7187     decode_eol (coding);
7188   if (BUFFERP (coding->dst_object))
7189     {
7190       bset_undo_list (current_buffer, undo_list);
7191       record_insert (coding->dst_pos, coding->produced_char);
7192     }
7193
7194   SAFE_FREE ();
7195 }
7196
7197
7198 /* Extract an annotation datum from a composition starting at POS and
7199    ending before LIMIT of CODING->src_object (buffer or string), store
7200    the data in BUF, set *STOP to a starting position of the next
7201    composition (if any) or to LIMIT, and return the address of the
7202    next element of BUF.
7203
7204    If such an annotation is not found, set *STOP to a starting
7205    position of a composition after POS (if any) or to LIMIT, and
7206    return BUF.  */
7207
7208 static int *
7209 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7210                                struct coding_system *coding, int *buf,
7211                                ptrdiff_t *stop)
7212 {
7213   ptrdiff_t start, end;
7214   Lisp_Object prop;
7215
7216   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7217       || end > limit)
7218     *stop = limit;
7219   else if (start > pos)
7220     *stop = start;
7221   else
7222     {
7223       if (start == pos)
7224         {
7225           /* We found a composition.  Store the corresponding
7226              annotation data in BUF.  */
7227           int *head = buf;
7228           enum composition_method method = COMPOSITION_METHOD (prop);
7229           int nchars = COMPOSITION_LENGTH (prop);
7230
7231           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7232           if (method != COMPOSITION_RELATIVE)
7233             {
7234               Lisp_Object components;
7235               ptrdiff_t i, len, i_byte;
7236
7237               components = COMPOSITION_COMPONENTS (prop);
7238               if (VECTORP (components))
7239                 {
7240                   len = ASIZE (components);
7241                   for (i = 0; i < len; i++)
7242                     *buf++ = XINT (AREF (components, i));
7243                 }
7244               else if (STRINGP (components))
7245                 {
7246                   len = SCHARS (components);
7247                   i = i_byte = 0;
7248                   while (i < len)
7249                     {
7250                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7251                       buf++;
7252                     }
7253                 }
7254               else if (INTEGERP (components))
7255                 {
7256                   len = 1;
7257                   *buf++ = XINT (components);
7258                 }
7259               else if (CONSP (components))
7260                 {
7261                   for (len = 0; CONSP (components);
7262                        len++, components = XCDR (components))
7263                     *buf++ = XINT (XCAR (components));
7264                 }
7265               else
7266                 emacs_abort ();
7267               *head -= len;
7268             }
7269         }
7270
7271       if (find_composition (end, limit, &start, &end, &prop,
7272                             coding->src_object)
7273           && end <= limit)
7274         *stop = start;
7275       else
7276         *stop = limit;
7277     }
7278   return buf;
7279 }
7280
7281
7282 /* Extract an annotation datum from a text property `charset' at POS of
7283    CODING->src_object (buffer of string), store the data in BUF, set
7284    *STOP to the position where the value of `charset' property changes
7285    (limiting by LIMIT), and return the address of the next element of
7286    BUF.
7287
7288    If the property value is nil, set *STOP to the position where the
7289    property value is non-nil (limiting by LIMIT), and return BUF.  */
7290
7291 static int *
7292 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7293                            struct coding_system *coding, int *buf,
7294                            ptrdiff_t *stop)
7295 {
7296   Lisp_Object val, next;
7297   int id;
7298
7299   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7300   if (! NILP (val) && CHARSETP (val))
7301     id = XINT (CHARSET_SYMBOL_ID (val));
7302   else
7303     id = -1;
7304   ADD_CHARSET_DATA (buf, 0, id);
7305   next = Fnext_single_property_change (make_number (pos), Qcharset,
7306                                        coding->src_object,
7307                                        make_number (limit));
7308   *stop = XINT (next);
7309   return buf;
7310 }
7311
7312
7313 static void
7314 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7315                int max_lookup)
7316 {
7317   int *buf = coding->charbuf;
7318   int *buf_end = coding->charbuf + coding->charbuf_size;
7319   const unsigned char *src = coding->source + coding->consumed;
7320   const unsigned char *src_end = coding->source + coding->src_bytes;
7321   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7322   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7323   bool multibytep = coding->src_multibyte;
7324   Lisp_Object eol_type;
7325   int c;
7326   ptrdiff_t stop, stop_composition, stop_charset;
7327   int *lookup_buf = NULL;
7328
7329   if (! NILP (translation_table))
7330     lookup_buf = alloca (sizeof (int) * max_lookup);
7331
7332   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7333   if (VECTORP (eol_type))
7334     eol_type = Qunix;
7335
7336   /* Note: composition handling is not yet implemented.  */
7337   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7338
7339   if (NILP (coding->src_object))
7340     stop = stop_composition = stop_charset = end_pos;
7341   else
7342     {
7343       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7344         stop = stop_composition = pos;
7345       else
7346         stop = stop_composition = end_pos;
7347       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7348         stop = stop_charset = pos;
7349       else
7350         stop_charset = end_pos;
7351     }
7352
7353   /* Compensate for CRLF and conversion.  */
7354   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7355   while (buf < buf_end)
7356     {
7357       Lisp_Object trans;
7358
7359       if (pos == stop)
7360         {
7361           if (pos == end_pos)
7362             break;
7363           if (pos == stop_composition)
7364             buf = handle_composition_annotation (pos, end_pos, coding,
7365                                                  buf, &stop_composition);
7366           if (pos == stop_charset)
7367             buf = handle_charset_annotation (pos, end_pos, coding,
7368                                              buf, &stop_charset);
7369           stop = (stop_composition < stop_charset
7370                   ? stop_composition : stop_charset);
7371         }
7372
7373       if (! multibytep)
7374         {
7375           int bytes;
7376
7377           if (coding->encoder == encode_coding_raw_text
7378               || coding->encoder == encode_coding_ccl)
7379             c = *src++, pos++;
7380           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7381             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7382           else
7383             c = BYTE8_TO_CHAR (*src), src++, pos++;
7384         }
7385       else
7386         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7387       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7388         c = '\n';
7389       if (! EQ (eol_type, Qunix))
7390         {
7391           if (c == '\n')
7392             {
7393               if (EQ (eol_type, Qdos))
7394                 *buf++ = '\r';
7395               else
7396                 c = '\r';
7397             }
7398         }
7399
7400       trans = Qnil;
7401       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7402       if (NILP (trans))
7403         *buf++ = c;
7404       else
7405         {
7406           ptrdiff_t from_nchars = 1, to_nchars = 1;
7407           int *lookup_buf_end;
7408           const unsigned char *p = src;
7409           int i;
7410
7411           lookup_buf[0] = c;
7412           for (i = 1; i < max_lookup && p < src_end; i++)
7413             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7414           lookup_buf_end = lookup_buf + i;
7415           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7416           if (INTEGERP (trans))
7417             c = XINT (trans);
7418           else if (CONSP (trans))
7419             {
7420               from_nchars = ASIZE (XCAR (trans));
7421               trans = XCDR (trans);
7422               if (INTEGERP (trans))
7423                 c = XINT (trans);
7424               else
7425                 {
7426                   to_nchars = ASIZE (trans);
7427                   if (buf_end - buf < to_nchars)
7428                     break;
7429                   c = XINT (AREF (trans, 0));
7430                 }
7431             }
7432           else
7433             break;
7434           *buf++ = c;
7435           for (i = 1; i < to_nchars; i++)
7436             *buf++ = XINT (AREF (trans, i));
7437           for (i = 1; i < from_nchars; i++, pos++)
7438             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7439         }
7440     }
7441
7442   coding->consumed = src - coding->source;
7443   coding->consumed_char = pos - coding->src_pos;
7444   coding->charbuf_used = buf - coding->charbuf;
7445   coding->chars_at_source = 0;
7446 }
7447
7448
7449 /* Encode the text at CODING->src_object into CODING->dst_object.
7450    CODING->src_object is a buffer or a string.
7451    CODING->dst_object is a buffer or nil.
7452
7453    If CODING->src_object is a buffer, it must be the current buffer.
7454    In this case, if CODING->src_pos is positive, it is a position of
7455    the source text in the buffer, otherwise. the source text is in the
7456    gap area of the buffer, and coding->src_pos specifies the offset of
7457    the text from GPT (which must be the same as PT).  If this is the
7458    same buffer as CODING->dst_object, CODING->src_pos must be
7459    negative and CODING should not have `pre-write-conversion'.
7460
7461    If CODING->src_object is a string, CODING should not have
7462    `pre-write-conversion'.
7463
7464    If CODING->dst_object is a buffer, the encoded data is inserted at
7465    the current point of that buffer.
7466
7467    If CODING->dst_object is nil, the encoded data is placed at the
7468    memory area specified by CODING->destination.  */
7469
7470 static void
7471 encode_coding (struct coding_system *coding)
7472 {
7473   Lisp_Object attrs;
7474   Lisp_Object translation_table;
7475   int max_lookup;
7476   struct ccl_spec cclspec;
7477
7478   USE_SAFE_ALLOCA;
7479
7480   attrs = CODING_ID_ATTRS (coding->id);
7481   if (coding->encoder == encode_coding_raw_text)
7482     translation_table = Qnil, max_lookup = 0;
7483   else
7484     translation_table = get_translation_table (attrs, 1, &max_lookup);
7485
7486   if (BUFFERP (coding->dst_object))
7487     {
7488       set_buffer_internal (XBUFFER (coding->dst_object));
7489       coding->dst_multibyte
7490         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7491     }
7492
7493   coding->consumed = coding->consumed_char = 0;
7494   coding->produced = coding->produced_char = 0;
7495   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7496   coding->errors = 0;
7497
7498   ALLOC_CONVERSION_WORK_AREA (coding);
7499
7500   if (coding->encoder == encode_coding_ccl)
7501     {
7502       coding->spec.ccl = &cclspec;
7503       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7504     }
7505   do {
7506     coding_set_source (coding);
7507     consume_chars (coding, translation_table, max_lookup);
7508     coding_set_destination (coding);
7509     (*(coding->encoder)) (coding);
7510   } while (coding->consumed_char < coding->src_chars);
7511
7512   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7513     insert_from_gap (coding->produced_char, coding->produced, 0);
7514
7515   SAFE_FREE ();
7516 }
7517
7518
7519 /* Name (or base name) of work buffer for code conversion.  */
7520 static Lisp_Object Vcode_conversion_workbuf_name;
7521
7522 /* A working buffer used by the top level conversion.  Once it is
7523    created, it is never destroyed.  It has the name
7524    Vcode_conversion_workbuf_name.  The other working buffers are
7525    destroyed after the use is finished, and their names are modified
7526    versions of Vcode_conversion_workbuf_name.  */
7527 static Lisp_Object Vcode_conversion_reused_workbuf;
7528
7529 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7530 static bool reused_workbuf_in_use;
7531
7532
7533 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7534    multibyteness of returning buffer.  */
7535
7536 static Lisp_Object
7537 make_conversion_work_buffer (bool multibyte)
7538 {
7539   Lisp_Object name, workbuf;
7540   struct buffer *current;
7541
7542   if (reused_workbuf_in_use)
7543     {
7544       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7545       workbuf = Fget_buffer_create (name);
7546     }
7547   else
7548     {
7549       reused_workbuf_in_use = 1;
7550       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7551         Vcode_conversion_reused_workbuf
7552           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7553       workbuf = Vcode_conversion_reused_workbuf;
7554     }
7555   current = current_buffer;
7556   set_buffer_internal (XBUFFER (workbuf));
7557   /* We can't allow modification hooks to run in the work buffer.  For
7558      instance, directory_files_internal assumes that file decoding
7559      doesn't compile new regexps.  */
7560   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7561   Ferase_buffer ();
7562   bset_undo_list (current_buffer, Qt);
7563   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7564   set_buffer_internal (current);
7565   return workbuf;
7566 }
7567
7568
7569 static Lisp_Object
7570 code_conversion_restore (Lisp_Object arg)
7571 {
7572   Lisp_Object current, workbuf;
7573   struct gcpro gcpro1;
7574
7575   GCPRO1 (arg);
7576   current = XCAR (arg);
7577   workbuf = XCDR (arg);
7578   if (! NILP (workbuf))
7579     {
7580       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7581         reused_workbuf_in_use = 0;
7582       else
7583         Fkill_buffer (workbuf);
7584     }
7585   set_buffer_internal (XBUFFER (current));
7586   UNGCPRO;
7587   return Qnil;
7588 }
7589
7590 Lisp_Object
7591 code_conversion_save (bool with_work_buf, bool multibyte)
7592 {
7593   Lisp_Object workbuf = Qnil;
7594
7595   if (with_work_buf)
7596     workbuf = make_conversion_work_buffer (multibyte);
7597   record_unwind_protect (code_conversion_restore,
7598                          Fcons (Fcurrent_buffer (), workbuf));
7599   return workbuf;
7600 }
7601
7602 void
7603 decode_coding_gap (struct coding_system *coding,
7604                    ptrdiff_t chars, ptrdiff_t bytes)
7605 {
7606   ptrdiff_t count = SPECPDL_INDEX ();
7607   Lisp_Object attrs;
7608
7609   coding->src_object = Fcurrent_buffer ();
7610   coding->src_chars = chars;
7611   coding->src_bytes = bytes;
7612   coding->src_pos = -chars;
7613   coding->src_pos_byte = -bytes;
7614   coding->src_multibyte = chars < bytes;
7615   coding->dst_object = coding->src_object;
7616   coding->dst_pos = PT;
7617   coding->dst_pos_byte = PT_BYTE;
7618   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7619
7620   if (CODING_REQUIRE_DETECTION (coding))
7621     detect_coding (coding);
7622   attrs = CODING_ID_ATTRS (coding->id);
7623   if (! disable_ascii_optimization)
7624     {
7625       if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7626           && NILP (CODING_ATTR_POST_READ (attrs))
7627           && NILP (get_translation_table (attrs, 0, NULL))
7628           && (coding->head_ascii >= 0 /* We've already called detect_coding */
7629               ? coding->head_ascii == bytes
7630               : detect_ascii (coding)))
7631         {
7632           if (coding->eol_seen == EOL_SEEN_CR)
7633             {
7634               unsigned char *src_end = GAP_END_ADDR;
7635               unsigned char *src = src_end - coding->src_bytes;
7636
7637               while (src < src_end)
7638                 {
7639                   if (*src++ == '\r')
7640                     src[-1] = '\n';
7641                 }
7642             }
7643           else if (coding->eol_seen == EOL_SEEN_CRLF)
7644             {
7645               unsigned char *src = GAP_END_ADDR;
7646               unsigned char *src_beg = src - coding->src_bytes;
7647               unsigned char *dst = src;
7648
7649               while (src_beg < src)
7650                 {
7651                   *--dst = *--src;
7652                   if (*src == '\n')
7653                     src--;
7654                 }
7655               bytes -= dst - src;
7656             }
7657           coding->produced_char = coding->produced = bytes;
7658           insert_from_gap (bytes, bytes, 1);
7659           return;
7660         }
7661     }
7662   code_conversion_save (0, 0);
7663
7664   coding->mode |= CODING_MODE_LAST_BLOCK;
7665   current_buffer->text->inhibit_shrinking = 1;
7666   decode_coding (coding);
7667   current_buffer->text->inhibit_shrinking = 0;
7668
7669   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7670     {
7671       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7672       Lisp_Object val;
7673
7674       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7675       val = call1 (CODING_ATTR_POST_READ (attrs),
7676                    make_number (coding->produced_char));
7677       CHECK_NATNUM (val);
7678       coding->produced_char += Z - prev_Z;
7679       coding->produced += Z_BYTE - prev_Z_BYTE;
7680     }
7681
7682   unbind_to (count, Qnil);
7683 }
7684
7685
7686 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7687    SRC_OBJECT into DST_OBJECT by coding context CODING.
7688
7689    SRC_OBJECT is a buffer, a string, or Qnil.
7690
7691    If it is a buffer, the text is at point of the buffer.  FROM and TO
7692    are positions in the buffer.
7693
7694    If it is a string, the text is at the beginning of the string.
7695    FROM and TO are indices to the string.
7696
7697    If it is nil, the text is at coding->source.  FROM and TO are
7698    indices to coding->source.
7699
7700    DST_OBJECT is a buffer, Qt, or Qnil.
7701
7702    If it is a buffer, the decoded text is inserted at point of the
7703    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7704    is deleted.
7705
7706    If it is Qt, a string is made from the decoded text, and
7707    set in CODING->dst_object.
7708
7709    If it is Qnil, the decoded text is stored at CODING->destination.
7710    The caller must allocate CODING->dst_bytes bytes at
7711    CODING->destination by xmalloc.  If the decoded text is longer than
7712    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7713  */
7714
7715 void
7716 decode_coding_object (struct coding_system *coding,
7717                       Lisp_Object src_object,
7718                       ptrdiff_t from, ptrdiff_t from_byte,
7719                       ptrdiff_t to, ptrdiff_t to_byte,
7720                       Lisp_Object dst_object)
7721 {
7722   ptrdiff_t count = SPECPDL_INDEX ();
7723   unsigned char *destination IF_LINT (= NULL);
7724   ptrdiff_t dst_bytes IF_LINT (= 0);
7725   ptrdiff_t chars = to - from;
7726   ptrdiff_t bytes = to_byte - from_byte;
7727   Lisp_Object attrs;
7728   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7729   bool need_marker_adjustment = 0;
7730   Lisp_Object old_deactivate_mark;
7731
7732   old_deactivate_mark = Vdeactivate_mark;
7733
7734   if (NILP (dst_object))
7735     {
7736       destination = coding->destination;
7737       dst_bytes = coding->dst_bytes;
7738     }
7739
7740   coding->src_object = src_object;
7741   coding->src_chars = chars;
7742   coding->src_bytes = bytes;
7743   coding->src_multibyte = chars < bytes;
7744
7745   if (STRINGP (src_object))
7746     {
7747       coding->src_pos = from;
7748       coding->src_pos_byte = from_byte;
7749     }
7750   else if (BUFFERP (src_object))
7751     {
7752       set_buffer_internal (XBUFFER (src_object));
7753       if (from != GPT)
7754         move_gap_both (from, from_byte);
7755       if (EQ (src_object, dst_object))
7756         {
7757           struct Lisp_Marker *tail;
7758
7759           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7760             {
7761               tail->need_adjustment
7762                 = tail->charpos == (tail->insertion_type ? from : to);
7763               need_marker_adjustment |= tail->need_adjustment;
7764             }
7765           saved_pt = PT, saved_pt_byte = PT_BYTE;
7766           TEMP_SET_PT_BOTH (from, from_byte);
7767           current_buffer->text->inhibit_shrinking = 1;
7768           del_range_both (from, from_byte, to, to_byte, 1);
7769           coding->src_pos = -chars;
7770           coding->src_pos_byte = -bytes;
7771         }
7772       else
7773         {
7774           coding->src_pos = from;
7775           coding->src_pos_byte = from_byte;
7776         }
7777     }
7778
7779   if (CODING_REQUIRE_DETECTION (coding))
7780     detect_coding (coding);
7781   attrs = CODING_ID_ATTRS (coding->id);
7782
7783   if (EQ (dst_object, Qt)
7784       || (! NILP (CODING_ATTR_POST_READ (attrs))
7785           && NILP (dst_object)))
7786     {
7787       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7788       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7789       coding->dst_pos = BEG;
7790       coding->dst_pos_byte = BEG_BYTE;
7791     }
7792   else if (BUFFERP (dst_object))
7793     {
7794       code_conversion_save (0, 0);
7795       coding->dst_object = dst_object;
7796       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7797       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7798       coding->dst_multibyte
7799         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7800     }
7801   else
7802     {
7803       code_conversion_save (0, 0);
7804       coding->dst_object = Qnil;
7805       /* Most callers presume this will return a multibyte result, and they
7806          won't use `binary' or `raw-text' anyway, so let's not worry about
7807          CODING_FOR_UNIBYTE.  */
7808       coding->dst_multibyte = 1;
7809     }
7810
7811   decode_coding (coding);
7812
7813   if (BUFFERP (coding->dst_object))
7814     set_buffer_internal (XBUFFER (coding->dst_object));
7815
7816   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7817     {
7818       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7819       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7820       Lisp_Object val;
7821
7822       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7823       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7824               old_deactivate_mark);
7825       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7826                         make_number (coding->produced_char));
7827       UNGCPRO;
7828       CHECK_NATNUM (val);
7829       coding->produced_char += Z - prev_Z;
7830       coding->produced += Z_BYTE - prev_Z_BYTE;
7831     }
7832
7833   if (EQ (dst_object, Qt))
7834     {
7835       coding->dst_object = Fbuffer_string ();
7836     }
7837   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7838     {
7839       set_buffer_internal (XBUFFER (coding->dst_object));
7840       if (dst_bytes < coding->produced)
7841         {
7842           eassert (coding->produced > 0);
7843           destination = xrealloc (destination, coding->produced);
7844           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7845             move_gap_both (BEGV, BEGV_BYTE);
7846           memcpy (destination, BEGV_ADDR, coding->produced);
7847           coding->destination = destination;
7848         }
7849     }
7850
7851   if (saved_pt >= 0)
7852     {
7853       /* This is the case of:
7854          (BUFFERP (src_object) && EQ (src_object, dst_object))
7855          As we have moved PT while replacing the original buffer
7856          contents, we must recover it now.  */
7857       set_buffer_internal (XBUFFER (src_object));
7858       current_buffer->text->inhibit_shrinking = 0;
7859       if (saved_pt < from)
7860         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7861       else if (saved_pt < from + chars)
7862         TEMP_SET_PT_BOTH (from, from_byte);
7863       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7864         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7865                           saved_pt_byte + (coding->produced - bytes));
7866       else
7867         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7868                           saved_pt_byte + (coding->produced - bytes));
7869
7870       if (need_marker_adjustment)
7871         {
7872           struct Lisp_Marker *tail;
7873
7874           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7875             if (tail->need_adjustment)
7876               {
7877                 tail->need_adjustment = 0;
7878                 if (tail->insertion_type)
7879                   {
7880                     tail->bytepos = from_byte;
7881                     tail->charpos = from;
7882                   }
7883                 else
7884                   {
7885                     tail->bytepos = from_byte + coding->produced;
7886                     tail->charpos
7887                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7888                          ? tail->bytepos : from + coding->produced_char);
7889                   }
7890               }
7891         }
7892     }
7893
7894   Vdeactivate_mark = old_deactivate_mark;
7895   unbind_to (count, coding->dst_object);
7896 }
7897
7898
7899 void
7900 encode_coding_object (struct coding_system *coding,
7901                       Lisp_Object src_object,
7902                       ptrdiff_t from, ptrdiff_t from_byte,
7903                       ptrdiff_t to, ptrdiff_t to_byte,
7904                       Lisp_Object dst_object)
7905 {
7906   ptrdiff_t count = SPECPDL_INDEX ();
7907   ptrdiff_t chars = to - from;
7908   ptrdiff_t bytes = to_byte - from_byte;
7909   Lisp_Object attrs;
7910   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7911   bool need_marker_adjustment = 0;
7912   bool kill_src_buffer = 0;
7913   Lisp_Object old_deactivate_mark;
7914
7915   old_deactivate_mark = Vdeactivate_mark;
7916
7917   coding->src_object = src_object;
7918   coding->src_chars = chars;
7919   coding->src_bytes = bytes;
7920   coding->src_multibyte = chars < bytes;
7921
7922   attrs = CODING_ID_ATTRS (coding->id);
7923
7924   if (EQ (src_object, dst_object))
7925     {
7926       struct Lisp_Marker *tail;
7927
7928       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7929         {
7930           tail->need_adjustment
7931             = tail->charpos == (tail->insertion_type ? from : to);
7932           need_marker_adjustment |= tail->need_adjustment;
7933         }
7934     }
7935
7936   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7937     {
7938       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7939       set_buffer_internal (XBUFFER (coding->src_object));
7940       if (STRINGP (src_object))
7941         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7942       else if (BUFFERP (src_object))
7943         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7944       else
7945         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7946
7947       if (EQ (src_object, dst_object))
7948         {
7949           set_buffer_internal (XBUFFER (src_object));
7950           saved_pt = PT, saved_pt_byte = PT_BYTE;
7951           del_range_both (from, from_byte, to, to_byte, 1);
7952           set_buffer_internal (XBUFFER (coding->src_object));
7953         }
7954
7955       {
7956         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7957
7958         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7959                 old_deactivate_mark);
7960         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7961                     make_number (BEG), make_number (Z));
7962         UNGCPRO;
7963       }
7964       if (XBUFFER (coding->src_object) != current_buffer)
7965         kill_src_buffer = 1;
7966       coding->src_object = Fcurrent_buffer ();
7967       if (BEG != GPT)
7968         move_gap_both (BEG, BEG_BYTE);
7969       coding->src_chars = Z - BEG;
7970       coding->src_bytes = Z_BYTE - BEG_BYTE;
7971       coding->src_pos = BEG;
7972       coding->src_pos_byte = BEG_BYTE;
7973       coding->src_multibyte = Z < Z_BYTE;
7974     }
7975   else if (STRINGP (src_object))
7976     {
7977       code_conversion_save (0, 0);
7978       coding->src_pos = from;
7979       coding->src_pos_byte = from_byte;
7980     }
7981   else if (BUFFERP (src_object))
7982     {
7983       code_conversion_save (0, 0);
7984       set_buffer_internal (XBUFFER (src_object));
7985       if (EQ (src_object, dst_object))
7986         {
7987           saved_pt = PT, saved_pt_byte = PT_BYTE;
7988           coding->src_object = del_range_1 (from, to, 1, 1);
7989           coding->src_pos = 0;
7990           coding->src_pos_byte = 0;
7991         }
7992       else
7993         {
7994           if (from < GPT && to >= GPT)
7995             move_gap_both (from, from_byte);
7996           coding->src_pos = from;
7997           coding->src_pos_byte = from_byte;
7998         }
7999     }
8000   else
8001     code_conversion_save (0, 0);
8002
8003   if (BUFFERP (dst_object))
8004     {
8005       coding->dst_object = dst_object;
8006       if (EQ (src_object, dst_object))
8007         {
8008           coding->dst_pos = from;
8009           coding->dst_pos_byte = from_byte;
8010         }
8011       else
8012         {
8013           struct buffer *current = current_buffer;
8014
8015           set_buffer_temp (XBUFFER (dst_object));
8016           coding->dst_pos = PT;
8017           coding->dst_pos_byte = PT_BYTE;
8018           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8019           set_buffer_temp (current);
8020         }
8021       coding->dst_multibyte
8022         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8023     }
8024   else if (EQ (dst_object, Qt))
8025     {
8026       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8027       coding->dst_object = Qnil;
8028       coding->destination = xmalloc (dst_bytes);
8029       coding->dst_bytes = dst_bytes;
8030       coding->dst_multibyte = 0;
8031     }
8032   else
8033     {
8034       coding->dst_object = Qnil;
8035       coding->dst_multibyte = 0;
8036     }
8037
8038   encode_coding (coding);
8039
8040   if (EQ (dst_object, Qt))
8041     {
8042       if (BUFFERP (coding->dst_object))
8043         coding->dst_object = Fbuffer_string ();
8044       else
8045         {
8046           coding->dst_object
8047             = make_unibyte_string ((char *) coding->destination,
8048                                    coding->produced);
8049           xfree (coding->destination);
8050         }
8051     }
8052
8053   if (saved_pt >= 0)
8054     {
8055       /* This is the case of:
8056          (BUFFERP (src_object) && EQ (src_object, dst_object))
8057          As we have moved PT while replacing the original buffer
8058          contents, we must recover it now.  */
8059       set_buffer_internal (XBUFFER (src_object));
8060       if (saved_pt < from)
8061         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8062       else if (saved_pt < from + chars)
8063         TEMP_SET_PT_BOTH (from, from_byte);
8064       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8065         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8066                           saved_pt_byte + (coding->produced - bytes));
8067       else
8068         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8069                           saved_pt_byte + (coding->produced - bytes));
8070
8071       if (need_marker_adjustment)
8072         {
8073           struct Lisp_Marker *tail;
8074
8075           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8076             if (tail->need_adjustment)
8077               {
8078                 tail->need_adjustment = 0;
8079                 if (tail->insertion_type)
8080                   {
8081                     tail->bytepos = from_byte;
8082                     tail->charpos = from;
8083                   }
8084                 else
8085                   {
8086                     tail->bytepos = from_byte + coding->produced;
8087                     tail->charpos
8088                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8089                          ? tail->bytepos : from + coding->produced_char);
8090                   }
8091               }
8092         }
8093     }
8094
8095   if (kill_src_buffer)
8096     Fkill_buffer (coding->src_object);
8097
8098   Vdeactivate_mark = old_deactivate_mark;
8099   unbind_to (count, Qnil);
8100 }
8101
8102
8103 Lisp_Object
8104 preferred_coding_system (void)
8105 {
8106   int id = coding_categories[coding_priorities[0]].id;
8107
8108   return CODING_ID_NAME (id);
8109 }
8110
8111 #if defined (WINDOWSNT) || defined (CYGWIN)
8112
8113 Lisp_Object
8114 from_unicode (Lisp_Object str)
8115 {
8116   CHECK_STRING (str);
8117   if (!STRING_MULTIBYTE (str) &&
8118       SBYTES (str) & 1)
8119     {
8120       str = Fsubstring (str, make_number (0), make_number (-1));
8121     }
8122
8123   return code_convert_string_norecord (str, Qutf_16le, 0);
8124 }
8125
8126 Lisp_Object
8127 from_unicode_buffer (const wchar_t* wstr)
8128 {
8129     return from_unicode (
8130         make_unibyte_string (
8131             (char*) wstr,
8132             /* we get one of the two final 0 bytes for free. */
8133             1 + sizeof (wchar_t) * wcslen (wstr)));
8134 }
8135
8136 wchar_t *
8137 to_unicode (Lisp_Object str, Lisp_Object *buf)
8138 {
8139   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8140   /* We need to make another copy (in addition to the one made by
8141      code_convert_string_norecord) to ensure that the final string is
8142      _doubly_ zero terminated --- that is, that the string is
8143      terminated by two zero bytes and one utf-16le null character.
8144      Because strings are already terminated with a single zero byte,
8145      we just add one additional zero. */
8146   str = make_uninit_string (SBYTES (*buf) + 1);
8147   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8148   SDATA (str) [SBYTES (*buf)] = '\0';
8149   *buf = str;
8150   return WCSDATA (*buf);
8151 }
8152
8153 #endif /* WINDOWSNT || CYGWIN */
8154
8155 \f
8156 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
8158
8159 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8160        doc: /* Return t if OBJECT is nil or a coding-system.
8161 See the documentation of `define-coding-system' for information
8162 about coding-system objects.  */)
8163   (Lisp_Object object)
8164 {
8165   if (NILP (object)
8166       || CODING_SYSTEM_ID (object) >= 0)
8167     return Qt;
8168   if (! SYMBOLP (object)
8169       || NILP (Fget (object, Qcoding_system_define_form)))
8170     return Qnil;
8171   return Qt;
8172 }
8173
8174 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8175        Sread_non_nil_coding_system, 1, 1, 0,
8176        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8177   (Lisp_Object prompt)
8178 {
8179   Lisp_Object val;
8180   do
8181     {
8182       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8183                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8184     }
8185   while (SCHARS (val) == 0);
8186   return (Fintern (val, Qnil));
8187 }
8188
8189 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8190        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8191 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8192 Ignores case when completing coding systems (all Emacs coding systems
8193 are lower-case).  */)
8194   (Lisp_Object prompt, Lisp_Object default_coding_system)
8195 {
8196   Lisp_Object val;
8197   ptrdiff_t count = SPECPDL_INDEX ();
8198
8199   if (SYMBOLP (default_coding_system))
8200     default_coding_system = SYMBOL_NAME (default_coding_system);
8201   specbind (Qcompletion_ignore_case, Qt);
8202   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8203                           Qt, Qnil, Qcoding_system_history,
8204                           default_coding_system, Qnil);
8205   unbind_to (count, Qnil);
8206   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8207 }
8208
8209 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8210        1, 1, 0,
8211        doc: /* Check validity of CODING-SYSTEM.
8212 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8213 It is valid if it is nil or a symbol defined as a coding system by the
8214 function `define-coding-system'.  */)
8215   (Lisp_Object coding_system)
8216 {
8217   Lisp_Object define_form;
8218
8219   define_form = Fget (coding_system, Qcoding_system_define_form);
8220   if (! NILP (define_form))
8221     {
8222       Fput (coding_system, Qcoding_system_define_form, Qnil);
8223       safe_eval (define_form);
8224     }
8225   if (!NILP (Fcoding_system_p (coding_system)))
8226     return coding_system;
8227   xsignal1 (Qcoding_system_error, coding_system);
8228 }
8229
8230 \f
/* Detect how the bytes at SRC of length SRC_BYTES are encoded.
   SRC_CHARS is recorded as the character count of that source.  If
   HIGHEST, return the coding system of the highest priority among
   the detected coding systems (or nil if nothing was detected).
   Otherwise return a list of detected coding systems sorted by their
   priorities.  If MULTIBYTEP, it is assumed that the bytes are in
   correct multibyte form but contain only ASCII and eight-bit chars.
   Otherwise, the bytes are raw bytes.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.  */

Lisp_Object
detect_coding_system (const unsigned char *src,
                      ptrdiff_t src_chars, ptrdiff_t src_bytes,
                      bool highest, bool multibytep,
                      Lisp_Object coding_system)
{
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  /* VAL first collects coding system ids; the eol pass at the end
     replaces each id by a coding system name.  */
  Lisp_Object val = Qnil;
  struct coding_system coding;
  ptrdiff_t id;
  struct coding_detection_info detect_info;
  enum coding_category base_category;
  bool null_byte_found = 0, eight_bit_found = 0;

  /* A nil CODING_SYSTEM is treated as `undecided'.  */
  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  /* Describe the source to the detector functions.  */
  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  coding.head_ascii = 0;

  detect_info.checked = detect_info.found = detect_info.rejected = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XINT (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      enum coding_category category IF_LINT (= 0);
      struct coding_system *this IF_LINT (= NULL);
      int c, i;

      /* Skip all ASCII bytes except for a few ISO2022 controls.
         coding.head_ascii counts the bytes seen before the first
         eight-bit byte.  */
      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              /* An eight-bit byte.  Stop scanning once a null byte
                 has been seen as well.  */
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* ESC, SI and SO trigger ISO-2022 detection, at most
                 once (only while nothing has been checked yet) and
                 only if that detection is not inhibited.  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (&coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conforms to
                             ISO-2022.  */
                          src = src_end;
                          coding.head_ascii = src - coding.source;
                        }
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  /* A null byte.  Stop scanning once an eight-bit
                     byte has been seen as well.  */
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding.head_ascii++;
            }
          else if (! eight_bit_found)
            coding.head_ascii++;
        }

      /* Run the per-category detectors only if the scan above saw
         something other than plain 7-bit text, or ISO-2022 detection
         already found a candidate.  */
      if (null_byte_found || eight_bit_found
          || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
          if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* Mark every category other than the UTF-16 ones
                     as already checked and rejected.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Visit the categories in priority order.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;

                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already examined; with HIGHEST the first
                         found category ends the search.  */
                      if (highest
                          && (detect_info.found & (1 << category)))
                        break;
                    }
                  else if ((*(this->detector)) (&coding, &detect_info)
                           && highest
                           && (detect_info.found & (1 << category)))
                    {
                      /* For utf-16-auto, narrow CATEGORY to the
                         little- or big-endian variant.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }
        }

      /* Turn the detection masks into the list VAL of coding system
         ids.  */
      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
          || null_byte_found)
        {
          /* Everything was rejected, or a null byte was seen: the
             answer is no-conversion.  */
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = CODING_SYSTEM_ID (Qno_conversion);
          val = Fcons (make_number (id), Qnil);
        }
      else if (! detect_info.rejected && ! detect_info.found)
        {
          /* Nothing rejected and nothing found: report the coding
             system of the undecided category.  */
          detect_info.found = CATEGORY_MASK_ANY;
          id = coding_categories[coding_category_undecided].id;
          val = Fcons (make_number (id), Qnil);
        }
      else if (highest)
        {
          if (detect_info.found)
            {
              /* CATEGORY and THIS were left by the loops above.  */
              detect_info.found = 1 << category;
              val = Fcons (make_number (this->id), Qnil);
            }
          else
            /* Nothing was positively found; take the first category
               in priority order that was not rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  detect_info.found = 1 << coding_priorities[i];
                  id = coding_categories[coding_priorities[i]].id;
                  val = Fcons (make_number (id), Qnil);
                  break;
                }
        }
      else
        {
          int mask = detect_info.rejected | detect_info.found;
          int found = 0;

          /* Both loops walk the priorities backwards and push onto
             the front of VAL, so VAL ends up in priority order with
             the found categories ahead of those that were neither
             found nor rejected.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (! (mask & (1 << category)))
                {
                  found |= 1 << category;
                  id = coding_categories[category].id;
                  if (id >= 0)
                    val = Fcons (make_number (id), val);
                }
            }
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (detect_info.found & (1 << category))
                {
                  id = coding_categories[category].id;
                  val = Fcons (make_number (id), val);
                }
            }
          detect_info.found |= found;
        }
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only decide between UTF-8 with and without signature.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            this = coding_categories + coding_category_utf_8_sig;
          else
            this = coding_categories + coding_category_utf_8_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Only decide among the UTF-16 variants.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            this = coding_categories + coding_category_utf_16_le;
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            this = coding_categories + coding_category_utf_16_be;
          else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
            this = coding_categories + coding_category_utf_16_be_nosig;
          else
            this = coding_categories + coding_category_utf_16_le_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else
    {
      /* The text-format is already specified; just report it.  */
      detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
      val = Fcons (make_number (coding.id), Qnil);
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* -1 means that kind of eol was not examined.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    if (VECTORP (eol_type))
      {
        /* The eol-format is not fixed by CODING_SYSTEM; run
           detect_eol for each group of found categories.  */
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
          {
            if (null_byte_found)
              normal_eol = EOL_SEEN_LF;
            else
              normal_eol = detect_eol (coding.source, src_bytes,
                                       coding_category_raw_text);
          }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_be);
        if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG))
          utf_16_le_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_le);
      }
    else
      {
        /* The eol-format is fixed by CODING_SYSTEM; use it for all
           three groups.  */
        if (EQ (eol_type, Qunix))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
        else if (EQ (eol_type, Qdos))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
        else
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each id in VAL, in place, by a coding system name:
       element 0, 1 or 2 of the eol-type vector for LF, CRLF or CR
       when one of those was seen, otherwise the plain name of the
       id.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
        enum coding_category category;
        int this_eol;

        id = XINT (XCAR (tail));
        attrs = CODING_ID_ATTRS (id);
        category = XINT (CODING_ATTR_CATEGORY (attrs));
        eol_type = CODING_ID_EOL_TYPE (id);
        if (VECTORP (eol_type))
          {
            if (category == coding_category_utf_16_be
                || category == coding_category_utf_16_be_nosig)
              this_eol = utf_16_be_eol;
            else if (category == coding_category_utf_16_le
                     || category == coding_category_utf_16_le_nosig)
              this_eol = utf_16_le_eol;
            else
              this_eol = normal_eol;

            if (this_eol == EOL_SEEN_LF)
              XSETCAR (tail, AREF (eol_type, 0));
            else if (this_eol == EOL_SEEN_CRLF)
              XSETCAR (tail, AREF (eol_type, 1));
            else if (this_eol == EOL_SEEN_CR)
              XSETCAR (tail, AREF (eol_type, 2));
            else
              XSETCAR (tail, CODING_ID_NAME (id));
          }
        else
          XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  /* With HIGHEST return just the first element (or nil), otherwise
     the whole list.  */
  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
8551
8552
8553 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8554        2, 3, 0,
8555        doc: /* Detect coding system of the text in the region between START and END.
8556 Return a list of possible coding systems ordered by priority.
8557 The coding systems to try and their priorities follows what
8558 the function `coding-system-priority-list' (which see) returns.
8559
8560 If only ASCII characters are found (except for such ISO-2022 control
8561 characters as ESC), it returns a list of single element `undecided'
8562 or its subsidiary coding system according to a detected end-of-line
8563 format.
8564
8565 If optional argument HIGHEST is non-nil, return the coding system of
8566 highest priority.  */)
8567   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8568 {
8569   ptrdiff_t from, to;
8570   ptrdiff_t from_byte, to_byte;
8571
8572   validate_region (&start, &end);
8573   from = XINT (start), to = XINT (end);
8574   from_byte = CHAR_TO_BYTE (from);
8575   to_byte = CHAR_TO_BYTE (to);
8576
8577   if (from < GPT && to >= GPT)
8578     move_gap_both (to, to_byte);
8579
8580   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8581                                to - from, to_byte - from_byte,
8582                                !NILP (highest),
8583                                !NILP (BVAR (current_buffer
8584                                       , enable_multibyte_characters)),
8585                                Qnil);
8586 }
8587
8588 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8589        1, 2, 0,
8590        doc: /* Detect coding system of the text in STRING.
8591 Return a list of possible coding systems ordered by priority.
8592 The coding systems to try and their priorities follows what
8593 the function `coding-system-priority-list' (which see) returns.
8594
8595 If only ASCII characters are found (except for such ISO-2022 control
8596 characters as ESC), it returns a list of single element `undecided'
8597 or its subsidiary coding system according to a detected end-of-line
8598 format.
8599
8600 If optional argument HIGHEST is non-nil, return the coding system of
8601 highest priority.  */)
8602   (Lisp_Object string, Lisp_Object highest)
8603 {
8604   CHECK_STRING (string);
8605
8606   return detect_coding_system (SDATA (string),
8607                                SCHARS (string), SBYTES (string),
8608                                !NILP (highest), STRING_MULTIBYTE (string),
8609                                Qnil);
8610 }
8611
8612
8613 static bool
8614 char_encodable_p (int c, Lisp_Object attrs)
8615 {
8616   Lisp_Object tail;
8617   struct charset *charset;
8618   Lisp_Object translation_table;
8619
8620   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8621   if (! NILP (translation_table))
8622     c = translate_char (translation_table, c);
8623   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8624        CONSP (tail); tail = XCDR (tail))
8625     {
8626       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8627       if (CHAR_CHARSET_P (c, charset))
8628         break;
8629     }
8630   return (! NILP (tail));
8631 }
8632
8633
/* Return a list of coding systems that safely encode the text between
   START and END.  START may instead be a string, in which case the
   whole string is examined and END is not consulted.  If EXCLUDE is
   non-nil, it is a list of coding systems not to check.  The returned
   list doesn't contain any such coding systems.  In any case, if the
   text contains only ASCII or is unibyte, return t.  Otherwise the
   returned list always ends with `raw-text' and `no-conversion'.  */

DEFUN ("find-coding-systems-region-internal",
       Ffind_coding_systems_region_internal,
       Sfind_coding_systems_region_internal, 2, 3, 0,
       doc: /* Internal use only.  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
{
  Lisp_Object coding_attrs_list, safe_codings;
  ptrdiff_t start_byte, end_byte;
  const unsigned char *p, *pbeg, *pend;
  int c;
  Lisp_Object tail, elt, work_table;

  if (STRINGP (start))
    {
      /* A unibyte string, or a multibyte one holding as many bytes
         as characters, needs no check.  */
      if (!STRING_MULTIBYTE (start)
          || SCHARS (start) == SBYTES (start))
        return Qt;
      start_byte = 0;
      end_byte = SBYTES (start);
    }
  else
    {
      CHECK_NUMBER_COERCE_MARKER (start);
      CHECK_NUMBER_COERCE_MARKER (end);
      if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
        args_out_of_range (start, end);
      if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
        return Qt;
      start_byte = CHAR_TO_BYTE (XINT (start));
      end_byte = CHAR_TO_BYTE (XINT (end));
      /* As many bytes as characters: nothing to check.  */
      if (XINT (end) - XINT (start) == end_byte - start_byte)
        return Qt;

      /* The region is scanned below through plain pointers, so when
         the gap lies strictly inside it, move the gap to whichever
         end of the region is nearer.  */
      if (XINT (start) < GPT && XINT (end) > GPT)
        {
          if ((GPT - XINT (start)) < (XINT (end) - GPT))
            move_gap_both (XINT (start), start_byte);
          else
            move_gap_both (XINT (end), end_byte);
        }
    }

  /* Collect the attribute vectors of the candidates: every member of
     Vcoding_system_list that is not in EXCLUDE, is its own base name,
     and is not of type `undecided'.  The translation table obtained
     from get_translation_table is stored into each vector.  */
  coding_attrs_list = Qnil;
  for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
    if (NILP (exclude)
        || NILP (Fmemq (XCAR (tail), exclude)))
      {
        Lisp_Object attrs;

        attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
        if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
            && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
          {
            ASET (attrs, coding_attr_trans_tbl,
                  get_translation_table (attrs, 1, NULL));
            coding_attrs_list = Fcons (attrs, coding_attrs_list);
          }
      }

  if (STRINGP (start))
    p = pbeg = SDATA (start);
  else
    p = pbeg = BYTE_POS_ADDR (start_byte);
  pend = p + (end_byte - start_byte);

  /* Trim the ASCII bytes at both ends of the text.  */
  while (p < pend && ASCII_BYTE_P (*p)) p++;
  while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;

  /* WORK_TABLE remembers the characters that were already tested.  */
  work_table = Fmake_char_table (Qnil, Qnil);
  while (p < pend)
    {
      if (ASCII_BYTE_P (*p))
        p++;
      else
        {
          c = STRING_CHAR_ADVANCE (p);
          if (!NILP (char_table_ref (work_table, c)))
            /* This character was already checked.  Ignore it.  */
            continue;

          charset_map_loaded = 0;
          /* Drop from CODING_ATTRS_LIST every candidate that cannot
             encode C.  The list is edited destructively: a failing
             element is overwritten by its successor, or, if it is the
             last cons, its car is set to nil.  */
          for (tail = coding_attrs_list; CONSP (tail);)
            {
              elt = XCAR (tail);
              if (NILP (elt))
                tail = XCDR (tail);
              else if (char_encodable_p (c, elt))
                tail = XCDR (tail);
              else if (CONSP (XCDR (tail)))
                {
                  XSETCAR (tail, XCAR (XCDR (tail)));
                  XSETCDR (tail, XCDR (XCDR (tail)));
                }
              else
                {
                  XSETCAR (tail, Qnil);
                  tail = XCDR (tail);
                }
            }
          if (charset_map_loaded)
            {
              /* charset_map_loaded became nonzero during the checks
                 above; recompute the scan pointers from their offsets
                 in case the text was relocated.  */
              ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;

              if (STRINGP (start))
                pbeg = SDATA (start);
              else
                pbeg = BYTE_POS_ADDR (start_byte);
              p = pbeg + p_offset;
              pend = pbeg + pend_offset;
            }
          char_table_set (work_table, c, Qt);
        }
    }

  /* The result is the base name of every surviving candidate,
     followed by `raw-text' and `no-conversion'.  */
  safe_codings = list2 (Qraw_text, Qno_conversion);
  for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
    if (! NILP (XCAR (tail)))
      safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);

  return safe_codings;
}
8761
8762
8763 DEFUN ("unencodable-char-position", Funencodable_char_position,
8764        Sunencodable_char_position, 3, 5, 0,
8765        doc: /*
8766 Return position of first un-encodable character in a region.
8767 START and END specify the region and CODING-SYSTEM specifies the
8768 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8769
8770 If optional 4th argument COUNT is non-nil, it specifies at most how
8771 many un-encodable characters to search.  In this case, the value is a
8772 list of positions.
8773
8774 If optional 5th argument STRING is non-nil, it is a string to search
8775 for un-encodable characters.  In that case, START and END are indexes
8776 to the string.  */)
8777   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8778 {
8779   EMACS_INT n;
8780   struct coding_system coding;
8781   Lisp_Object attrs, charset_list, translation_table;
8782   Lisp_Object positions;
8783   ptrdiff_t from, to;
8784   const unsigned char *p, *stop, *pend;
8785   bool ascii_compatible;
8786
8787   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8788   attrs = CODING_ID_ATTRS (coding.id);
8789   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8790     return Qnil;
8791   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8792   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8793   translation_table = get_translation_table (attrs, 1, NULL);
8794
8795   if (NILP (string))
8796     {
8797       validate_region (&start, &end);
8798       from = XINT (start);
8799       to = XINT (end);
8800       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8801           || (ascii_compatible
8802               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8803         return Qnil;
8804       p = CHAR_POS_ADDR (from);
8805       pend = CHAR_POS_ADDR (to);
8806       if (from < GPT && to >= GPT)
8807         stop = GPT_ADDR;
8808       else
8809         stop = pend;
8810     }
8811   else
8812     {
8813       CHECK_STRING (string);
8814       CHECK_NATNUM (start);
8815       CHECK_NATNUM (end);
8816       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8817         args_out_of_range_3 (string, start, end);
8818       from = XINT (start);
8819       to = XINT (end);
8820       if (! STRING_MULTIBYTE (string))
8821         return Qnil;
8822       p = SDATA (string) + string_char_to_byte (string, from);
8823       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8824       if (ascii_compatible && (to - from) == (pend - p))
8825         return Qnil;
8826     }
8827
8828   if (NILP (count))
8829     n = 1;
8830   else
8831     {
8832       CHECK_NATNUM (count);
8833       n = XINT (count);
8834     }
8835
8836   positions = Qnil;
8837   charset_map_loaded = 0;
8838   while (1)
8839     {
8840       int c;
8841
8842       if (ascii_compatible)
8843         while (p < stop && ASCII_BYTE_P (*p))
8844           p++, from++;
8845       if (p >= stop)
8846         {
8847           if (p >= pend)
8848             break;
8849           stop = pend;
8850           p = GAP_END_ADDR;
8851         }
8852
8853       c = STRING_CHAR_ADVANCE (p);
8854       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8855           && ! char_charset (translate_char (translation_table, c),
8856                              charset_list, NULL))
8857         {
8858           positions = Fcons (make_number (from), positions);
8859           n--;
8860           if (n == 0)
8861             break;
8862         }
8863
8864       from++;
8865       if (charset_map_loaded && NILP (string))
8866         {
8867           p = CHAR_POS_ADDR (from);
8868           pend = CHAR_POS_ADDR (to);
8869           if (from < GPT && to >= GPT)
8870             stop = GPT_ADDR;
8871           else
8872             stop = pend;
8873           charset_map_loaded = 0;
8874         }
8875     }
8876
8877   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8878 }
8879
8880
8881 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8882        Scheck_coding_systems_region, 3, 3, 0,
8883        doc: /* Check if the region is encodable by coding systems.
8884
8885 START and END are buffer positions specifying the region.
8886 CODING-SYSTEM-LIST is a list of coding systems to check.
8887
8888 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where each
8889 CODING-SYSTEM is a member of CODING-SYSTEM-LIST that can't encode the
8890 whole region, and POS0, POS1, ... are the buffer positions where
8891 non-encodable characters are found.
8892
8893 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8894 value is nil.
8895
8896 START may be a string.  In that case, check if the string is
8897 encodable, and the value contains indices to the string instead of
8898 buffer positions.  END is ignored.
8899
8900 If the current buffer (or START if it is a string) is unibyte, the value
8901 is nil.  */)
8902   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8903 {
8904   Lisp_Object list;
8905   ptrdiff_t start_byte, end_byte;
8906   ptrdiff_t pos;
8907   const unsigned char *p, *pbeg, *pend;
8908   int c;
8909   Lisp_Object tail, elt, attrs;
8910
8911   if (STRINGP (start))
8912     {
8913       if (!STRING_MULTIBYTE (start)
8914           || SCHARS (start) == SBYTES (start))  /* Only one-byte characters.  */
8915         return Qnil;
8916       start_byte = 0;
8917       end_byte = SBYTES (start);
8918       pos = 0;
8919     }
8920   else
8921     {
8922       CHECK_NUMBER_COERCE_MARKER (start);
8923       CHECK_NUMBER_COERCE_MARKER (end);
8924       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8925         args_out_of_range (start, end);
8926       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8927         return Qnil;
8928       start_byte = CHAR_TO_BYTE (XINT (start));
8929       end_byte = CHAR_TO_BYTE (XINT (end));
8930       if (XINT (end) - XINT (start) == end_byte - start_byte)  /* Only one-byte characters.  */
8931         return Qnil;
8932       /* The region straddles the gap: move the gap to its nearer end.  */
8933       if (XINT (start) < GPT && XINT (end) > GPT)
8934         {
8935           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8936             move_gap_both (XINT (start), start_byte);
8937           else
8938             move_gap_both (XINT (end), end_byte);
8939         }
8940       pos = XINT (start);
8941     }
8942   /* Build LIST with one (CODING-SYSTEM ATTRS) entry per list element.  */
8943   list = Qnil;
8944   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8945     {
8946       elt = XCAR (tail);
8947       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);  /* NOTE(review): ELT is not validated first -- confirm this is safe.  */
8948       ASET (attrs, coding_attr_trans_tbl,
8949             get_translation_table (attrs, 1, NULL));  /* Stored into ATTRS itself.  */
8950       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8951     }
8952
8953   if (STRINGP (start))
8954     p = pbeg = SDATA (start);
8955   else
8956     p = pbeg = BYTE_POS_ADDR (start_byte);
8957   pend = p + (end_byte - start_byte);
8958   /* Trim ASCII bytes from both ends; POS follows P at the front.  */
8959   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8960   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8961
8962   while (p < pend)
8963     {
8964       if (ASCII_BYTE_P (*p))
8965         p++;                    /* ASCII bytes are skipped without being checked.  */
8966       else
8967         {
8968           c = STRING_CHAR_ADVANCE (p);
8969           /* Test C against every coding system, recording POS on failure.  */
8970           charset_map_loaded = 0;
8971           for (tail = list; CONSP (tail); tail = XCDR (tail))
8972             {
8973               elt = XCDR (XCAR (tail));
8974               if (! char_encodable_p (c, XCAR (elt)))
8975                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8976             }
8977           if (charset_map_loaded)
8978             {
8979               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8980               /* Re-derive the pointers from offsets (the text has presumably moved).  */
8981               if (STRINGP (start))
8982                 pbeg = SDATA (start);
8983               else
8984                 pbeg = BYTE_POS_ADDR (start_byte);
8985               p = pbeg + p_offset;
8986               pend = pbeg + pend_offset;
8987             }
8988         }
8989       pos++;
8990     }
8991   /* Keep only entries that recorded positions; consing again restores list order.  */
8992   tail = list;
8993   list = Qnil;
8994   for (; CONSP (tail); tail = XCDR (tail))
8995     {
8996       elt = XCAR (tail);
8997       if (CONSP (XCDR (XCDR (elt))))
8998         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),  /* Positions ascending.  */
8999                       list);
9000     }
9001
9002   return list;
9003 }
9004
9005 /* Encode (if ENCODEP) or decode the region START..END of the current buffer.  */
9006 static Lisp_Object
9007 code_convert_region (Lisp_Object start, Lisp_Object end,
9008                      Lisp_Object coding_system, Lisp_Object dst_object,
9009                      bool encodep, bool norecord)
9010 {
9011   struct coding_system coding;
9012   ptrdiff_t from, from_byte, to, to_byte;
9013   Lisp_Object src_object;
9014   /* A nil CODING_SYSTEM is replaced by Qno_conversion.  */
9015   if (NILP (coding_system))
9016     coding_system = Qno_conversion;
9017   else
9018     CHECK_CODING_SYSTEM (coding_system);
9019   src_object = Fcurrent_buffer ();      /* The source is always the current buffer.  */
9020   if (NILP (dst_object))
9021     dst_object = src_object;            /* nil destination: the source buffer itself.  */
9022   else if (! EQ (dst_object, Qt))
9023     CHECK_BUFFER (dst_object);          /* Otherwise it must be t or a buffer.  */
9024
9025   validate_region (&start, &end);
9026   from = XFASTINT (start);
9027   from_byte = CHAR_TO_BYTE (from);
9028   to = XFASTINT (end);
9029   to_byte = CHAR_TO_BYTE (to);
9030
9031   setup_coding_system (coding_system, &coding);
9032   coding.mode |= CODING_MODE_LAST_BLOCK;
9033
9034   if (encodep)
9035     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9036                           dst_object);
9037   else
9038     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9039                           dst_object);
9040   if (! norecord)
9041     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9042   /* Buffer destination: return the number of characters produced.  */
9043   return (BUFFERP (dst_object)
9044           ? make_number (coding.produced_char)
9045           : coding.dst_object);
9046 }
9047
9048
9049 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9050        3, 4, "r\nzCoding system: ",
9051        doc: /* Decode the current region from the specified coding system.
9052 When called from a program, takes four arguments:
9053         START, END, CODING-SYSTEM, and DESTINATION.
9054 START and END are buffer positions.
9055
9056 Optional 4th argument DESTINATION specifies where the decoded text goes.
9057 If nil, the region between START and END is replaced by the decoded text.
9058 If a buffer, the decoded text is inserted in that buffer after point (point
9059 does not move).
9060 In those cases, the length of the decoded text is returned.
9061 If DESTINATION is t, the decoded text is returned.
9062
9063 This function sets `last-coding-system-used' to the precise coding system
9064 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9065 not fully specified.)  */)
9066   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9067 {
9068   return code_convert_region (start, end, coding_system, destination, 0, 0);  /* encodep = 0, norecord = 0 */
9069 }
9070
9071 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9072        3, 4, "r\nzCoding system: ",
9073        doc: /* Encode the current region by specified coding system.
9074 When called from a program, takes four arguments:
9075         START, END, CODING-SYSTEM and DESTINATION.
9076 START and END are buffer positions.
9077
9078 Optional 4th argument DESTINATION specifies where the encoded text goes.
9079 If nil, the region between START and END is replaced by the encoded text.
9080 If a buffer, the encoded text is inserted in that buffer after point (point
9081 does not move).
9082 In those cases, the length of the encoded text is returned.
9083 If DESTINATION is t, the encoded text is returned.
9084
9085 This function sets `last-coding-system-used' to the precise coding system
9086 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9087 not fully specified.)  */)
9088   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9089 {
9090   return code_convert_region (start, end, coding_system, destination, 1, 0);  /* encodep = 1, norecord = 0 */
9091 }
9092 /* Encode (if ENCODEP) or decode STRING by CODING_SYSTEM into DST_OBJECT (nil, t or a buffer).  */
9093 Lisp_Object
9094 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9095                      Lisp_Object dst_object, bool encodep, bool nocopy,
9096                      bool norecord)
9097 {
9098   struct coding_system coding;
9099   ptrdiff_t chars, bytes;
9100
9101   CHECK_STRING (string);
9102   if (NILP (coding_system))
9103     {
9104       if (! norecord)
9105         Vlast_coding_system_used = Qno_conversion;
9106       if (NILP (dst_object))    /* Nothing to convert and no buffer to insert into.  */
9107         return (nocopy ? string : Fcopy_sequence (string));  /* Copy unless NOCOPY allows STRING itself.  */
9108     }
9109   /* From here on a nil CODING_SYSTEM is handled as Qno_conversion.  */
9110   if (NILP (coding_system))
9111     coding_system = Qno_conversion;
9112   else
9113     CHECK_CODING_SYSTEM (coding_system);
9114   if (NILP (dst_object))
9115     dst_object = Qt;            /* A nil destination is treated like t.  */
9116   else if (! EQ (dst_object, Qt))
9117     CHECK_BUFFER (dst_object);  /* Otherwise it must be a buffer.  */
9118
9119   setup_coding_system (coding_system, &coding);
9120   coding.mode |= CODING_MODE_LAST_BLOCK;
9121   chars = SCHARS (string);
9122   bytes = SBYTES (string);
9123   if (encodep)
9124     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9125   else
9126     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9127   if (! norecord)
9128     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9129   /* Buffer destination: return the number of characters produced.  */
9130   return (BUFFERP (dst_object)
9131           ? make_number (coding.produced_char)
9132           : coding.dst_object);
9133 }
9134
9135
9136 /* Encode (if ENCODEP) or decode STRING according to CODING_SYSTEM.
9137    Do not set Vlast_coding_system_used.
9138
9139    This function is called only from macros DECODE_FILE and
9140    ENCODE_FILE, thus we ignore character composition.  */
9141
9142 Lisp_Object
9143 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9144                               bool encodep)
9145 {
9146   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);  /* dst_object = Qt, nocopy = 0, norecord = 1 */
9147 }
9148
9149
9150 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9151        2, 4, 0,
9152        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9153
9154 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9155 if the decoding operation is trivial.
9156
9157 Optional fourth arg BUFFER non-nil means that the decoded text is
9158 inserted in that buffer after point (point does not move).  In this
9159 case, the return value is the length of the decoded text.
9160
9161 This function sets `last-coding-system-used' to the precise coding system
9162 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9163 not fully specified.)  */)
9164   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9165 {
9166   return code_convert_string (string, coding_system, buffer,
9167                               0, ! NILP (nocopy), 0);  /* encodep = 0, norecord = 0 */
9168 }
9169
9170 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9171        2, 4, 0,
9172        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9173
9174 Optional third arg NOCOPY non-nil means it is OK to return STRING
9175 itself if the encoding operation is trivial.
9176
9177 Optional fourth arg BUFFER non-nil means that the encoded text is
9178 inserted in that buffer after point (point does not move).  In this
9179 case, the return value is the length of the encoded text.
9180
9181 This function sets `last-coding-system-used' to the precise coding system
9182 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9183 not fully specified.)  */)
9184   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9185 {
9186   return code_convert_string (string, coding_system, buffer,
9187                               1, ! NILP (nocopy), 0);  /* encodep = 1, norecord = 0 */
9188 }
9189
9190 \f
9191 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9192        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9193 Return the corresponding character.  Signal an error if CODE is invalid.  */)
9194   (Lisp_Object code)
9195 {
9196   Lisp_Object spec, attrs, val;
9197   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9198   EMACS_INT ch;
9199   int c;
9200
9201   CHECK_NATNUM (code);
9202   ch = XFASTINT (code);
9203   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9204   attrs = AREF (spec, 0);
9205   /* With an ASCII-compatible coding system, ASCII codes map to themselves.  */
9206   if (ASCII_BYTE_P (ch)
9207       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9208     return code;
9209   /* The first three charsets of the list are taken as roman, kana, kanji.  */
9210   val = CODING_ATTR_CHARSET_LIST (attrs);
9211   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9212   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9213   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9214
9215   if (ch <= 0x7F)
9216     {
9217       c = ch;
9218       charset = charset_roman;
9219     }
9220   else if (ch >= 0xA0 && ch <= 0xDF)    /* One-byte kana; 0xDF is the last such code.  */
9221     {
9222       c = ch - 0x80;
9223       charset = charset_kana;
9224     }
9225   else
9226     {
9227       EMACS_INT c1 = ch >> 8;           /* Lead byte (and anything above it).  */
9228       int c2 = ch & 0xFF;               /* Trail byte.  */
9229
9230       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9231           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9232         error ("Invalid code: %"pI"d", ch);
9233       c = ch;
9234       SJIS_TO_JIS (c);
9235       charset = charset_kanji;
9236     }
9237   c = DECODE_CHAR (charset, c);
9238   if (c < 0)                            /* The charset has no character for this code.  */
9239     error ("Invalid code: %"pI"d", ch);
9240   return make_number (c);
9241 }
9242
9243
9244 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9245        doc: /* Encode a Japanese character CH to shift_jis encoding.
9246 Return the corresponding code in SJIS.  Signal an error if CH can't be encoded.  */)
9247   (Lisp_Object ch)
9248 {
9249   Lisp_Object spec, attrs, charset_list;
9250   int c;
9251   struct charset *charset;
9252   unsigned code;
9253
9254   CHECK_CHARACTER (ch);
9255   c = XFASTINT (ch);
9256   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9257   attrs = AREF (spec, 0);
9258   /* With an ASCII-compatible coding system, ASCII characters map to themselves.  */
9259   if (ASCII_CHAR_P (c)
9260       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9261     return ch;
9262   /* A null result from char_charset means no charset in the list contains C.  */
9263   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9264   charset = char_charset (c, charset_list, &code);
9265   if (! charset || code == CHARSET_INVALID_CODE (charset))  /* Test the pointer before using it.  */
9266     error ("Can't encode by shift_jis encoding: %c", c);
9267   JIS_TO_SJIS (code);
9268
9269   return make_number (code);
9270 }
9271
9272 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9273        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9274 Return the corresponding character.  Signal an error if CODE is invalid.  */)
9275   (Lisp_Object code)
9276 {
9277   Lisp_Object spec, attrs, val;
9278   struct charset *charset_roman, *charset_big5, *charset;
9279   EMACS_INT ch;
9280   int c;
9281
9282   CHECK_NATNUM (code);
9283   ch = XFASTINT (code);
9284   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9285   attrs = AREF (spec, 0);
9286   /* With an ASCII-compatible coding system, ASCII codes map to themselves.  */
9287   if (ASCII_BYTE_P (ch)
9288       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9289     return code;
9290   /* The first two charsets of the list are taken as roman and Big5.  */
9291   val = CODING_ATTR_CHARSET_LIST (attrs);
9292   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9293   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9294
9295   if (ch <= 0x7F)
9296     {
9297       c = ch;
9298       charset = charset_roman;
9299     }
9300   else
9301     {
9302       EMACS_INT b1 = ch >> 8;           /* Lead byte (and anything above it).  */
9303       int b2 = ch & 0xFF;               /* Trail byte: keep all eight bits for the range test.  */
9304       if (b1 < 0xA1 || b1 > 0xFE
9305           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9306         error ("Invalid code: %"pI"d", ch);
9307       c = ch;
9308       charset = charset_big5;
9309     }
9310   c = DECODE_CHAR (charset, c);
9311   if (c < 0)                            /* The charset has no character for this code.  */
9312     error ("Invalid code: %"pI"d", ch);
9313   return make_number (c);
9314 }
9315
9316 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9317        doc: /* Encode the Big5 character CH to BIG5 coding system.
9318 Return the corresponding character code in Big5.  Signal an error if CH can't be encoded.  */)
9319   (Lisp_Object ch)
9320 {
9321   Lisp_Object spec, attrs, charset_list;
9322   struct charset *charset;
9323   int c;
9324   unsigned code;
9325
9326   CHECK_CHARACTER (ch);
9327   c = XFASTINT (ch);
9328   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9329   attrs = AREF (spec, 0);
9330   if (ASCII_CHAR_P (c)                  /* ASCII maps to itself when ASCII-compatible.  */
9331       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9332     return ch;
9333   /* A null result from char_charset means no charset in the list contains C.  */
9334   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9335   charset = char_charset (c, charset_list, &code);
9336   if (! charset || code == CHARSET_INVALID_CODE (charset))  /* Test the pointer before using it.  */
9337     error ("Can't encode by Big5 encoding: %c", c);
9338
9339   return make_number (code);
9340 }
9341
9342 \f
9343 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9344        Sset_terminal_coding_system_internal, 1, 2, 0,
9345        doc: /* Internal use only.  */)
9346   (Lisp_Object coding_system, Lisp_Object terminal)
9347 {
9348   struct terminal *term = get_terminal (terminal, 1);
9349   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9350   CHECK_SYMBOL (coding_system);
9351   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);  /* Set up TERM's own struct in place.  */
9352   /* We had better not send unsafe characters to terminal.  */
9353   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9354   /* Character composition should be disabled.  */
9355   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9356   terminal_coding->src_multibyte = 1;   /* Multibyte source ...  */
9357   terminal_coding->dst_multibyte = 0;   /* ... unibyte destination.  */
9358   tset_charset_list
9359     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9360             ? coding_charset_list (terminal_coding)
9361             : Fcons (make_number (charset_ascii), Qnil)));  /* Just ASCII when no encoding is required.  */
9362   return Qnil;
9363 }
9364
9365 DEFUN ("set-safe-terminal-coding-system-internal",
9366        Fset_safe_terminal_coding_system_internal,
9367        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9368        doc: /* Internal use only.  */)
9369   (Lisp_Object coding_system)
9370 {
9371   CHECK_SYMBOL (coding_system);
9372   setup_coding_system (Fcheck_coding_system (coding_system),
9373                        &safe_terminal_coding);  /* The single file-level struct, not a per-terminal one.  */
9374   /* Character composition should be disabled.  */
9375   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9376   safe_terminal_coding.src_multibyte = 1;       /* Multibyte source ...  */
9377   safe_terminal_coding.dst_multibyte = 0;       /* ... unibyte destination.  */
9378   return Qnil;
9379 }
9380
9381 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9382        Sterminal_coding_system, 0, 1, 0,
9383        doc: /* Return coding system specified for terminal output on the given terminal.
9384 TERMINAL may be a terminal object, a frame, or nil for the selected
9385 frame's terminal device.  */)
9386   (Lisp_Object terminal)
9387 {
9388   struct coding_system *terminal_coding
9389     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9390   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);  /* Name recorded for the coding system's id.  */
9391
9392   /* For backward compatibility, return nil if it is `undecided'.  */
9393   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9394 }
9395
9396 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9397        Sset_keyboard_coding_system_internal, 1, 2, 0,
9398        doc: /* Internal use only.  */)
9399   (Lisp_Object coding_system, Lisp_Object terminal)
9400 {
9401   struct terminal *t = get_terminal (terminal, 1);
9402   CHECK_SYMBOL (coding_system);
9403   if (NILP (coding_system))
9404     coding_system = Qno_conversion;     /* nil is replaced by Qno_conversion.  */
9405   else
9406     Fcheck_coding_system (coding_system);       /* Called for validation; its value is not used.  */
9407   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9408   /* Character composition should be disabled.  */
9409   TERMINAL_KEYBOARD_CODING (t)->common_flags
9410     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9411   return Qnil;
9412 }
9413
9414 DEFUN ("keyboard-coding-system",
9415        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9416        doc: /* Return the coding system specified for decoding keyboard input on TERMINAL.  */)
9417   (Lisp_Object terminal)
9418 {
9419   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9420                          (get_terminal (terminal, 1))->id);  /* Name recorded for that terminal's keyboard coding.  */
9421 }
9422
9423 \f
9424 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9425        Sfind_operation_coding_system,  1, MANY, 0,
9426        doc: /* Choose a coding system for an operation based on the target name.
9427 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9428 DECODING-SYSTEM is the coding system to use for decoding
9429 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9430 for encoding (in case OPERATION does encoding).
9431
9432 The first argument OPERATION specifies an I/O primitive:
9433   For file I/O, `insert-file-contents' or `write-region'.
9434   For process I/O, `call-process', `call-process-region', or `start-process'.
9435   For network I/O, `open-network-stream'.
9436
9437 The remaining arguments should be the same arguments that were passed
9438 to the primitive.  Depending on which primitive, one of those arguments
9439 is selected as the TARGET.  For example, if OPERATION does file I/O,
9440 whichever argument specifies the file name is TARGET.
9441
9442 TARGET has a meaning which depends on OPERATION:
9443   For file I/O, TARGET is a file name (except for the special case below).
9444   For process I/O, TARGET is a process name.
9445   For network I/O, TARGET is a service name or a port number.
9446
9447 This function looks up what is specified for TARGET in
9448 `file-coding-system-alist', `process-coding-system-alist',
9449 or `network-coding-system-alist' depending on OPERATION.
9450 They may specify a coding system, a cons of coding systems,
9451 or a function symbol to call.
9452 In the last case, we call the function with one argument,
9453 which is a list of all the arguments given to this function.
9454 If the function can't decide a coding system, it can return
9455 `undecided' so that the normal code-detection is performed.
9456
9457 If OPERATION is `insert-file-contents', the argument corresponding to
9458 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9459 file name to look up, and BUFFER is a buffer that contains the file's
9460 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9461 function to call for FILENAME, that function should examine the
9462 contents of BUFFER instead of reading the file.
9463
9464 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9465   (ptrdiff_t nargs, Lisp_Object *args)
9466 {
9467   Lisp_Object operation, target_idx, target, val;
9468   register Lisp_Object chain;
9469   /* OPERATION plus at least one further argument are required.  */
9470   if (nargs < 2)
9471     error ("Too few arguments");
9472   operation = args[0];
9473   if (!SYMBOLP (operation)
9474       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))  /* TARGET's index is a property of OPERATION.  */
9475     error ("Invalid first argument");
9476   if (nargs <= 1 + XFASTINT (target_idx))
9477     error ("Too few arguments for operation `%s'",
9478            SDATA (SYMBOL_NAME (operation)));
9479   target = args[XFASTINT (target_idx) + 1];     /* + 1 skips OPERATION itself.  */
9480   if (!(STRINGP (target)
9481         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9482             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9483         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9484     error ("Invalid argument %"pI"d of operation `%s'",
9485            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9486   if (CONSP (target))
9487     target = XCAR (target);                     /* (FILENAME . BUFFER): look up FILENAME.  */
9488   /* Pick the alist for this kind of operation; processes are the fallback.  */
9489   chain = ((EQ (operation, Qinsert_file_contents)
9490             || EQ (operation, Qwrite_region))
9491            ? Vfile_coding_system_alist
9492            : (EQ (operation, Qopen_network_stream)
9493               ? Vnetwork_coding_system_alist
9494               : Vprocess_coding_system_alist));
9495   if (NILP (chain))
9496     return Qnil;
9497   /* The first entry whose key matches TARGET decides the result.  */
9498   for (; CONSP (chain); chain = XCDR (chain))
9499     {
9500       Lisp_Object elt;
9501
9502       elt = XCAR (chain);
9503       if (CONSP (elt)
9504           && ((STRINGP (target)
9505                && STRINGP (XCAR (elt))
9506                && fast_string_match (XCAR (elt), target) >= 0)  /* String key: pattern match.  */
9507               || (INTEGERP (target) && EQ (target, XCAR (elt)))))  /* Integer key: identity.  */
9508         {
9509           val = XCDR (elt);
9510           /* Here, if VAL is both a valid coding system and a valid
9511              function symbol, we return VAL as a coding system.  */
9512           if (CONSP (val))
9513             return val;
9514           if (! SYMBOLP (val))
9515             return Qnil;
9516           if (! NILP (Fcoding_system_p (val)))
9517             return Fcons (val, val);            /* One system for both directions.  */
9518           if (! NILP (Ffboundp (val)))
9519             {
9520               /* We use call1 rather than safe_call1
9521                  so as to get bug reports about functions called here
9522                  which don't handle the current interface.  */
9523               val = call1 (val, Flist (nargs, args));
9524               if (CONSP (val))
9525                 return val;
9526               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9527                 return Fcons (val, val);
9528             }
9529           return Qnil;                          /* Later entries are not consulted.  */
9530         }
9531     }
9532   return Qnil;
9533 }
9534
9535 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9536        Sset_coding_system_priority, 0, MANY, 0,
9537        doc: /* Assign higher priority to the coding systems given as arguments.
9538 If multiple coding systems belong to the same category,
9539 all but the first one are ignored.
9540
9541 usage: (set-coding-system-priority &rest coding-systems)  */)
9542   (ptrdiff_t nargs, Lisp_Object *args)
9543 {
9544   ptrdiff_t i, j;
9545   bool changed[coding_category_max];    /* Categories already ranked by an argument.  */
9546   enum coding_category priorities[coding_category_max];  /* The new order, built here.  */
9547
9548   memset (changed, 0, sizeof changed);
9549   /* Rank each argument's category once, in argument order; J counts them.  */
9550   for (i = j = 0; i < nargs; i++)
9551     {
9552       enum coding_category category;
9553       Lisp_Object spec, attrs;
9554
9555       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9556       attrs = AREF (spec, 0);
9557       category = XINT (CODING_ATTR_CATEGORY (attrs));
9558       if (changed[category])
9559         /* Ignore this coding system because a coding system of the
9560            same category already had a higher priority.  */
9561         continue;
9562       changed[category] = 1;
9563       priorities[j++] = category;
9564       if (coding_categories[category].id >= 0
9565           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9566         setup_coding_system (args[i], &coding_categories[category]);  /* Only when the category names a different system.  */
9567       Fset (AREF (Vcoding_category_table, category), args[i]);
9568     }
9569
9570   /* Now we have decided top J priorities.  Reflect the order of the
9571      original priorities to the remaining priorities.  */
9572
9573   for (i = j, j = 0; i < coding_category_max; i++, j++)
9574     {
9575       while (j < coding_category_max
9576              && changed[coding_priorities[j]])
9577         j++;                            /* Skip categories already placed above.  */
9578       if (j == coding_category_max)
9579         emacs_abort ();                 /* No unranked category left to fill slot I.  */
9580       priorities[i] = coding_priorities[j];
9581     }
9582
9583   memcpy (coding_priorities, priorities, sizeof priorities);
9584
9585   /* Update `coding-category-list'.  */
9586   Vcoding_category_list = Qnil;
9587   for (i = coding_category_max; i-- > 0; )      /* Backwards, so consing yields priority order.  */
9588     Vcoding_category_list
9589       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9590                Vcoding_category_list);
9591
9592   return Qnil;
9593 }
9594
9595 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9596        Scoding_system_priority_list, 0, 1, 0,
9597        doc: /* Return a list of coding systems ordered by their priorities.
9598 The list contains a subset of coding systems; i.e. coding systems
9599 assigned to each coding category (see `coding-category-list').
9600
9601 HIGHESTP non-nil means just return the highest priority one.  */)
9602   (Lisp_Object highestp)
9603 {
9604   int i;
9605   Lisp_Object val;
9606   /* Walk the categories from highest priority to lowest.  */
9607   for (i = 0, val = Qnil; i < coding_category_max; i++)
9608     {
9609       enum coding_category category = coding_priorities[i];
9610       int id = coding_categories[category].id;
9611       Lisp_Object attrs;
9612
9613       if (id < 0)                       /* Presumably no coding system is assigned -- skip.  */
9614         continue;
9615       attrs = CODING_ID_ATTRS (id);
9616       if (! NILP (highestp))
9617         return CODING_ATTR_BASE_NAME (attrs);   /* A single name, not a list.  */
9618       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9619     }
9620   return Fnreverse (val);               /* VAL was consed in reverse.  */
9621 }
9622
9623 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };  /* Appended to a base name below.  */
9624 /* Return a vector of the three symbols named BASE plus each suffix above.  */
9625 static Lisp_Object
9626 make_subsidiaries (Lisp_Object base)
9627 {
9628   Lisp_Object subsidiaries;
9629   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9630   char *buf = alloca (base_name_len + 6);       /* 6 = longest suffix ("-unix") plus NUL.  */
9631   int i;
9632
9633   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9634   subsidiaries = make_uninit_vector (3);        /* All three slots are filled by the loop.  */
9635   for (i = 0; i < 3; i++)
9636     {
9637       strcpy (buf + base_name_len, suffixes[i]);        /* Overwrites the previous suffix, NUL included.  */
9638       ASET (subsidiaries, i, intern (buf));
9639     }
9640   return subsidiaries;
9641 }
9642
9643
     /* Define a new coding system from the positional arguments ARGS.

        Each argument is fetched by its coding_arg_* index.  At least
        coding_arg_max arguments are required; the ccl, utf-16, iso-2022
        and utf-8 coding types additionally require coding_arg_ccl_max,
        coding_arg_utf16_max, coding_arg_iso2022_max and
        coding_arg_utf8_max arguments respectively.  With fewer, signal
        Qwrong_number_of_arguments.

        Build the attribute vector, derive the coding category from the
        coding type, and record the spec [ ATTRS ALIASES EOL_TYPE ] for
        the name in Vcoding_system_hash_table, Vcoding_system_list and
        Vcoding_system_alist.  When the eol-type argument is nil, also
        register the three subsidiaries named by make_subsidiaries; they
        share the same attribute vector.  Return nil.  */
9644 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9645        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9646        doc: /* For internal use only.
9647 usage: (define-coding-system-internal ...)  */)
9648   (ptrdiff_t nargs, Lisp_Object *args)
9649 {
9650   Lisp_Object name;
9651   Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
9652   Lisp_Object attrs;            /* Vector of attributes.  */
9653   Lisp_Object eol_type;
9654   Lisp_Object aliases;
9655   Lisp_Object coding_type, charset_list, safe_charsets;
9656   enum coding_category category;
9657   Lisp_Object tail, val;
9658   int max_charset_id = 0;
9659   int i;
9660
9661   if (nargs < coding_arg_max)
9662     goto short_args;
9663
9664   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9665
9666   name = args[coding_arg_name];
9667   CHECK_SYMBOL (name);
9668   ASET (attrs, coding_attr_base_name, name);
9669
       /* The mnemonic is either a string or a character.  */
9670   val = args[coding_arg_mnemonic];
9671   if (! STRINGP (val))
9672     CHECK_CHARACTER (val);
9673   ASET (attrs, coding_attr_mnemonic, val);
9674
9675   coding_type = args[coding_arg_coding_type];
9676   CHECK_SYMBOL (coding_type);
9677   ASET (attrs, coding_attr_type, coding_type);
9678
       /* The charset list is either a symbol or an explicit list of
          charsets.  The symbols Qiso_2022 and Qemacs_mule select a
          predefined list and are accepted only for the coding type of
          the same name.  Either way, CHARSET_LIST ends up holding
          integers and MAX_CHARSET_ID the largest of them.  */
9679   charset_list = args[coding_arg_charset_list];
9680   if (SYMBOLP (charset_list))
9681     {
9682       if (EQ (charset_list, Qiso_2022))
9683         {
9684           if (! EQ (coding_type, Qiso_2022))
9685             error ("Invalid charset-list");
9686           charset_list = Viso_2022_charset_list;
9687         }
9688       else if (EQ (charset_list, Qemacs_mule))
9689         {
9690           if (! EQ (coding_type, Qemacs_mule))
9691             error ("Invalid charset-list");
9692           charset_list = Vemacs_mule_charset_list;
9693         }
9694       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9695         {
9696           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9697             error ("Invalid charset-list");
9698           if (max_charset_id < XFASTINT (XCAR (tail)))
9699             max_charset_id = XFASTINT (XCAR (tail));
9700         }
9701     }
9702   else
9703     {
           /* Work on a copy, because each element is replaced in place
              by the ID of the charset it names.  */
9704       charset_list = Fcopy_sequence (charset_list);
9705       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9706         {
9707           struct charset *charset;
9708
9709           val = XCAR (tail);
9710           CHECK_CHARSET_GET_CHARSET (val, charset);
               /* An iso-2022 coding system needs charsets with an ISO
                  final byte, an emacs-mule one needs charsets with an
                  emacs-mule ID; other types accept any charset.  */
9711           if (EQ (coding_type, Qiso_2022)
9712               ? CHARSET_ISO_FINAL (charset) < 0
9713               : EQ (coding_type, Qemacs_mule)
9714               ? CHARSET_EMACS_MULE_ID (charset) < 0
9715               : 0)
9716             error ("Can't handle charset `%s'",
9717                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9718
9719           XSETCAR (tail, make_number (charset->id));
9720           if (max_charset_id < charset->id)
9721             max_charset_id = charset->id;
9722         }
9723     }
9724   ASET (attrs, coding_attr_charset_list, charset_list);
9725
       /* SAFE_CHARSETS is a string of MAX_CHARSET_ID + 1 bytes indexed
          by charset ID: 0 for a charset in CHARSET_LIST, 255 otherwise.  */
9726   safe_charsets = make_uninit_string (max_charset_id + 1);
9727   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9728   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9729     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9730   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9731
       /* Initial value only; most coding types below override it.  */
9732   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9733
       /* A translation table is a char-table, a cons, or a symbol.  */
9734   val = args[coding_arg_decode_translation_table];
9735   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9736     CHECK_SYMBOL (val);
9737   ASET (attrs, coding_attr_decode_tbl, val);
9738
9739   val = args[coding_arg_encode_translation_table];
9740   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9741     CHECK_SYMBOL (val);
9742   ASET (attrs, coding_attr_encode_tbl, val);
9743
9744   val = args[coding_arg_post_read_conversion];
9745   CHECK_SYMBOL (val);
9746   ASET (attrs, coding_attr_post_read, val);
9747
9748   val = args[coding_arg_pre_write_conversion];
9749   CHECK_SYMBOL (val);
9750   ASET (attrs, coding_attr_pre_write, val);
9751
       /* A nil default character means a space.  */
9752   val = args[coding_arg_default_char];
9753   if (NILP (val))
9754     ASET (attrs, coding_attr_default_char, make_number (' '));
9755   else
9756     {
9757       CHECK_CHARACTER (val);
9758       ASET (attrs, coding_attr_default_char, val);
9759     }
9760
       /* Normalize the for-unibyte flag to nil or t.  */
9761   val = args[coding_arg_for_unibyte];
9762   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9763
9764   val = args[coding_arg_plist];
9765   CHECK_LIST (val);
9766   ASET (attrs, coding_attr_plist, val);
9767
       /* Type-specific attributes.  Every branch below sets CATEGORY.  */
9768   if (EQ (coding_type, Qcharset))
9769     {
9770       /* Generate a lisp vector of 256 elements.  Each element is nil,
9771          integer, or a list of charset IDs.
9772
9773          If Nth element is nil, the byte code N is invalid in this
9774          coding system.
9775
9776          If Nth element is a number NUM, N is the first byte of a
9777          charset whose ID is NUM.
9778
9779          If Nth element is a list of charset IDs, N is the first byte
9780          of one of them.  The list is sorted by dimensions of the
9781          charsets.  A charset of smaller dimension comes first. */
9782       val = Fmake_vector (make_number (256), Qnil);
9783
9784       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9785         {
9786           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9787           int dim = CHARSET_DIMENSION (charset);
               /* code_space holds four entries per dimension; IDX and
                  IDX + 1 are the bounds used for the byte range below.  */
9788           int idx = (dim - 1) * 4;
9789
9790           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9791             ASET (attrs, coding_attr_ascii_compat, Qt);
9792
9793           for (i = charset->code_space[idx];
9794                i <= charset->code_space[idx + 1]; i++)
9795             {
9796               Lisp_Object tmp, tmp2;
9797               int dim2;
9798
9799               tmp = AREF (val, i);
9800               if (NILP (tmp))
9801                 tmp = XCAR (tail);
9802               else if (NUMBERP (tmp))
9803                 {
                       /* Second charset for this byte: turn the entry
                          into a two-element list, smaller dimension
                          first.  */
9804                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9805                   if (dim < dim2)
9806                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9807                   else
9808                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9809                 }
9810               else
9811                 {
                       /* Find the first element of larger dimension.  */
9812                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9813                     {
9814                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9815                       if (dim < dim2)
9816                         break;
9817                     }
9818                   if (NILP (tmp2))
9819                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9820                   else
9821                     {
                           /* Insert before TMP2 by duplicating its car
                              into a new following cell and overwriting
                              the car with the new ID.  */
9822                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9823                       XSETCAR (tmp2, XCAR (tail));
9824                     }
9825                 }
9826               ASET (val, i, tmp);
9827             }
9828         }
9829       ASET (attrs, coding_attr_charset_valids, val);
9830       category = coding_category_charset;
9831     }
9832   else if (EQ (coding_type, Qccl))
9833     {
9834       Lisp_Object valids;
9835
9836       if (nargs < coding_arg_ccl_max)
9837         goto short_args;
9838
           /* A CCL program given as a vector is stored as a copy.  */
9839       val = args[coding_arg_ccl_decoder];
9840       CHECK_CCL_PROGRAM (val);
9841       if (VECTORP (val))
9842         val = Fcopy_sequence (val);
9843       ASET (attrs, coding_attr_ccl_decoder, val);
9844
9845       val = args[coding_arg_ccl_encoder];
9846       CHECK_CCL_PROGRAM (val);
9847       if (VECTORP (val))
9848         val = Fcopy_sequence (val);
9849       ASET (attrs, coding_attr_ccl_encoder, val);
9850
           /* VALIDS is a 256-byte string with byte N set to 1 when N is
              listed in the valids argument.  Each element of that list
              is an integer in 0..255 or a cons (FROM . TO) with
              0 <= FROM <= TO <= 255.  */
9851       val = args[coding_arg_ccl_valids];
9852       valids = Fmake_string (make_number (256), make_number (0));
9853       for (tail = val; CONSP (tail); tail = XCDR (tail))
9854         {
9855           int from, to;
9856
9857           val = XCAR (tail);
9858           if (INTEGERP (val))
9859             {
9860               if (! (0 <= XINT (val) && XINT (val) <= 255))
9861                 args_out_of_range_3 (val, make_number (0), make_number (255));
9862               from = to = XINT (val);
9863             }
9864           else
9865             {
9866               CHECK_CONS (val);
9867               CHECK_NATNUM_CAR (val);
9868               CHECK_NUMBER_CDR (val);
9869               if (XINT (XCAR (val)) > 255)
9870                 args_out_of_range_3 (XCAR (val),
9871                                      make_number (0), make_number (255));
9872               from = XINT (XCAR (val));
9873               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9874                 args_out_of_range_3 (XCDR (val),
9875                                      XCAR (val), make_number (255));
9876               to = XINT (XCDR (val));
9877             }
9878           for (i = from; i <= to; i++)
9879             SSET (valids, i, 1);
9880         }
9881       ASET (attrs, coding_attr_ccl_valids, valids);
9882
9883       category = coding_category_ccl;
9884     }
9885   else if (EQ (coding_type, Qutf_16))
9886     {
9887       Lisp_Object bom, endian;
9888
9889       ASET (attrs, coding_attr_ascii_compat, Qnil);
9890
9891       if (nargs < coding_arg_utf16_max)
9892         goto short_args;
9893
           /* BOM is nil, t, or a cons of two coding systems.  */
9894       bom = args[coding_arg_utf16_bom];
9895       if (! NILP (bom) && ! EQ (bom, Qt))
9896         {
9897           CHECK_CONS (bom);
9898           val = XCAR (bom);
9899           CHECK_CODING_SYSTEM (val);
9900           val = XCDR (bom);
9901           CHECK_CODING_SYSTEM (val);
9902         }
9903       ASET (attrs, coding_attr_utf_bom, bom);
9904
           /* ENDIAN is Qbig or Qlittle; nil defaults to Qbig.  */
9905       endian = args[coding_arg_utf16_endian];
9906       CHECK_SYMBOL (endian);
9907       if (NILP (endian))
9908         endian = Qbig;
9909       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9910         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9911       ASET (attrs, coding_attr_utf_16_endian, endian);
9912
           /* A cons BOM selects the auto category, a nil BOM the nosig
              category of the chosen endian, and t the plain category of
              the chosen endian.  */
9913       category = (CONSP (bom)
9914                   ? coding_category_utf_16_auto
9915                   : NILP (bom)
9916                   ? (EQ (endian, Qbig)
9917                      ? coding_category_utf_16_be_nosig
9918                      : coding_category_utf_16_le_nosig)
9919                   : (EQ (endian, Qbig)
9920                      ? coding_category_utf_16_be
9921                      : coding_category_utf_16_le));
9922     }
9923   else if (EQ (coding_type, Qiso_2022))
9924     {
9925       Lisp_Object initial, reg_usage, request, flags;
9926
9927       if (nargs < coding_arg_iso2022_max)
9928         goto short_args;
9929
           /* INITIAL is a copy of a vector whose first four elements are
              each a charset or nil.  Charsets are replaced by their ID
              and nil by -1.  */
9930       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9931       CHECK_VECTOR (initial);
9932       for (i = 0; i < 4; i++)
9933         {
9934           val = AREF (initial, i);
9935           if (! NILP (val))
9936             {
9937               struct charset *charset;
9938
9939               CHECK_CHARSET_GET_CHARSET (val, charset);
9940               ASET (initial, i, make_number (CHARSET_ID (charset)));
9941               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9942                 ASET (attrs, coding_attr_ascii_compat, Qt);
9943             }
9944           else
9945             ASET (initial, i, make_number (-1));
9946         }
9947
           /* REG_USAGE is a cons of two integers.  */
9948       reg_usage = args[coding_arg_iso2022_reg_usage];
9949       CHECK_CONS (reg_usage);
9950       CHECK_NUMBER_CAR (reg_usage);
9951       CHECK_NUMBER_CDR (reg_usage);
9952
           /* REQUEST is a list of (CHARSET . REG) with REG in 0..3.
              The car of each element is replaced by the charset ID.  */
9953       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9954       for (tail = request; CONSP (tail); tail = XCDR (tail))
9955         {
9956           int id;
9957           Lisp_Object tmp1;
9958
9959           val = XCAR (tail);
9960           CHECK_CONS (val);
9961           tmp1 = XCAR (val);
9962           CHECK_CHARSET_GET_ID (tmp1, id);
9963           CHECK_NATNUM_CDR (val);
9964           if (XINT (XCDR (val)) >= 4)
9965             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9966           XSETCAR (val, make_number (id));
9967         }
9968
           /* From here to the end of this branch, I holds the flag bits.
              Full support is added when the charset-list argument was
              the symbol Qiso_2022.  */
9969       flags = args[coding_arg_iso2022_flags];
9970       CHECK_NATNUM (flags);
9971       i = XINT (flags) & INT_MAX;
9972       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9973         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9974       flags = make_number (i);
9975
9976       ASET (attrs, coding_attr_iso_initial, initial);
9977       ASET (attrs, coding_attr_iso_usage, reg_usage);
9978       ASET (attrs, coding_attr_iso_request, request);
9979       ASET (attrs, coding_attr_iso_flags, flags);
9980       setup_iso_safe_charsets (attrs);
9981
           /* Pick the category from the seven-bit flag, the shift flags,
              whether the full iso-2022 charset list was requested, and
              the charset initially in element 1 of INITIAL.  */
9982       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9983         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9984                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9985                     ? coding_category_iso_7_else
9986                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9987                     ? coding_category_iso_7
9988                     : coding_category_iso_7_tight);
9989       else
9990         {
9991           int id = XINT (AREF (initial, 1));
9992
9993           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9994                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9995                        || id < 0)
9996                       ? coding_category_iso_8_else
9997                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9998                       ? coding_category_iso_8_1
9999                       : coding_category_iso_8_2);
10000         }
            /* Only the iso-8-1 and iso-8-2 categories keep the
               ASCII-compatible attribute.  */
10001       if (category != coding_category_iso_8_1
10002           && category != coding_category_iso_8_2)
10003         ASET (attrs, coding_attr_ascii_compat, Qnil);
10004     }
10005   else if (EQ (coding_type, Qemacs_mule))
10006     {
10007       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10008         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10009       ASET (attrs, coding_attr_ascii_compat, Qt);
10010       category = coding_category_emacs_mule;
10011     }
10012   else if (EQ (coding_type, Qshift_jis))
10013     {
10014
10015       struct charset *charset;
10016
            /* Three or four charsets are required, with dimensions 1, 1,
               2 and (optionally) 2 in that order.  CHARSET_LIST is
               advanced while checking; ATTRS already holds the full
               list.  */
10017       if (XINT (Flength (charset_list)) != 3
10018           && XINT (Flength (charset_list)) != 4)
10019         error ("There should be three or four charsets");
10020
10021       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10022       if (CHARSET_DIMENSION (charset) != 1)
10023         error ("Dimension of charset %s is not one",
10024                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10025       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10026         ASET (attrs, coding_attr_ascii_compat, Qt);
10027
10028       charset_list = XCDR (charset_list);
10029       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10030       if (CHARSET_DIMENSION (charset) != 1)
10031         error ("Dimension of charset %s is not one",
10032                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10033
10034       charset_list = XCDR (charset_list);
10035       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10036       if (CHARSET_DIMENSION (charset) != 2)
10037         error ("Dimension of charset %s is not two",
10038                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10039
10040       charset_list = XCDR (charset_list);
10041       if (! NILP (charset_list))
10042         {
10043           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10044           if (CHARSET_DIMENSION (charset) != 2)
10045             error ("Dimension of charset %s is not two",
10046                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10047         }
10048
10049       category = coding_category_sjis;
10050       Vsjis_coding_system = name;
10051     }
10052   else if (EQ (coding_type, Qbig5))
10053     {
10054       struct charset *charset;
10055
            /* Exactly two charsets are required: the first of dimension
               1, the second of dimension 2.  */
10056       if (XINT (Flength (charset_list)) != 2)
10057         error ("There should be just two charsets");
10058
10059       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10060       if (CHARSET_DIMENSION (charset) != 1)
10061         error ("Dimension of charset %s is not one",
10062                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10063       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10064         ASET (attrs, coding_attr_ascii_compat, Qt);
10065
10066       charset_list = XCDR (charset_list);
10067       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10068       if (CHARSET_DIMENSION (charset) != 2)
10069         error ("Dimension of charset %s is not two",
10070                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10071
10072       category = coding_category_big5;
10073       Vbig5_coding_system = name;
10074     }
10075   else if (EQ (coding_type, Qraw_text))
10076     {
10077       category = coding_category_raw_text;
10078       ASET (attrs, coding_attr_ascii_compat, Qt);
10079     }
10080   else if (EQ (coding_type, Qutf_8))
10081     {
10082       Lisp_Object bom;
10083
10084       if (nargs < coding_arg_utf8_max)
10085         goto short_args;
10086
            /* BOM is nil, t, or a cons of two coding systems.  Only the
               nil case marks the coding system ASCII compatible here.  */
10087       bom = args[coding_arg_utf8_bom];
10088       if (! NILP (bom) && ! EQ (bom, Qt))
10089         {
10090           CHECK_CONS (bom);
10091           val = XCAR (bom);
10092           CHECK_CODING_SYSTEM (val);
10093           val = XCDR (bom);
10094           CHECK_CODING_SYSTEM (val);
10095         }
10096       ASET (attrs, coding_attr_utf_bom, bom);
10097       if (NILP (bom))
10098         ASET (attrs, coding_attr_ascii_compat, Qt);
10099
10100       category = (CONSP (bom) ? coding_category_utf_8_auto
10101                   : NILP (bom) ? coding_category_utf_8_nosig
10102                   : coding_category_utf_8_sig);
10103     }
10104   else if (EQ (coding_type, Qundecided))
10105     category = coding_category_undecided;
10106   else
10107     error ("Invalid coding system type: %s",
10108            SDATA (SYMBOL_NAME (coding_type)));
10109
        /* Record the category, then prepend the category symbol and the
           final ASCII-compatibility value to the property list.  */
10110   ASET (attrs, coding_attr_category, make_number (category));
10111   ASET (attrs, coding_attr_plist,
10112         Fcons (QCcategory,
10113                Fcons (AREF (Vcoding_category_table, category),
10114                       CODING_ATTR_PLIST (attrs))));
10115   ASET (attrs, coding_attr_plist,
10116         Fcons (QCascii_compatible_p,
10117                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10118                       CODING_ATTR_PLIST (attrs))));
10119
        /* The eol-type argument is nil, Qunix, Qdos or Qmac.  */
10120   eol_type = args[coding_arg_eol_type];
10121   if (! NILP (eol_type)
10122       && ! EQ (eol_type, Qunix)
10123       && ! EQ (eol_type, Qdos)
10124       && ! EQ (eol_type, Qmac))
10125     error ("Invalid eol-type");
10126
10127   aliases = Fcons (name, Qnil);
10128
10129   if (NILP (eol_type))
10130     {
            /* No fixed eol-type: EOL_TYPE becomes the vector of the
               three subsidiary names, and each subsidiary is registered
               with its own spec that shares ATTRS.  */
10131       eol_type = make_subsidiaries (name);
10132       for (i = 0; i < 3; i++)
10133         {
10134           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10135
10136           this_name = AREF (eol_type, i);
10137           this_aliases = Fcons (this_name, Qnil);
10138           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10139           this_spec = make_uninit_vector (3);
10140           ASET (this_spec, 0, attrs);
10141           ASET (this_spec, 1, this_aliases);
10142           ASET (this_spec, 2, this_eol_type);
10143           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10144           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
                /* Add the name to the alist only if it is not there.  */
10145           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10146           if (NILP (val))
10147             Vcoding_system_alist
10148               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10149                        Vcoding_system_alist);
10150         }
10151     }
10152
10153   spec_vec = make_uninit_vector (3);
10154   ASET (spec_vec, 0, attrs);
10155   ASET (spec_vec, 1, aliases);
10156   ASET (spec_vec, 2, eol_type);
10157
10158   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10159   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10160   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10161   if (NILP (val))
10162     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10163                                   Vcoding_system_alist);
10164
        /* Set up the per-category coding structure when the category has
           none yet (negative id) or when NAME is the coding system
           already recorded for it.  */
10165   {
10166     int id = coding_categories[category].id;
10167
10168     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10169       setup_coding_system (name, &coding_categories[category]);
10170   }
10171
10172   return Qnil;
10173
10174  short_args:
10175   return Fsignal (Qwrong_number_of_arguments,
10176                   Fcons (intern ("define-coding-system-internal"),
10177                          make_number (nargs)));
10178 }
10179
10180
10181 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10182        3, 3, 0,
10183        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10184   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10185 {
10186   Lisp_Object spec, attrs;
10187
10188   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10189   attrs = AREF (spec, 0);
10190   if (EQ (prop, QCmnemonic))
10191     {
10192       if (! STRINGP (val))
10193         CHECK_CHARACTER (val);
10194       ASET (attrs, coding_attr_mnemonic, val);
10195     }
10196   else if (EQ (prop, QCdefault_char))
10197     {
10198       if (NILP (val))
10199         val = make_number (' ');
10200       else
10201         CHECK_CHARACTER (val);
10202       ASET (attrs, coding_attr_default_char, val);
10203     }
10204   else if (EQ (prop, QCdecode_translation_table))
10205     {
10206       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10207         CHECK_SYMBOL (val);
10208       ASET (attrs, coding_attr_decode_tbl, val);
10209     }
10210   else if (EQ (prop, QCencode_translation_table))
10211     {
10212       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10213         CHECK_SYMBOL (val);
10214       ASET (attrs, coding_attr_encode_tbl, val);
10215     }
10216   else if (EQ (prop, QCpost_read_conversion))
10217     {
10218       CHECK_SYMBOL (val);
10219       ASET (attrs, coding_attr_post_read, val);
10220     }
10221   else if (EQ (prop, QCpre_write_conversion))
10222     {
10223       CHECK_SYMBOL (val);
10224       ASET (attrs, coding_attr_pre_write, val);
10225     }
10226   else if (EQ (prop, QCascii_compatible_p))
10227     {
10228       ASET (attrs, coding_attr_ascii_compat, val);
10229     }
10230
10231   ASET (attrs, coding_attr_plist,
10232         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10233   return val;
10234 }
10235
10236
10237 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10238        Sdefine_coding_system_alias, 2, 2, 0,
10239        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10240   (Lisp_Object alias, Lisp_Object coding_system)
10241 {
10242   Lisp_Object spec, aliases, eol_type, val;
10243
10244   CHECK_SYMBOL (alias);
10245   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10246   aliases = AREF (spec, 1);
10247   /* ALIASES should be a list of length more than zero, and the first
10248      element is a base coding system.  Append ALIAS at the tail of the
10249      list.  */
10250   while (!NILP (XCDR (aliases)))
10251     aliases = XCDR (aliases);
10252   XSETCDR (aliases, Fcons (alias, Qnil));
10253
10254   eol_type = AREF (spec, 2);
10255   if (VECTORP (eol_type))
10256     {
10257       Lisp_Object subsidiaries;
10258       int i;
10259
10260       subsidiaries = make_subsidiaries (alias);
10261       for (i = 0; i < 3; i++)
10262         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10263                                      AREF (eol_type, i));
10264     }
10265
10266   Fputhash (alias, spec, Vcoding_system_hash_table);
10267   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10268   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10269   if (NILP (val))
10270     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10271                                   Vcoding_system_alist);
10272
10273   return Qnil;
10274 }
10275
10276 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10277        1, 1, 0,
10278        doc: /* Return the base of CODING-SYSTEM.
10279 Any alias or subsidiary coding system is not a base coding system.  */)
10280   (Lisp_Object coding_system)
10281 {
10282   Lisp_Object spec, attrs;
10283
10284   if (NILP (coding_system))
10285     return (Qno_conversion);
10286   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10287   attrs = AREF (spec, 0);
10288   return CODING_ATTR_BASE_NAME (attrs);
10289 }
10290
10291 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10292        1, 1, 0,
10293        doc: "Return the property list of CODING-SYSTEM.")
10294   (Lisp_Object coding_system)
10295 {
10296   Lisp_Object spec, attrs;
10297
10298   if (NILP (coding_system))
10299     coding_system = Qno_conversion;
10300   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10301   attrs = AREF (spec, 0);
10302   return CODING_ATTR_PLIST (attrs);
10303 }
10304
10305
10306 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10307        1, 1, 0,
10308        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10309   (Lisp_Object coding_system)
10310 {
10311   Lisp_Object spec;
10312
10313   if (NILP (coding_system))
10314     coding_system = Qno_conversion;
10315   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10316   return AREF (spec, 1);
10317 }
10318
10319 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10320        Scoding_system_eol_type, 1, 1, 0,
10321        doc: /* Return eol-type of CODING-SYSTEM.
10322 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10323
10324 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10325 and CR respectively.
10326
10327 A vector value indicates that a format of end-of-line should be
10328 detected automatically.  Nth element of the vector is the subsidiary
10329 coding system whose eol-type is N.  */)
10330   (Lisp_Object coding_system)
10331 {
10332   Lisp_Object spec, eol_type;
10333   int n;
10334
10335   if (NILP (coding_system))
10336     coding_system = Qno_conversion;
10337   if (! CODING_SYSTEM_P (coding_system))
10338     return Qnil;
10339   spec = CODING_SYSTEM_SPEC (coding_system);
10340   eol_type = AREF (spec, 2);
10341   if (VECTORP (eol_type))
10342     return Fcopy_sequence (eol_type);
10343   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10344   return make_number (n);
10345 }
10346
10347 #endif /* emacs */
10348
10349 \f
10350 /*** 12. Postamble ***/
10351
10352 void
10353 init_coding_once (void)
10354 {
10355   int i;
10356
10357   for (i = 0; i < coding_category_max; i++)
10358     {
10359       coding_categories[i].id = -1;
10360       coding_priorities[i] = i;
10361     }
10362
10363   /* ISO2022 specific initialize routine.  */
10364   for (i = 0; i < 0x20; i++)
10365     iso_code_class[i] = ISO_control_0;
10366   for (i = 0x21; i < 0x7F; i++)
10367     iso_code_class[i] = ISO_graphic_plane_0;
10368   for (i = 0x80; i < 0xA0; i++)
10369     iso_code_class[i] = ISO_control_1;
10370   for (i = 0xA1; i < 0xFF; i++)
10371     iso_code_class[i] = ISO_graphic_plane_1;
10372   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10373   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10374   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10375   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10376   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10377   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10378   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10379   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10380   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10381
10382   for (i = 0; i < 256; i++)
10383     {
10384       emacs_mule_bytes[i] = 1;
10385     }
10386   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10387   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10388   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10389   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10390 }
10391
10392 #ifdef emacs
10393
10394 void
10395 syms_of_coding (void)
10396 {
10397   staticpro (&Vcoding_system_hash_table);
10398   {
10399     Lisp_Object args[2];
10400     args[0] = QCtest;
10401     args[1] = Qeq;
10402     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10403   }
10404
10405   staticpro (&Vsjis_coding_system);
10406   Vsjis_coding_system = Qnil;
10407
10408   staticpro (&Vbig5_coding_system);
10409   Vbig5_coding_system = Qnil;
10410
10411   staticpro (&Vcode_conversion_reused_workbuf);
10412   Vcode_conversion_reused_workbuf = Qnil;
10413
10414   staticpro (&Vcode_conversion_workbuf_name);
10415   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10416
10417   reused_workbuf_in_use = 0;
10418
10419   DEFSYM (Qcharset, "charset");
10420   DEFSYM (Qtarget_idx, "target-idx");
10421   DEFSYM (Qcoding_system_history, "coding-system-history");
10422   Fset (Qcoding_system_history, Qnil);
10423
10424   /* Target FILENAME is the first argument.  */
10425   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10426   /* Target FILENAME is the third argument.  */
10427   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10428
10429   DEFSYM (Qcall_process, "call-process");
10430   /* Target PROGRAM is the first argument.  */
10431   Fput (Qcall_process, Qtarget_idx, make_number (0));
10432
10433   DEFSYM (Qcall_process_region, "call-process-region");
10434   /* Target PROGRAM is the third argument.  */
10435   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10436
10437   DEFSYM (Qstart_process, "start-process");
10438   /* Target PROGRAM is the third argument.  */
10439   Fput (Qstart_process, Qtarget_idx, make_number (2));
10440
10441   DEFSYM (Qopen_network_stream, "open-network-stream");
10442   /* Target SERVICE is the fourth argument.  */
10443   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10444
10445   DEFSYM (Qcoding_system, "coding-system");
10446   DEFSYM (Qcoding_aliases, "coding-aliases");
10447
10448   DEFSYM (Qeol_type, "eol-type");
10449   DEFSYM (Qunix, "unix");
10450   DEFSYM (Qdos, "dos");
10451   DEFSYM (Qmac, "mac");
10452
10453   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10454   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10455   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10456   DEFSYM (Qdefault_char, "default-char");
10457   DEFSYM (Qundecided, "undecided");
10458   DEFSYM (Qno_conversion, "no-conversion");
10459   DEFSYM (Qraw_text, "raw-text");
10460
10461   DEFSYM (Qiso_2022, "iso-2022");
10462
10463   DEFSYM (Qutf_8, "utf-8");
10464   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10465
10466 #if defined (WINDOWSNT) || defined (CYGWIN)
10467   /* No, not utf-16-le: that one has a BOM.  */
10468   DEFSYM (Qutf_16le, "utf-16le");
10469 #endif
10470
10471   DEFSYM (Qutf_16, "utf-16");
10472   DEFSYM (Qbig, "big");
10473   DEFSYM (Qlittle, "little");
10474
10475   DEFSYM (Qshift_jis, "shift-jis");
10476   DEFSYM (Qbig5, "big5");
10477
10478   DEFSYM (Qcoding_system_p, "coding-system-p");
10479
10480   DEFSYM (Qcoding_system_error, "coding-system-error");
10481   Fput (Qcoding_system_error, Qerror_conditions,
10482         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10483   Fput (Qcoding_system_error, Qerror_message,
10484         build_pure_c_string ("Invalid coding system"));
10485
10486   /* Intern this now in case it isn't already done.
10487      Setting this variable twice is harmless.
10488      But don't staticpro it here--that is done in alloc.c.  */
10489   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10490
10491   DEFSYM (Qtranslation_table, "translation-table");
10492   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10493   DEFSYM (Qtranslation_table_id, "translation-table-id");
10494   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10495   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10496
10497   DEFSYM (Qvalid_codes, "valid-codes");
10498
10499   DEFSYM (Qemacs_mule, "emacs-mule");
10500
10501   DEFSYM (QCcategory, ":category");
10502   DEFSYM (QCmnemonic, ":mnemonic");
10503   DEFSYM (QCdefault_char, ":default-char");
10504   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10505   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10506   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10507   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10508   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10509
10510   Vcoding_category_table
10511     = Fmake_vector (make_number (coding_category_max), Qnil);
10512   staticpro (&Vcoding_category_table);
10513   /* Followings are target of code detection.  */
10514   ASET (Vcoding_category_table, coding_category_iso_7,
10515         intern_c_string ("coding-category-iso-7"));
10516   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10517         intern_c_string ("coding-category-iso-7-tight"));
10518   ASET (Vcoding_category_table, coding_category_iso_8_1,
10519         intern_c_string ("coding-category-iso-8-1"));
10520   ASET (Vcoding_category_table, coding_category_iso_8_2,
10521         intern_c_string ("coding-category-iso-8-2"));
10522   ASET (Vcoding_category_table, coding_category_iso_7_else,
10523         intern_c_string ("coding-category-iso-7-else"));
10524   ASET (Vcoding_category_table, coding_category_iso_8_else,
10525         intern_c_string ("coding-category-iso-8-else"));
10526   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10527         intern_c_string ("coding-category-utf-8-auto"));
10528   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10529         intern_c_string ("coding-category-utf-8"));
10530   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10531         intern_c_string ("coding-category-utf-8-sig"));
10532   ASET (Vcoding_category_table, coding_category_utf_16_be,
10533         intern_c_string ("coding-category-utf-16-be"));
10534   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10535         intern_c_string ("coding-category-utf-16-auto"));
10536   ASET (Vcoding_category_table, coding_category_utf_16_le,
10537         intern_c_string ("coding-category-utf-16-le"));
10538   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10539         intern_c_string ("coding-category-utf-16-be-nosig"));
10540   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10541         intern_c_string ("coding-category-utf-16-le-nosig"));
10542   ASET (Vcoding_category_table, coding_category_charset,
10543         intern_c_string ("coding-category-charset"));
10544   ASET (Vcoding_category_table, coding_category_sjis,
10545         intern_c_string ("coding-category-sjis"));
10546   ASET (Vcoding_category_table, coding_category_big5,
10547         intern_c_string ("coding-category-big5"));
10548   ASET (Vcoding_category_table, coding_category_ccl,
10549         intern_c_string ("coding-category-ccl"));
10550   ASET (Vcoding_category_table, coding_category_emacs_mule,
10551         intern_c_string ("coding-category-emacs-mule"));
10552   /* Followings are NOT target of code detection.  */
10553   ASET (Vcoding_category_table, coding_category_raw_text,
10554         intern_c_string ("coding-category-raw-text"));
10555   ASET (Vcoding_category_table, coding_category_undecided,
10556         intern_c_string ("coding-category-undecided"));
10557
10558   DEFSYM (Qinsufficient_source, "insufficient-source");
10559   DEFSYM (Qinvalid_source, "invalid-source");
10560   DEFSYM (Qinterrupted, "interrupted");
10561   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10562
10563   defsubr (&Scoding_system_p);
10564   defsubr (&Sread_coding_system);
10565   defsubr (&Sread_non_nil_coding_system);
10566   defsubr (&Scheck_coding_system);
10567   defsubr (&Sdetect_coding_region);
10568   defsubr (&Sdetect_coding_string);
10569   defsubr (&Sfind_coding_systems_region_internal);
10570   defsubr (&Sunencodable_char_position);
10571   defsubr (&Scheck_coding_systems_region);
10572   defsubr (&Sdecode_coding_region);
10573   defsubr (&Sencode_coding_region);
10574   defsubr (&Sdecode_coding_string);
10575   defsubr (&Sencode_coding_string);
10576   defsubr (&Sdecode_sjis_char);
10577   defsubr (&Sencode_sjis_char);
10578   defsubr (&Sdecode_big5_char);
10579   defsubr (&Sencode_big5_char);
10580   defsubr (&Sset_terminal_coding_system_internal);
10581   defsubr (&Sset_safe_terminal_coding_system_internal);
10582   defsubr (&Sterminal_coding_system);
10583   defsubr (&Sset_keyboard_coding_system_internal);
10584   defsubr (&Skeyboard_coding_system);
10585   defsubr (&Sfind_operation_coding_system);
10586   defsubr (&Sset_coding_system_priority);
10587   defsubr (&Sdefine_coding_system_internal);
10588   defsubr (&Sdefine_coding_system_alias);
10589   defsubr (&Scoding_system_put);
10590   defsubr (&Scoding_system_base);
10591   defsubr (&Scoding_system_plist);
10592   defsubr (&Scoding_system_aliases);
10593   defsubr (&Scoding_system_eol_type);
10594   defsubr (&Scoding_system_priority_list);
10595
10596   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10597                doc: /* List of coding systems.
10598
10599 Do not alter the value of this variable manually.  This variable should be
10600 updated by the functions `define-coding-system' and
10601 `define-coding-system-alias'.  */);
10602   Vcoding_system_list = Qnil;
10603
10604   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10605                doc: /* Alist of coding system names.
10606 Each element is one element list of coding system name.
10607 This variable is given to `completing-read' as COLLECTION argument.
10608
10609 Do not alter the value of this variable manually.  This variable should be
10610 updated by the functions `make-coding-system' and
10611 `define-coding-system-alias'.  */);
10612   Vcoding_system_alist = Qnil;
10613
10614   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10615                doc: /* List of coding-categories (symbols) ordered by priority.
10616
10617 On detecting a coding system, Emacs tries code detection algorithms
10618 associated with each coding-category one by one in this order.  When
10619 one algorithm agrees with a byte sequence of source text, the coding
10620 system bound to the corresponding coding-category is selected.
10621
10622 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10623   {
10624     int i;
10625
10626     Vcoding_category_list = Qnil;
10627     for (i = coding_category_max - 1; i >= 0; i--)
10628       Vcoding_category_list
10629         = Fcons (AREF (Vcoding_category_table, i),
10630                  Vcoding_category_list);
10631   }
10632
10633   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10634                doc: /* Specify the coding system for read operations.
10635 It is useful to bind this variable with `let', but do not set it globally.
10636 If the value is a coding system, it is used for decoding on read operation.
10637 If not, an appropriate element is used from one of the coding system alists.
10638 There are three such tables: `file-coding-system-alist',
10639 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10640   Vcoding_system_for_read = Qnil;
10641
10642   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10643                doc: /* Specify the coding system for write operations.
10644 Programs bind this variable with `let', but you should not set it globally.
10645 If the value is a coding system, it is used for encoding of output,
10646 when writing it to a file and when sending it to a file or subprocess.
10647
10648 If this does not specify a coding system, an appropriate element
10649 is used from one of the coding system alists.
10650 There are three such tables: `file-coding-system-alist',
10651 `process-coding-system-alist', and `network-coding-system-alist'.
10652 For output to files, if the above procedure does not specify a coding system,
10653 the value of `buffer-file-coding-system' is used.  */);
10654   Vcoding_system_for_write = Qnil;
10655
10656   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10657                doc: /*
10658 Coding system used in the latest file or process I/O.  */);
10659   Vlast_coding_system_used = Qnil;
10660
10661   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10662                doc: /*
10663 Error status of the last code conversion.
10664
10665 When an error was detected in the last code conversion, this variable
10666 is set to one of the following symbols.
10667   `insufficient-source'
10668   `inconsistent-eol'
10669   `invalid-source'
10670   `interrupted'
10671   `insufficient-memory'
10672 When no error was detected, the value doesn't change.  So, to check
10673 the error status of a code conversion by this variable, you must
10674 explicitly set this variable to nil before performing code
10675 conversion.  */);
10676   Vlast_code_conversion_error = Qnil;
10677
10678   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10679                doc: /*
10680 *Non-nil means always inhibit code conversion of end-of-line format.
10681 See info node `Coding Systems' and info node `Text and Binary' concerning
10682 such conversion.  */);
10683   inhibit_eol_conversion = 0;
10684
10685   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10686                doc: /*
10687 Non-nil means process buffer inherits coding system of process output.
10688 Bind it to t if the process output is to be treated as if it were a file
10689 read from some filesystem.  */);
10690   inherit_process_coding_system = 0;
10691
10692   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10693                doc: /*
10694 Alist to decide a coding system to use for a file I/O operation.
10695 The format is ((PATTERN . VAL) ...),
10696 where PATTERN is a regular expression matching a file name,
10697 VAL is a coding system, a cons of coding systems, or a function symbol.
10698 If VAL is a coding system, it is used for both decoding and encoding
10699 the file contents.
10700 If VAL is a cons of coding systems, the car part is used for decoding,
10701 and the cdr part is used for encoding.
10702 If VAL is a function symbol, the function must return a coding system
10703 or a cons of coding systems which are used as above.  The function is
10704 called with an argument that is a list of the arguments with which
10705 `find-operation-coding-system' was called.  If the function can't decide
10706 a coding system, it can return `undecided' so that the normal
10707 code-detection is performed.
10708
10709 See also the function `find-operation-coding-system'
10710 and the variable `auto-coding-alist'.  */);
10711   Vfile_coding_system_alist = Qnil;
10712
10713   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10714                doc: /*
10715 Alist to decide a coding system to use for a process I/O operation.
10716 The format is ((PATTERN . VAL) ...),
10717 where PATTERN is a regular expression matching a program name,
10718 VAL is a coding system, a cons of coding systems, or a function symbol.
10719 If VAL is a coding system, it is used for both decoding what received
10720 from the program and encoding what sent to the program.
10721 If VAL is a cons of coding systems, the car part is used for decoding,
10722 and the cdr part is used for encoding.
10723 If VAL is a function symbol, the function must return a coding system
10724 or a cons of coding systems which are used as above.
10725
10726 See also the function `find-operation-coding-system'.  */);
10727   Vprocess_coding_system_alist = Qnil;
10728
10729   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10730                doc: /*
10731 Alist to decide a coding system to use for a network I/O operation.
10732 The format is ((PATTERN . VAL) ...),
10733 where PATTERN is a regular expression matching a network service name
10734 or is a port number to connect to,
10735 VAL is a coding system, a cons of coding systems, or a function symbol.
10736 If VAL is a coding system, it is used for both decoding what received
10737 from the network stream and encoding what sent to the network stream.
10738 If VAL is a cons of coding systems, the car part is used for decoding,
10739 and the cdr part is used for encoding.
10740 If VAL is a function symbol, the function must return a coding system
10741 or a cons of coding systems which are used as above.
10742
10743 See also the function `find-operation-coding-system'.  */);
10744   Vnetwork_coding_system_alist = Qnil;
10745
10746   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10747                doc: /* Coding system to use with system messages.
10748 Also used for decoding keyboard input on X Window system.  */);
10749   Vlocale_coding_system = Qnil;
10750
10751   /* The eol mnemonics are reset in startup.el system-dependently.  */
10752   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10753                doc: /*
10754 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10755   eol_mnemonic_unix = build_pure_c_string (":");
10756
10757   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10758                doc: /*
10759 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10760   eol_mnemonic_dos = build_pure_c_string ("\\");
10761
10762   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10763                doc: /*
10764 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10765   eol_mnemonic_mac = build_pure_c_string ("/");
10766
10767   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10768                doc: /*
10769 *String displayed in mode line when end-of-line format is not yet determined.  */);
10770   eol_mnemonic_undecided = build_pure_c_string (":");
10771
10772   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10773                doc: /*
10774 *Non-nil enables character translation while encoding and decoding.  */);
10775   Venable_character_translation = Qt;
10776
10777   DEFVAR_LISP ("standard-translation-table-for-decode",
10778                Vstandard_translation_table_for_decode,
10779                doc: /* Table for translating characters while decoding.  */);
10780   Vstandard_translation_table_for_decode = Qnil;
10781
10782   DEFVAR_LISP ("standard-translation-table-for-encode",
10783                Vstandard_translation_table_for_encode,
10784                doc: /* Table for translating characters while encoding.  */);
10785   Vstandard_translation_table_for_encode = Qnil;
10786
10787   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10788                doc: /* Alist of charsets vs revision numbers.
10789 While encoding, if a charset (car part of an element) is found,
10790 designate it with the escape sequence identifying revision (cdr part
10791 of the element).  */);
10792   Vcharset_revision_table = Qnil;
10793
10794   DEFVAR_LISP ("default-process-coding-system",
10795                Vdefault_process_coding_system,
10796                doc: /* Cons of coding systems used for process I/O by default.
10797 The car part is used for decoding a process output,
10798 the cdr part is used for encoding a text to be sent to a process.  */);
10799   Vdefault_process_coding_system = Qnil;
10800
10801   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10802                doc: /*
10803 Table of extra Latin codes in the range 128..159 (inclusive).
10804 This is a vector of length 256.
10805 If Nth element is non-nil, the existence of code N in a file
10806 \(or output of subprocess) doesn't prevent it to be detected as
10807 a coding system of ISO 2022 variant which has a flag
10808 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10809 or reading output of a subprocess.
10810 Only 128th through 159th elements have a meaning.  */);
10811   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10812
10813   DEFVAR_LISP ("select-safe-coding-system-function",
10814                Vselect_safe_coding_system_function,
10815                doc: /*
10816 Function to call to select safe coding system for encoding a text.
10817
10818 If set, this function is called to force a user to select a proper
10819 coding system which can encode the text in the case that a default
10820 coding system used in each operation can't encode the text.  The
10821 function should take care that the buffer is not modified while
10822 the coding system is being selected.
10823
10824 The default value is `select-safe-coding-system' (which see).  */);
10825   Vselect_safe_coding_system_function = Qnil;
10826
10827   DEFVAR_BOOL ("coding-system-require-warning",
10828                coding_system_require_warning,
10829                doc: /* Internal use only.
10830 If non-nil, on writing a file, `select-safe-coding-system-function' is
10831 called even if `coding-system-for-write' is non-nil.  The command
10832 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10833   coding_system_require_warning = 0;
10834
10835
10836   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10837                inhibit_iso_escape_detection,
10838                doc: /*
10839 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10840
10841 When Emacs reads text, it tries to detect how the text is encoded.
10842 This code detection is sensitive to escape sequences.  If Emacs sees
10843 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10844 of the ISO2022 encodings, and decodes text by the corresponding coding
10845 system (e.g. `iso-2022-7bit').
10846
10847 However, there may be a case that you want to read escape sequences in
10848 a file as is.  In such a case, you can set this variable to non-nil.
10849 Then the code detection will ignore any escape sequences, and no text is
10850 detected as encoded in some ISO-2022 encoding.  The result is that all
10851 escape sequences become visible in a buffer.
10852
10853 The default value is nil, and it is strongly recommended not to change
10854 it.  That is because many Emacs Lisp source files that contain
10855 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10856 in Emacs's distribution, and they won't be decoded correctly on
10857 reading if you suppress escape sequence detection.
10858
10859 The other way to read escape sequences in a file without decoding is
10860 to explicitly specify some coding system that doesn't use ISO-2022
10861 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
10862   inhibit_iso_escape_detection = 0;
10863
10864   DEFVAR_BOOL ("inhibit-null-byte-detection",
10865                inhibit_null_byte_detection,
10866                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10867 By default, Emacs treats it as binary data, and does not attempt to
10868 decode it.  The effect is as if you specified `no-conversion' for
10869 reading that text.
10870
10871 Set this to non-nil when a regular text happens to include null bytes.
10872 Examples are Index nodes of Info files and null-byte delimited output
10873 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10874 decode text as usual.  */);
10875   inhibit_null_byte_detection = 0;
10876
10877   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
10878                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
10879 Internal use only.  Removed after the experimental optimizer gets stable. */);
10880   disable_ascii_optimization = 1;
10881
10882   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10883                doc: /* Char table for translating self-inserting characters.
10884 This is applied to the result of input methods, not their input.
10885 See also `keyboard-translate-table'.
10886
10887 Use of this variable for character code unification was rendered
10888 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10889 internal character representation.  */);
10890     Vtranslation_table_for_input = Qnil;
10891
10892   {
10893     Lisp_Object args[coding_arg_max];
10894     Lisp_Object plist[16];
10895     int i;
10896
10897     for (i = 0; i < coding_arg_max; i++)
10898       args[i] = Qnil;
10899
10900     plist[0] = intern_c_string (":name");
10901     plist[1] = args[coding_arg_name] = Qno_conversion;
10902     plist[2] = intern_c_string (":mnemonic");
10903     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10904     plist[4] = intern_c_string (":coding-type");
10905     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10906     plist[6] = intern_c_string (":ascii-compatible-p");
10907     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10908     plist[8] = intern_c_string (":default-char");
10909     plist[9] = args[coding_arg_default_char] = make_number (0);
10910     plist[10] = intern_c_string (":for-unibyte");
10911     plist[11] = args[coding_arg_for_unibyte] = Qt;
10912     plist[12] = intern_c_string (":docstring");
10913     plist[13] = build_pure_c_string ("Do no conversion.\n\
10914 \n\
10915 When you visit a file with this coding, the file is read into a\n\
10916 unibyte buffer as is, thus each byte of a file is treated as a\n\
10917 character.");
10918     plist[14] = intern_c_string (":eol-type");
10919     plist[15] = args[coding_arg_eol_type] = Qunix;
10920     args[coding_arg_plist] = Flist (16, plist);
10921     Fdefine_coding_system_internal (coding_arg_max, args);
10922
10923     plist[1] = args[coding_arg_name] = Qundecided;
10924     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10925     plist[5] = args[coding_arg_coding_type] = Qundecided;
10926     /* This is already set.
10927        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10928     plist[8] = intern_c_string (":charset-list");
10929     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10930     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10931     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10932     plist[15] = args[coding_arg_eol_type] = Qnil;
10933     args[coding_arg_plist] = Flist (16, plist);
10934     Fdefine_coding_system_internal (coding_arg_max, args);
10935   }
10936
10937   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10938
10939   {
10940     int i;
10941
10942     for (i = 0; i < coding_category_max; i++)
10943       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10944   }
10945 #if defined (DOS_NT)
10946   system_eol_type = Qdos;
10947 #else
10948   system_eol_type = Qunix;
10949 #endif
10950   staticpro (&system_eol_type);
10951 }
10952
10953 char *
10954 emacs_strerror (int error_number)
10955 {
10956   char *str;
10957
10958   synchronize_system_messages_locale ();
10959   str = strerror (error_number);
10960
10961   if (! NILP (Vlocale_coding_system))
10962     {
10963       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10964                                                       Vlocale_coding_system,
10965                                                       0);
10966       str = SSDATA (dec);
10967     }
10968
10969   return str;
10970 }
10971
10972 #endif /* emacs */