src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009, 2010
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  60   the C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking up
  63   Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we setup a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
 161   const unsigned char *src = coding->source;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;
 167   /* FOUND stays 0 unless some byte strongly suggests XXX.  */
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the source is exhausted, jump
 171          to no_more_source:.  */
 172       ONE_MORE_BYTE (c);
 173       /* A non-conforming byte rejects XXX outright.  */
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
 176       if (__C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source was exhausted successfully.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
 195   CODING->charbuf_size.
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers the start position in source in each loop.
 212      The loop will be exited when there's not enough source, or
 213      when there's no room in CHARBUF for a decoded character.  */
 214   const unsigned char *src_base;
 215   /* A buffer to produce decoded characters.  */
 216   int *charbuf = coding->charbuf + coding->charbuf_used;
 217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 218   int multibytep = coding->src_multibyte;
 219
 220   while (1)
 221     {
 222       src_base = src;
 223       if (charbuf >= charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it. */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && coding->mode & CODING_MODE_LAST_BLOCK)
 233     /* If the source ends with partial bytes of a character, treat
 234        them as eight-bit raw data (as long as CHARBUF has room).  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  If the
 238      source is multibyte, the bytes and chars are not identical.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
 257   DST_BYTES zero means that the source area and destination area
 258   overlap, which means that we can produce encoded text until it
 259   reaches the head of the not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
 267   int multibytep = coding->dst_multibyte;
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274   /* Stop while one more iteration still cannot write past DST_END.  */
 275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 276     {
 277       int c = *charbuf;
 278       /* Encode C into DST, and increment DST.  */
 279     }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292 #include <setjmp.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302 #include "frame.h"
 303 #include "termhooks.h"
 304
 305 Lisp_Object Vcoding_system_hash_table;
 306
 307 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 308 Lisp_Object Qunix, Qdos;
 309 extern Lisp_Object Qmac;        /* frame.c */
 310 Lisp_Object Qbuffer_file_coding_system;
 311 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 315 Lisp_Object Qbig, Qlittle;
 316 Lisp_Object Qcoding_system_history;
 317 Lisp_Object Qvalid_codes;
 318 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 319 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 320 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 321 Lisp_Object QCascii_compatible_p;
 322
 323 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 324 Lisp_Object Qcall_process, Qcall_process_region;
 325 Lisp_Object Qstart_process, Qopen_network_stream;
 326 Lisp_Object Qtarget_idx;
 327
 328 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 329 Lisp_Object Qinterrupted, Qinsufficient_memory;
 330
 331 extern Lisp_Object Qcompletion_ignore_case;
 332
 333 /* If a symbol has this property, evaluate the value to define the
 334    symbol as a coding system.  */
 335 static Lisp_Object Qcoding_system_define_form;
 336
 337 int coding_system_require_warning;
 338
 339 Lisp_Object Vselect_safe_coding_system_function;
 340
 341 /* Mnemonic string for each format of end-of-line.  */
 342 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 343 /* Mnemonic string to indicate format of end-of-line is not yet
 344    decided.  */
 345 Lisp_Object eol_mnemonic_undecided;
 346
 347 /* Format of end-of-line decided by system.  This is Qunix on
 348    Unix and Mac, Qdos on DOS/Windows.
 349    This has an effect only for external encoding (i.e. for output to
 350    file and process), not for in-buffer or Lisp string encoding.  */
 351 static Lisp_Object system_eol_type;
 352
 353 #ifdef emacs
 354
 355 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 356
 357 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 358
 359 /* Coding system emacs-mule and raw-text are for converting only
 360    end-of-line format.  */
 361 Lisp_Object Qemacs_mule, Qraw_text;
 362 Lisp_Object Qutf_8_emacs;
 363
 364 /* Coding-systems are handed between Emacs Lisp programs and C internal
 365    routines by the following three variables.  */
 366 /* Coding-system for reading files and receiving data from process.  */
 367 Lisp_Object Vcoding_system_for_read;
 368 /* Coding-system for writing files and sending data to process.  */
 369 Lisp_Object Vcoding_system_for_write;
 370 /* Coding-system actually used in the latest I/O.  */
 371 Lisp_Object Vlast_coding_system_used;
 372 /* Set to non-nil when an error is detected while code conversion.  */
 373 Lisp_Object Vlast_code_conversion_error;
 374 /* A vector of length 256 which contains information about special
 375    Latin codes (especially for dealing with Microsoft codes).  */
 376 Lisp_Object Vlatin_extra_code_table;
 377
 378 /* Flag to inhibit code conversion of end-of-line format.  */
 379 int inhibit_eol_conversion;
 380
 381 /* Flag to inhibit ISO2022 escape sequence detection.  */
 382 int inhibit_iso_escape_detection;
 383
 384 /* Flag to inhibit detection of binary files through null bytes.  */
 385 int inhibit_null_byte_detection;
 386
 387 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 388 int inherit_process_coding_system;
 389
 390 /* Coding system to be used to encode text for terminal display when
 391    terminal coding system is nil.  */
 392 struct coding_system safe_terminal_coding;
 393
 394 Lisp_Object Vfile_coding_system_alist;
 395 Lisp_Object Vprocess_coding_system_alist;
 396 Lisp_Object Vnetwork_coding_system_alist;
 397
 398 Lisp_Object Vlocale_coding_system;
 399
 400 #endif /* emacs */
 401
 402 /* Flag to tell if we look up translation table on character code
 403    conversion.  */
 404 Lisp_Object Venable_character_translation;
 405 /* Standard translation table to look up on decoding (reading).  */
 406 Lisp_Object Vstandard_translation_table_for_decode;
 407 /* Standard translation table to look up on encoding (writing).  */
 408 Lisp_Object Vstandard_translation_table_for_encode;
 409
 410 Lisp_Object Qtranslation_table;
 411 Lisp_Object Qtranslation_table_id;
 412 Lisp_Object Qtranslation_table_for_decode;
 413 Lisp_Object Qtranslation_table_for_encode;
 414
 415 /* Alist of charsets vs revision number.  */
 416 static Lisp_Object Vcharset_revision_table;
 417
 418 /* Default coding systems used for process I/O.  */
 419 Lisp_Object Vdefault_process_coding_system;
 420
 421 /* Char table for translating Quail and self-inserting input.  */
 422 Lisp_Object Vtranslation_table_for_input;
 423
 424 /* Two special coding systems.  */
 425 Lisp_Object Vsjis_coding_system;
 426 Lisp_Object Vbig5_coding_system;
 427
 428 /* ISO2022 section */
 429 /* XINT of element REG of CODING's coding_attr_iso_initial attribute.  */
 430 #define CODING_ISO_INITIAL(coding, reg)                 \
 431   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 432                      coding_attr_iso_initial),          \
 433                reg)))
 434 /* CODING's safe_charsets entry for CHARSET_ID, or -1 if CHARSET_ID is
 435    greater than max_charset_id or the entry is 255.  */
 436 #define CODING_ISO_REQUEST(coding, charset_id)          \
 437   (((charset_id) <= (coding)->max_charset_id            \
 438     ? ((coding)->safe_charsets[charset_id] != 255       \
 439        ? (coding)->safe_charsets[charset_id]            \
 440        : -1)                                            \
 441     : -1))
 442
 443 /* Accessors for the members of CODING->spec.iso_2022.  */
 444 #define CODING_ISO_FLAGS(coding)        \
 445   ((coding)->spec.iso_2022.flags)
 446 #define CODING_ISO_DESIGNATION(coding, reg)     \
 447   ((coding)->spec.iso_2022.current_designation[reg])
 448 #define CODING_ISO_INVOCATION(coding, plane)    \
 449   ((coding)->spec.iso_2022.current_invocation[plane])
 450 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 451   ((coding)->spec.iso_2022.single_shifting)
 452 #define CODING_ISO_BOL(coding)  \
 453   ((coding)->spec.iso_2022.bol)
 454 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 455   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 456 #define CODING_ISO_CMP_STATUS(coding)   \
 457   (&(coding)->spec.iso_2022.cmp_status)
 458 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 459   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 460 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 461   ((coding)->spec.iso_2022.embedded_utf_8)
 462
 463 /* Control characters of ISO2022.  */
 464                         /* code */      /* function */
 465 #define ISO_CODE_LF     0x0A            /* line-feed */
 466 #define ISO_CODE_CR     0x0D            /* carriage-return */
 467 #define ISO_CODE_SO     0x0E            /* shift-out */
 468 #define ISO_CODE_SI     0x0F            /* shift-in */
 469 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 470 #define ISO_CODE_ESC    0x1B            /* escape */
 471 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 472 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 473 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 474
 475 /* Every (1-byte) code of ISO2022 is classified into one of the
 476    following classes.  */
 477 enum iso_code_class_type
 478   {
 479     ISO_control_0,              /* Control codes in the range
 480                                    0x00..0x1F and 0x7F, except for the
 481                                    following 4 codes.  */
 482     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 483     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 484     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 485     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 486     ISO_control_1,              /* Control codes in the range
 487                                    0x80..0x9F, except for the
 488                                    following 3 codes.  */
 489     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 490     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 491     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 492     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 493     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 494     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 495     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 496   };
 497
 498 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
 499     `iso-flags' attribute of an iso2022 coding system.  */
 500
 501 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 502    instead of the correct short-form sequence (e.g. ESC $ A).  */
 503 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 504
 505 /* If set, reset graphic planes and registers at end-of-line to the
 506    initial state.  */
 507 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 508
 509 /* If set, reset graphic planes and registers before any control
 510    characters to the initial state.  */
 511 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 512
 513 /* If set, encode by 7-bit environment.  */
 514 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 515
 516 /* If set, use locking-shift function.  */
 517 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 518
 519 /* If set, use single-shift function.  Overwrite
 520    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 521 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 522
 523 /* If set, use designation escape sequence.  */
 524 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 525
 526 /* If set, produce revision number sequence.  */
 527 #define CODING_ISO_FLAG_REVISION        0x0080
 528
 529 /* If set, produce ISO6429's direction specifying sequence.  */
 530 #define CODING_ISO_FLAG_DIRECTION       0x0100
 531
 532 /* If set, assume designation states are reset at beginning of line on
 533    output.  */
 534 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 535
 536 /* If set, designation sequence should be placed at beginning of line
 537    on output.  */
 538 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 539
 540 /* If set, do not encode unsafe characters on output.  */
 541 #define CODING_ISO_FLAG_SAFE            0x0800
 542
 543 /* If set, extra latin codes (128..159) are accepted as a valid code
 544    on input.  */
 545 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 546 /* NOTE(review): the remaining flags are not described here.  */
 547 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 548
 549 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 550
 551 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 552
 553 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 554
 555 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 556
 557 /* A character to be produced on output if encoding of the original
 558    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 559 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 560
 561 /* UTF-8 section: accessor for the utf_8_bom member of CODING->spec.  */
 562 #define CODING_UTF_8_BOM(coding)        \
 563   ((coding)->spec.utf_8_bom)
 564
 565 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 566 #define CODING_UTF_16_BOM(coding)       \
 567   ((coding)->spec.utf_16.bom)
 568
 569 #define CODING_UTF_16_ENDIAN(coding)    \
 570   ((coding)->spec.utf_16.endian)
 571
 572 #define CODING_UTF_16_SURROGATE(coding) \
 573   ((coding)->spec.utf_16.surrogate)
 574
 575
 576 /* CCL section: these read entries of CODING's attribute vector.  */
 577 #define CODING_CCL_DECODER(coding)      \
 578   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 579 #define CODING_CCL_ENCODER(coding)      \
 580   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 581 #define CODING_CCL_VALIDS(coding)                                          \
 582   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 583
 584 /* Index for each coding category in `coding_categories'.  Each
 585    CATEGORY_MASK_XXX below is 1 shifted left by the matching value.  */
 586 enum coding_category
 587   {
 588     coding_category_iso_7,
 589     coding_category_iso_7_tight,
 590     coding_category_iso_8_1,
 591     coding_category_iso_8_2,
 592     coding_category_iso_7_else,
 593     coding_category_iso_8_else,
 594     coding_category_utf_8_auto,
 595     coding_category_utf_8_nosig,
 596     coding_category_utf_8_sig,
 597     coding_category_utf_16_auto,
 598     coding_category_utf_16_be,
 599     coding_category_utf_16_le,
 600     coding_category_utf_16_be_nosig,
 601     coding_category_utf_16_le_nosig,
 602     coding_category_charset,
 603     coding_category_sjis,
 604     coding_category_big5,
 605     coding_category_ccl,
 606     coding_category_emacs_mule,
 607     /* All above are targets of code detection.  */
 608     coding_category_raw_text,
 609     coding_category_undecided,
 610     coding_category_max
 611   };
 612
 613 /* Definitions of flag bits used in detect_coding_XXXX.  */
 614 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 615 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 616 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 617 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 618 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 619 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 620 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 621 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 622 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 623 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 624 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 625 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 626 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 627 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 628 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 629 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 630 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 631 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 632 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 633 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 634
 635 /* This value is returned if detect_coding_mask () finds nothing other
 636    than ASCII characters.  */
 637 #define CATEGORY_MASK_ANY               \
 638   (CATEGORY_MASK_ISO_7                  \
 639    | CATEGORY_MASK_ISO_7_TIGHT          \
 640    | CATEGORY_MASK_ISO_8_1              \
 641    | CATEGORY_MASK_ISO_8_2              \
 642    | CATEGORY_MASK_ISO_7_ELSE           \
 643    | CATEGORY_MASK_ISO_8_ELSE           \
 644    | CATEGORY_MASK_UTF_8_AUTO           \
 645    | CATEGORY_MASK_UTF_8_NOSIG          \
 646    | CATEGORY_MASK_UTF_8_SIG            \
 647    | CATEGORY_MASK_UTF_16_AUTO          \
 648    | CATEGORY_MASK_UTF_16_BE            \
 649    | CATEGORY_MASK_UTF_16_LE            \
 650    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 651    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 652    | CATEGORY_MASK_CHARSET              \
 653    | CATEGORY_MASK_SJIS                 \
 654    | CATEGORY_MASK_BIG5                 \
 655    | CATEGORY_MASK_CCL                  \
 656    | CATEGORY_MASK_EMACS_MULE)
 657 /* Note: CATEGORY_MASK_ANY does not include CATEGORY_MASK_RAW_TEXT.  */
 658
 659 #define CATEGORY_MASK_ISO_7BIT \
 660   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 661
 662 #define CATEGORY_MASK_ISO_8BIT \
 663   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 664
 665 #define CATEGORY_MASK_ISO_ELSE \
 666   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 667
 668 #define CATEGORY_MASK_ISO_ESCAPE        \
 669   (CATEGORY_MASK_ISO_7                  \
 670    | CATEGORY_MASK_ISO_7_TIGHT          \
 671    | CATEGORY_MASK_ISO_7_ELSE           \
 672    | CATEGORY_MASK_ISO_8_ELSE)
 673
 674 #define CATEGORY_MASK_ISO       \
 675   (  CATEGORY_MASK_ISO_7BIT     \
 676      | CATEGORY_MASK_ISO_8BIT   \
 677      | CATEGORY_MASK_ISO_ELSE)
 678
 679 #define CATEGORY_MASK_UTF_16            \
 680   (CATEGORY_MASK_UTF_16_AUTO            \
 681    | CATEGORY_MASK_UTF_16_BE            \
 682    | CATEGORY_MASK_UTF_16_LE            \
 683    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 684    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 685
 686 #define CATEGORY_MASK_UTF_8     \
 687   (CATEGORY_MASK_UTF_8_AUTO     \
 688    | CATEGORY_MASK_UTF_8_NOSIG  \
 689    | CATEGORY_MASK_UTF_8_SIG)
 690
 691 /* List of symbols `coding-category-xxx' ordered by priority.  This
 692    variable is exposed to Emacs Lisp.  */
 693 static Lisp_Object Vcoding_category_list;
 694
 695 /* Table of coding categories (Lisp symbols).  This variable is for
 696    internal use only.  */
 697 static Lisp_Object Vcoding_category_table;
 698
 699 /* Table of coding-categories ordered by priority.  */
 700 static enum coding_category coding_priorities[coding_category_max];
 701
 702 /* Nth element is a coding context for the coding system bound to the
 703    Nth coding category.  */
 704 static struct coding_system coding_categories[coding_category_max];
 705
 706 /*** Commonly used macros and functions ***/
 707 /* Fallback min/max; note each evaluates one argument twice.  */
 708 #ifndef min
 709 #define min(a, b) ((a) < (b) ? (a) : (b))
 710 #endif
 711 #ifndef max
 712 #define max(a, b) ((a) > (b) ? (a) : (b))
 713 #endif
 714 /* Set ATTRS to CODING's attributes and CHARSET_LIST to their charset list.  */
 715 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 716   do {                                                  \
 717     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 718     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 719   } while (0)
 720
 721
 722 /* Safely get one byte from the source text pointed by SRC which ends
 723    at SRC_END, and set C to that byte.  If the source is exhausted, jump
 724    to `no_more_source' (recording CODING_RESULT_INSUFFICIENT_SRC first
 725    if SRC_BASE < SRC).  If multibytep is nonzero and the byte is 0xC0
 726    or 0xC1, combine it with the next byte; for any other byte >= 0x80,
 727    set C to the negated character code and record
 728    CODING_RESULT_INVALID_SRC.  Increment consumed_chars.  Callers provide:
 729         src, src_end, src_base, multibytep, coding, consumed_chars */
 730 #define ONE_MORE_BYTE(c)                                \
 731   do {                                                  \
 732     if (src == src_end)                                 \
 733       {                                                 \
 734         if (src_base < src)                             \
 735           record_conversion_result                      \
 736             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 737         goto no_more_source;                            \
 738       }                                                 \
 739     c = *src++;                                         \
 740     if (multibytep && (c & 0x80))                       \
 741       {                                                 \
 742         if ((c & 0xFE) == 0xC0)                         \
 743           c = ((c & 1) << 6) | *src++;                  \
 744         else                                            \
 745           {                                             \
 746             src--;                                      \
 747             c = - string_char (src, &src, NULL);        \
 748             record_conversion_result                    \
 749               (coding, CODING_RESULT_INVALID_SRC);      \
 750           }                                             \
 751       }                                                 \
 752     consumed_chars++;                                   \
 753   } while (0)
 754
 755 /* Safely get two bytes from the source text pointed by SRC which ends
 756    at SRC_END, and set C1 and C2 to those bytes while skipping the
 757    heading multibyte characters.  If there are not enough bytes in the
 758    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 759    a multibyte character is found for C2, set C2 to -1 (SRC is advanced
 760    past its first byte only).  The caller should declare and set these
 761    variables appropriately in advance:
 762         src, src_end, multibytep
 763    It is intended that this macro is used in detect_coding_utf_16.  */
 764
 765 #define TWO_MORE_BYTES(c1, c2)                          \
 766   do {                                                  \
 767     do {                                                \
 768       if (src == src_end)                               \
 769         goto no_more_source;                            \
 770       c1 = *src++;                                      \
 771       if (multibytep && (c1 & 0x80))                    \
 772         {                                               \
 773           if ((c1 & 0xFE) == 0xC0)                      \
 774             c1 = ((c1 & 1) << 6) | *src++;              \
 775           else                                          \
 776             {                                           \
 777               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 778               c1 = -1;                                  \
 779             }                                           \
 780         }                                               \
 781     } while (c1 < 0);                                   \
 782     if (src == src_end)                                 \
 783       goto no_more_source;                              \
 784     c2 = *src++;                                        \
 785     if (multibytep && (c2 & 0x80))                      \
 786       {                                                 \
 787         if ((c2 & 0xFE) == 0xC0)                        \
 788           c2 = ((c2 & 1) << 6) | *src++;                \
 789         else                                            \
 790           c2 = -1;                                      \
 791       }                                                 \
 792   } while (0)
 793
 794
 795 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 796   do {                                                  \
 797     c = *src++;                                         \
 798     if (multibytep && (c & 0x80))                       \
 799       {                                                 \
 800         if ((c & 0xFE) == 0xC0)                         \
 801           c = ((c & 1) << 6) | *src++;                  \
 802         else                                            \
 803           {                                             \
 804             src--;                                      \
 805             c = - string_char (src, &src, NULL);        \
 806             record_conversion_result                    \
 807               (coding, CODING_RESULT_INVALID_SRC);      \
 808           }                                             \
 809       }                                                 \
 810     consumed_chars++;                                   \
 811   } while (0)
 812
 813
 814 /* Store a byte C in the place pointed by DST and increment DST to the
 815    next free point, and increment PRODUCED_CHARS.  The caller should
 816    assure that C is 0..127, and declare and set the variable `dst'
 817    appropriately in advance.
 818 */
 819
 820
 821 #define EMIT_ONE_ASCII_BYTE(c)  \
 822   do {                          \
 823     produced_chars++;           \
 824     *dst++ = (c);               \
 825   } while (0)
 826
 827
 828 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 829
 830 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 831   do {                                  \
 832     produced_chars += 2;                \
 833     *dst++ = (c1), *dst++ = (c2);       \
 834   } while (0)
 835
 836
 837 /* Store a byte C in the place pointed by DST and increment DST to the
 838    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 839    nonzero, store in an appropriate multibyte from.  The caller should
 840    declare and set the variables `dst' and `multibytep' appropriately
 841    in advance.  */
 842
 843 #define EMIT_ONE_BYTE(c)                \
 844   do {                                  \
 845     produced_chars++;                   \
 846     if (multibytep)                     \
 847       {                                 \
 848         int ch = (c);                   \
 849         if (ch >= 0x80)                 \
 850           ch = BYTE8_TO_CHAR (ch);      \
 851         CHAR_STRING_ADVANCE (ch, dst);  \
 852       }                                 \
 853     else                                \
 854       *dst++ = (c);                     \
 855   } while (0)
 856
 857
 858 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 859
 860 #define EMIT_TWO_BYTES(c1, c2)          \
 861   do {                                  \
 862     produced_chars += 2;                \
 863     if (multibytep)                     \
 864       {                                 \
 865         int ch;                         \
 866                                         \
 867         ch = (c1);                      \
 868         if (ch >= 0x80)                 \
 869           ch = BYTE8_TO_CHAR (ch);      \
 870         CHAR_STRING_ADVANCE (ch, dst);  \
 871         ch = (c2);                      \
 872         if (ch >= 0x80)                 \
 873           ch = BYTE8_TO_CHAR (ch);      \
 874         CHAR_STRING_ADVANCE (ch, dst);  \
 875       }                                 \
 876     else                                \
 877       {                                 \
 878         *dst++ = (c1);                  \
 879         *dst++ = (c2);                  \
 880       }                                 \
 881   } while (0)
 882
 883
 884 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 885   do {                                  \
 886     EMIT_ONE_BYTE (c1);                 \
 887     EMIT_TWO_BYTES (c2, c3);            \
 888   } while (0)
 889
 890
 891 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 892   do {                                          \
 893     EMIT_TWO_BYTES (c1, c2);                    \
 894     EMIT_TWO_BYTES (c3, c4);                    \
 895   } while (0)
 896
 897
 898 /* Prototypes for static functions.  */
 899 static void record_conversion_result P_ ((struct coding_system *coding,
 900                                           enum coding_result_code result));
 901 static int detect_coding_utf_8 P_ ((struct coding_system *,
 902                                     struct coding_detection_info *info));
 903 static void decode_coding_utf_8 P_ ((struct coding_system *));
 904 static int encode_coding_utf_8 P_ ((struct coding_system *));
 905
 906 static int detect_coding_utf_16 P_ ((struct coding_system *,
 907                                      struct coding_detection_info *info));
 908 static void decode_coding_utf_16 P_ ((struct coding_system *));
 909 static int encode_coding_utf_16 P_ ((struct coding_system *));
 910
 911 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 912                                        struct coding_detection_info *info));
 913 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 914 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 915
 916 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 917                                          struct coding_detection_info *info));
 918 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 919 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 920
 921 static int detect_coding_sjis P_ ((struct coding_system *,
 922                                    struct coding_detection_info *info));
 923 static void decode_coding_sjis P_ ((struct coding_system *));
 924 static int encode_coding_sjis P_ ((struct coding_system *));
 925
 926 static int detect_coding_big5 P_ ((struct coding_system *,
 927                                    struct coding_detection_info *info));
 928 static void decode_coding_big5 P_ ((struct coding_system *));
 929 static int encode_coding_big5 P_ ((struct coding_system *));
 930
 931 static int detect_coding_ccl P_ ((struct coding_system *,
 932                                   struct coding_detection_info *info));
 933 static void decode_coding_ccl P_ ((struct coding_system *));
 934 static int encode_coding_ccl P_ ((struct coding_system *));
 935
 936 static void decode_coding_raw_text P_ ((struct coding_system *));
 937 static int encode_coding_raw_text P_ ((struct coding_system *));
 938
 939 static void coding_set_source P_ ((struct coding_system *));
 940 static void coding_set_destination P_ ((struct coding_system *));
 941 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 942 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 943                                             EMACS_INT, EMACS_INT));
 944 static unsigned char *alloc_destination P_ ((struct coding_system *,
 945                                              EMACS_INT, unsigned char *));
 946 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 947 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 948                                                      int *, int *,
 949                                                      unsigned char *));
 950 static int detect_eol P_ ((const unsigned char *,
 951                            EMACS_INT, enum coding_category));
 952 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 953 static void decode_eol P_ ((struct coding_system *));
 954 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 955 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
 956 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 957 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 958                                         EMACS_INT));
 959 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 960 static int decode_coding P_ ((struct coding_system *));
 961 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 962                                                       struct coding_system *,
 963                                                       int *, EMACS_INT *));
 964 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 965                                                   struct coding_system *,
 966                                                   int *, EMACS_INT *));
 967 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 968 static int encode_coding P_ ((struct coding_system *));
 969 static Lisp_Object make_conversion_work_buffer P_ ((int));
 970 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 971 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 972 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 973
 974 static void
 975 record_conversion_result (struct coding_system *coding,
 976                           enum coding_result_code result)
 977 {
 978   coding->result = result;
 979   switch (result)
 980     {
 981     case CODING_RESULT_INSUFFICIENT_SRC:
 982       Vlast_code_conversion_error = Qinsufficient_source;
 983       break;
 984     case CODING_RESULT_INCONSISTENT_EOL:
 985       Vlast_code_conversion_error = Qinconsistent_eol;
 986       break;
 987     case CODING_RESULT_INVALID_SRC:
 988       Vlast_code_conversion_error = Qinvalid_source;
 989       break;
 990     case CODING_RESULT_INTERRUPT:
 991       Vlast_code_conversion_error = Qinterrupted;
 992       break;
 993     case CODING_RESULT_INSUFFICIENT_MEM:
 994       Vlast_code_conversion_error = Qinsufficient_memory;
 995       break;
 996     case CODING_RESULT_INSUFFICIENT_DST:
 997       /* Don't record this error in Vlast_code_conversion_error
 998          because it happens just temporarily and is resolved when the
 999          whole conversion is finished.  */
1000       break;
1001     case CODING_RESULT_SUCCESS:
1002       break;
1003     default:
1004       Vlast_code_conversion_error = intern ("Unknown error");
1005     }
1006 }
1007
/* Decode CODE in CHARSET into C with DECODE_CHAR while keeping the
   pointers SRC, SRC_BASE and SRC_END valid.  decode_char may load a
   charset map, which allocates large structures and can therefore
   relocate buffer text.  When `charset_map_loaded' reports that this
   happened, CODING->source is refreshed with coding_set_source and
   the three pointers are shifted by the distance it moved.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        const unsigned char *old_source = coding->source;                    \
        EMACS_INT shift;                                                     \
                                                                             \
        coding_set_source (coding);                                          \
        shift = coding->source - old_source;                                 \
        src_end += shift;                                                    \
        src_base += shift;                                                   \
        src += shift;                                                        \
      }                                                                      \
  } while (0)
1028
1029
/* Make sure DST has room for BYTES more bytes.  If DST + BYTES would
   reach or pass DST_END, enlarge coding->destination with
   alloc_destination by BYTES plus the number of elements still
   pending in the charbuf (charbuf_end - charbuf), then update DST
   and DST_END to point into the new destination.

   We don't have to take care of coding->source, which may be
   relocated as well; that is handled by calling coding_set_source in
   encode_coding.

   The caller must have `coding', `dst', `dst_end', `charbuf' and
   `charbuf_end' in scope.  */

#define ASSURE_DESTINATION(bytes)                                       \
  do {                                                                  \
    if (dst + (bytes) >= dst_end)                                       \
      {                                                                 \
        /* EMACS_INT, to match alloc_destination's parameter and        \
           avoid narrowing the pointer difference to int.  */           \
        EMACS_INT more_bytes = charbuf_end - charbuf + (bytes);         \
                                                                        \
        dst = alloc_destination (coding, more_bytes, dst);              \
        dst_end = coding->destination + coding->dst_bytes;              \
      }                                                                 \
  } while (0)
1045
1046
/* Store the multibyte form of the character C at P, and advance P to
   the end of that multibyte form.  This is like CHAR_STRING_ADVANCE
   but it never calls MAYBE_UNIFY_CHAR.

   Characters up to MAX_4_BYTE_CHAR are written as one to four bytes;
   characters up to MAX_5_BYTE_CHAR as five bytes headed by 0xF8; and
   anything above that is written with BYTE8_STRING after subtracting
   0x3FFF80.

   Both arguments are evaluated several times, so they must be free
   of side effects.  Every use of C and P is parenthesized, so an
   expression such as `base + offset' may be passed for C.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, (p));        \
  } while (0)
1076
1077
/* Yield the character code of the character whose multibyte form
   starts at P, and advance P past that multibyte form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length (1 to 5 bytes) is chosen from the high-order bits of
   the first byte.  A two-byte form whose first byte is below 0xC2
   has 0x3FFF80 merged into the result.  In the five-byte form only
   the four trailing bytes contribute to the code.  P is evaluated
   several times, so it must be free of side effects.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                                 \
  (((p)[0] & 0x80) == 0                                                 \
   ? *(p)++                                                             \
   : ((p)[0] & 0x20) == 0                                               \
   ? ((p) += 2,                                                         \
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)                \
       | (((p)[-2] & 0x1F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : ((p)[0] & 0x10) == 0                                               \
   ? ((p) += 3,                                                         \
      (((p)[-1] & 0x3F)                                                 \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | (((p)[-3] & 0x0F) << 12)))                                     \
   : ((p)[0] & 0x08) == 0                                               \
   ? ((p) += 4,                                                         \
      (((p)[-1] & 0x3F)                                                 \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-4] & 0xF) << 18)))                                      \
   : ((p) += 5,                                                         \
      (((p)[-1] & 0x3F)                                                 \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-4] & 0x3F) << 18))))
1106
1107
1108 static void
1109 coding_set_source (coding)
1110      struct coding_system *coding;
1111 {
1112   if (BUFFERP (coding->src_object))
1113     {
1114       struct buffer *buf = XBUFFER (coding->src_object);
1115
1116       if (coding->src_pos < 0)
1117         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1118       else
1119         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1120     }
1121   else if (STRINGP (coding->src_object))
1122     {
1123       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1124     }
1125   else
1126     /* Otherwise, the source is C string and is never relocated
1127        automatically.  Thus we don't have to update anything.  */
1128     ;
1129 }
1130
1131 static void
1132 coding_set_destination (coding)
1133      struct coding_system *coding;
1134 {
1135   if (BUFFERP (coding->dst_object))
1136     {
1137       if (coding->src_pos < 0)
1138         {
1139           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1140           coding->dst_bytes = (GAP_END_ADDR
1141                                - (coding->src_bytes - coding->consumed)
1142                                - coding->destination);
1143         }
1144       else
1145         {
1146           /* We are sure that coding->dst_pos_byte is before the gap
1147              of the buffer. */
1148           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1149                                  + coding->dst_pos_byte - BEG_BYTE);
1150           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1151                                - coding->destination);
1152         }
1153     }
1154   else
1155     /* Otherwise, the destination is C string and is never relocated
1156        automatically.  Thus we don't have to update anything.  */
1157     ;
1158 }
1159
1160
1161 static void
1162 coding_alloc_by_realloc (coding, bytes)
1163      struct coding_system *coding;
1164      EMACS_INT bytes;
1165 {
1166   coding->destination = (unsigned char *) xrealloc (coding->destination,
1167                                                     coding->dst_bytes + bytes);
1168   coding->dst_bytes += bytes;
1169 }
1170
1171 static void
1172 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1173      struct coding_system *coding;
1174      EMACS_INT gap_head_used, bytes;
1175 {
1176   if (EQ (coding->src_object, coding->dst_object))
1177     {
1178       /* The gap may contain the produced data at the head and not-yet
1179          consumed data at the tail.  To preserve those data, we at
1180          first make the gap size to zero, then increase the gap
1181          size.  */
1182       EMACS_INT add = GAP_SIZE;
1183
1184       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1185       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1186       make_gap (bytes);
1187       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1188       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1189     }
1190   else
1191     {
1192       Lisp_Object this_buffer;
1193
1194       this_buffer = Fcurrent_buffer ();
1195       set_buffer_internal (XBUFFER (coding->dst_object));
1196       make_gap (bytes);
1197       set_buffer_internal (XBUFFER (this_buffer));
1198     }
1199 }
1200
1201
1202 static unsigned char *
1203 alloc_destination (coding, nbytes, dst)
1204      struct coding_system *coding;
1205      EMACS_INT nbytes;
1206      unsigned char *dst;
1207 {
1208   EMACS_INT offset = dst - coding->destination;
1209
1210   if (BUFFERP (coding->dst_object))
1211     {
1212       struct buffer *buf = XBUFFER (coding->dst_object);
1213
1214       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1215     }
1216   else
1217     coding_alloc_by_realloc (coding, nbytes);
1218   coding_set_destination (coding);
1219   dst = coding->destination + offset;
1220   return dst;
1221 }
1222
1223 /** Macros for annotations.  */
1224
1225 /* An annotation data is stored in the array coding->charbuf in this
1226    format:
1227      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1228    LENGTH is the number of elements in the annotation.
1229    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1230    NCHARS is the number of characters in the text annotated.
1231
1232    The format of the following elements depend on ANNOTATION_MASK.
1233
1234    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1235    follows:
1236      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1237
1238    NBYTES is the number of bytes specified in the header part of
1239    old-style emacs-mule encoding, or 0 for the other kind of
1240    composition.
1241
1242    METHOD is one of enum composition_method.
1243
1244    Optional COMPOSITION-COMPONENTS are characters and composition
1245    rules.
1246
1247    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1248    follows.
1249
1250    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1251    recover from an invalid annotation, and should be skipped by
1252    produce_annotation.  */
1253
1254 /* Maximum length of the header of annotation data.  Of the
        annotation macros that follow, ADD_COMPOSITION_DATA passes the
        largest length, 5.  */
1255 #define MAX_ANNOTATION_LENGTH 5
1256
/* Write the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and mark CODING as annotated.  The caller
   must have `coding' in scope.

   The expansion is a single statement without a trailing semicolon,
   so the macro can be used as the body of an `if' that has an
   `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1264
/* Write a composition annotation at BUF and advance BUF past it: the
   common header (length 5, CODING_ANNOTATE_COMPOSITION_MASK, NCHARS)
   followed by NBYTES and METHOD.  BUF is evaluated several times and
   must be a modifiable pointer expression free of side effects; all
   arguments are parenthesized so that an expression such as `*pp'
   can be passed for BUF.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1271
1272
/* Write a charset annotation at BUF and advance BUF past it: the
   common header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  BUF is evaluated several times and
   must be a modifiable pointer expression free of side effects; all
   arguments are parenthesized so that an expression such as `*pp'
   can be passed for BUF.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1278
1279 \f
1280 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1281
1282
1283
1284 \f
1285 /*** 3. UTF-8 ***/
1286
1287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1288    detect_coding_utf_8, defined after the helper macros below, checks
1289    if a text is encoded in UTF-8.  If it is, return 1, else return 0.  */
1290
     /* Predicates classifying one octet C of a UTF-8 sequence by its
        high-order bits.  Apart from UTF_8_1_OCTET_P, each one masks C
        and compares the result with a fixed bit pattern.  */

     /* C is below 0x80.  Note that this is a plain comparison, so it
        also holds for a negative C.  */
1291 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
     /* C has the form 10xxxxxx: a trailing octet.  */
1292 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
     /* C has the form 110xxxxx: leading octet of a 2-octet sequence.  */
1293 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
     /* C has the form 1110xxxx: leading octet of a 3-octet sequence.  */
1294 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
     /* C has the form 11110xxx: leading octet of a 4-octet sequence.  */
1295 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
     /* C has the form 111110xx: leading octet of a 5-octet sequence.  */
1296 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1297
     /* The byte order mark character, and the three octets 0xEF 0xBB
        0xBF that encode it in UTF-8.  The UTF-8 functions below compare
        the head of the text against these octets.  */
1298 #define UTF_BOM 0xFEFF
1299 #define UTF_8_BOM_1 0xEF
1300 #define UTF_8_BOM_2 0xBB
1301 #define UTF_8_BOM_3 0xBF
1302
1303 static int
1304 detect_coding_utf_8 (coding, detect_info)
1305      struct coding_system *coding;
1306      struct coding_detection_info *detect_info;
1307 {
1308   const unsigned char *src = coding->source, *src_base;
1309   const unsigned char *src_end = coding->source + coding->src_bytes;
1310   int multibytep = coding->src_multibyte;
1311   int consumed_chars = 0;
1312   int bom_found = 0;
1313   int found = 0;
1314
1315   detect_info->checked |= CATEGORY_MASK_UTF_8;
1316   /* A coding system of this category is always ASCII compatible.  */
1317   src += coding->head_ascii;
1318
1319   while (1)
1320     {
1321       int c, c1, c2, c3, c4;
1322
1323       src_base = src;
1324       ONE_MORE_BYTE (c);
1325       if (c < 0 || UTF_8_1_OCTET_P (c))
1326         continue;
1327       ONE_MORE_BYTE (c1);
1328       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1329         break;
1330       if (UTF_8_2_OCTET_LEADING_P (c))
1331         {
1332           found = 1;
1333           continue;
1334         }
1335       ONE_MORE_BYTE (c2);
1336       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1337         break;
1338       if (UTF_8_3_OCTET_LEADING_P (c))
1339         {
1340           found = 1;
1341           if (src_base == coding->source
1342               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1343             bom_found = 1;
1344           continue;
1345         }
1346       ONE_MORE_BYTE (c3);
1347       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1348         break;
1349       if (UTF_8_4_OCTET_LEADING_P (c))
1350         {
1351           found = 1;
1352           continue;
1353         }
1354       ONE_MORE_BYTE (c4);
1355       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1356         break;
1357       if (UTF_8_5_OCTET_LEADING_P (c))
1358         {
1359           found = 1;
1360           continue;
1361         }
1362       break;
1363     }
1364   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1365   return 0;
1366
1367  no_more_source:
1368   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1369     {
1370       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1371       return 0;
1372     }
1373   if (bom_found)
1374     {
1375       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1376       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1377     }
1378   else
1379     {
1380       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1381       if (found)
1382         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1383     }
1384   return 1;
1385 }
1386
1387
1388 static void
1389 decode_coding_utf_8 (coding)
1390      struct coding_system *coding;
1391 {
1392   const unsigned char *src = coding->source + coding->consumed;
1393   const unsigned char *src_end = coding->source + coding->src_bytes;
1394   const unsigned char *src_base;
1395   int *charbuf = coding->charbuf + coding->charbuf_used;
1396   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1397   int consumed_chars = 0, consumed_chars_base = 0;
1398   int multibytep = coding->src_multibyte;
1399   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1400   Lisp_Object attr, charset_list;
1401   int eol_crlf =
1402     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1403   int byte_after_cr = -1;
1404
1405   CODING_GET_INFO (coding, attr, charset_list);
1406
1407   if (bom != utf_without_bom)
1408     {
1409       int c1, c2, c3;
1410
1411       src_base = src;
1412       ONE_MORE_BYTE (c1);
1413       if (! UTF_8_3_OCTET_LEADING_P (c1))
1414         src = src_base;
1415       else
1416         {
1417           ONE_MORE_BYTE (c2);
1418           if (! UTF_8_EXTRA_OCTET_P (c2))
1419             src = src_base;
1420           else
1421             {
1422               ONE_MORE_BYTE (c3);
1423               if (! UTF_8_EXTRA_OCTET_P (c3))
1424                 src = src_base;
1425               else
1426                 {
1427                   if ((c1 != UTF_8_BOM_1)
1428                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1429                     src = src_base;
1430                   else
1431                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1432                 }
1433             }
1434         }
1435     }
1436   CODING_UTF_8_BOM (coding) = utf_without_bom;
1437
1438
1439
1440   while (1)
1441     {
1442       int c, c1, c2, c3, c4, c5;
1443
1444       src_base = src;
1445       consumed_chars_base = consumed_chars;
1446
1447       if (charbuf >= charbuf_end)
1448         {
1449           if (byte_after_cr >= 0)
1450             src_base--;
1451           break;
1452         }
1453
1454       if (byte_after_cr >= 0)
1455         c1 = byte_after_cr, byte_after_cr = -1;
1456       else
1457         ONE_MORE_BYTE (c1);
1458       if (c1 < 0)
1459         {
1460           c = - c1;
1461         }
1462       else if (UTF_8_1_OCTET_P(c1))
1463         {
1464           if (eol_crlf && c1 == '\r')
1465             ONE_MORE_BYTE (byte_after_cr);
1466           c = c1;
1467         }
1468       else
1469         {
1470           ONE_MORE_BYTE (c2);
1471           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1472             goto invalid_code;
1473           if (UTF_8_2_OCTET_LEADING_P (c1))
1474             {
1475               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1476               /* Reject overlong sequences here and below.  Encoders
1477                  producing them are incorrect, they can be misleading,
1478                  and they mess up read/write invariance.  */
1479               if (c < 128)
1480                 goto invalid_code;
1481             }
1482           else
1483             {
1484               ONE_MORE_BYTE (c3);
1485               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1486                 goto invalid_code;
1487               if (UTF_8_3_OCTET_LEADING_P (c1))
1488                 {
1489                   c = (((c1 & 0xF) << 12)
1490                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1491                   if (c < 0x800
1492                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1493                     goto invalid_code;
1494                 }
1495               else
1496                 {
1497                   ONE_MORE_BYTE (c4);
1498                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1499                     goto invalid_code;
1500                   if (UTF_8_4_OCTET_LEADING_P (c1))
1501                     {
1502                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1503                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1504                     if (c < 0x10000)
1505                       goto invalid_code;
1506                     }
1507                   else
1508                     {
1509                       ONE_MORE_BYTE (c5);
1510                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1511                         goto invalid_code;
1512                       if (UTF_8_5_OCTET_LEADING_P (c1))
1513                         {
1514                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1515                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1516                                | (c5 & 0x3F));
1517                           if ((c > MAX_CHAR) || (c < 0x200000))
1518                             goto invalid_code;
1519                         }
1520                       else
1521                         goto invalid_code;
1522                     }
1523                 }
1524             }
1525         }
1526
1527       *charbuf++ = c;
1528       continue;
1529
1530     invalid_code:
1531       src = src_base;
1532       consumed_chars = consumed_chars_base;
1533       ONE_MORE_BYTE (c);
1534       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1535       coding->errors++;
1536     }
1537
1538  no_more_source:
1539   coding->consumed_char += consumed_chars_base;
1540   coding->consumed = src_base - coding->source;
1541   coding->charbuf_used = charbuf - coding->charbuf;
1542 }
1543
1544
/* Encode the character codes held in CODING->charbuf (the first
   CODING->charbuf_used elements) as UTF-8 and store the bytes in
   CODING->destination, starting at offset CODING->produced.

   If CODING_UTF_8_BOM (CODING) is utf_with_bom, the three BOM bytes
   are emitted first and the setting is changed to utf_without_bom.

   A character for which CHAR_BYTE8_P is true is written as the single
   byte CHAR_TO_BYTE8 (C); any other character is written as the bytes
   generated by CHAR_STRING_ADVANCE_NO_UNIFY.

   Before returning, CODING_RESULT_SUCCESS is recorded and
   CODING->produced_char and CODING->produced are updated.  The return
   value is always 0.

   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   outside this function and presumably refer to the locals below
   (dst, dst_end, charbuf, multibytep, produced_chars, ...) by name;
   confirm before renaming or removing any of them.  */
1545 static int
1546 encode_coding_utf_8 (coding)
1547      struct coding_system *coding;
1548 {
1549   int multibytep = coding->dst_multibyte;
1550   int *charbuf = coding->charbuf;
1551   int *charbuf_end = charbuf + coding->charbuf_used;
1552   unsigned char *dst = coding->destination + coding->produced;
1553   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1554   int produced_chars = 0;
1555   int c;
1556
1557   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1558     {
1559       ASSURE_DESTINATION (3);
1560       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1561       CODING_UTF_8_BOM (coding) = utf_without_bom;
1562     }
1563
1564   if (multibytep)
1565     {
           /* Multibyte destination: every output byte is passed through
              EMIT_ONE_BYTE.  */
1566       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1567
1568       while (charbuf < charbuf_end)
1569         {
               /* STR receives the bytes of one character; PEND ends up
                  just past the last byte stored (see the loop below).  */
1570           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1571
1572           ASSURE_DESTINATION (safe_room);
1573           c = *charbuf++;
1574           if (CHAR_BYTE8_P (c))
1575             {
1576               c = CHAR_TO_BYTE8 (c);
1577               EMIT_ONE_BYTE (c);
1578             }
1579           else
1580             {
1581               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1582               for (p = str; p < pend; p++)
1583                 EMIT_ONE_BYTE (*p);
1584             }
1585         }
1586     }
1587   else
1588     {
           /* Unibyte destination: bytes are stored directly at DST and
              PRODUCED_CHARS is advanced once per source character.  */
1589       int safe_room = MAX_MULTIBYTE_LENGTH;
1590
1591       while (charbuf < charbuf_end)
1592         {
1593           ASSURE_DESTINATION (safe_room);
1594           c = *charbuf++;
1595           if (CHAR_BYTE8_P (c))
1596             *dst++ = CHAR_TO_BYTE8 (c);
1597           else
1598             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1599           produced_chars++;
1600         }
1601     }
1602   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1603   coding->produced_char += produced_chars;
1604   coding->produced = dst - coding->destination;
1605   return 0;
1606 }
1607
1608
1609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1610    Check if a text is encoded in one of UTF-16 based coding systems.
1611    If it is, return 1, else return 0.  */
1612
/* Nonzero if VAL, taken as a 16-bit code unit, is a UTF-16 high
   (leading) surrogate, i.e. lies in 0xD800..0xDBFF.  Only the bits
   selected by the mask 0xFC00 are compared.  */
1613 #define UTF_16_HIGH_SURROGATE_P(val) \
1614   (((val) & 0xFC00) == 0xD800)
1615
/* Nonzero if VAL, taken as a 16-bit code unit, is a UTF-16 low
   (trailing) surrogate, i.e. lies in 0xDC00..0xDFFF.  Only the bits
   selected by the mask 0xFC00 are compared.  */
1616 #define UTF_16_LOW_SURROGATE_P(val) \
1617   (((val) & 0xFC00) == 0xDC00)
1618
/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  VAL may be
   evaluated up to three times.  */
1619 #define UTF_16_INVALID_P(val)   \
1620   (((val) == 0xFFFE)            \
1621    || ((val) == 0xFFFF)         \
1622    || UTF_16_LOW_SURROGATE_P (val))
1623
1624
/* Examine the source of CODING for UTF-16 and record the outcome in
   DETECT_INFO.  CATEGORY_MASK_UTF_16 is always added to
   DETECT_INFO->checked.

   If this is the last block and CODING->src_chars is odd, every
   UTF-16 category is rejected and 0 is returned.  Otherwise the first
   two bytes decide:

     FF FE    UTF_16_LE and UTF_16_AUTO are found; UTF_16_BE and both
              NOSIG categories are rejected.
     FE FF    UTF_16_BE and UTF_16_AUTO are found; UTF_16_LE and both
              NOSIG categories are rejected.
     C2 < 0   every UTF-16 category is rejected; return 0.
     other    the three signature categories are rejected and the
              rest of the source is scanned pairwise (see below).

   Return 1 when a BOM was found or when control reaches the label
   no_more_source; return 0 otherwise.

   NOTE(review): no statement in this function jumps to
   no_more_source explicitly, so TWO_MORE_BYTES (defined elsewhere)
   presumably does so when fewer than two bytes remain -- confirm.  */
1625 static int
1626 detect_coding_utf_16 (coding, detect_info)
1627      struct coding_system *coding;
1628      struct coding_detection_info *detect_info;
1629 {
1630   const unsigned char *src = coding->source, *src_base = src;
1631   const unsigned char *src_end = coding->source + coding->src_bytes;
1632   int multibytep = coding->src_multibyte;
1633   int consumed_chars = 0;
1634   int c1, c2;
1635
1636   detect_info->checked |= CATEGORY_MASK_UTF_16;
1637   if (coding->mode & CODING_MODE_LAST_BLOCK
1638       && (coding->src_chars & 1))
1639     {
1640       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1641       return 0;
1642     }
1643
1644   TWO_MORE_BYTES (c1, c2);
1645   if ((c1 == 0xFF) && (c2 == 0xFE))
1646     {
1647       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1648                              | CATEGORY_MASK_UTF_16_AUTO);
1649       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1650                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1651                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1652     }
1653   else if ((c1 == 0xFE) && (c2 == 0xFF))
1654     {
1655       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1656                              | CATEGORY_MASK_UTF_16_AUTO);
1657       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1658                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1659                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1660     }
1661   else if (c2 < 0)
1662     {
1663       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1664       return 0;
1665     }
1666   else
1667     {
1668       /* We check the dispersion of Eth and Oth bytes where E is even and
1669          O is odd.  If both are high, we assume binary data.*/
           /* E[B] (resp. O[B]) is set once byte value B has been seen as
              the first (resp. second) byte of a pair; E_NUM and O_NUM
              count the distinct values seen, starting with the pair
              already read above.  */
1670       unsigned char e[256], o[256];
1671       unsigned e_num = 1, o_num = 1;
1672
1673       memset (e, 0, 256);
1674       memset (o, 0, 256);
1675       e[c1] = 1;
1676       o[c2] = 1;
1677
1678       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1679                                 |CATEGORY_MASK_UTF_16_BE
1680                                 | CATEGORY_MASK_UTF_16_LE);
1681
           /* Scan until both remaining (NOSIG) categories are rejected
              too, a negative C2 is seen, or the source runs out.  */
1682       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1683              != CATEGORY_MASK_UTF_16)
1684         {
1685           TWO_MORE_BYTES (c1, c2);
1686           if (c2 < 0)
1687             break;
               /* 128 or more distinct first-byte values reject the
                  big-endian no-signature category ...  */
1688           if (! e[c1])
1689             {
1690               e[c1] = 1;
1691               e_num++;
1692               if (e_num >= 128)
1693                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1694             }
               /* ... and 128 or more distinct second-byte values reject
                  the little-endian one.  */
1695           if (! o[c2])
1696             {
1697               o[c2] = 1;
1698               o_num++;
1699               if (o_num >= 128)
1700                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1701             }
1702         }
1703       return 0;
1704     }
1705
1706  no_more_source:
1707   return 1;
1708 }
1709
/* Decode the UTF-16 byte sequence in CODING->source, starting at
   offset CODING->consumed, and append the resulting character codes
   to CODING->charbuf.

   The byte order is CODING_UTF_16_ENDIAN (CODING).  When
   CODING_UTF_16_BOM (CODING) is utf_with_bom, the first two bytes are
   expected to be the BOM for that byte order; if they are not, they
   are re-read as ordinary data and CODING->errors is incremented.
   For both utf_with_bom and utf_detect_bom the setting is changed to
   utf_without_bom.

   A high surrogate is kept in CODING_UTF_16_SURROGATE (CODING) until
   the next code unit is read; with a following low surrogate it forms
   one character of 0x10000 or above.  Anything else after a pending
   high surrogate is counted in CODING->errors.

   On exit CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated; the consumed counts correspond to
   SRC_BASE, i.e. the start of the last loop iteration entered.  */
1710 static void
1711 decode_coding_utf_16 (coding)
1712      struct coding_system *coding;
1713 {
1714   const unsigned char *src = coding->source + coding->consumed;
1715   const unsigned char *src_end = coding->source + coding->src_bytes;
1716   const unsigned char *src_base;
1717   int *charbuf = coding->charbuf + coding->charbuf_used;
1718   /* We may produce at most 3 chars in one loop.  */
1719   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1720   int consumed_chars = 0, consumed_chars_base = 0;
1721   int multibytep = coding->src_multibyte;
1722   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1723   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1724   int surrogate = CODING_UTF_16_SURROGATE (coding);
1725   Lisp_Object attr, charset_list;
1726   int eol_crlf =
1727     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes read ahead after a CR when EOL_CRLF is set, or -1
          when nothing is pending.  */
1728   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1729
1730   CODING_GET_INFO (coding, attr, charset_list);
1731
1732   if (bom == utf_with_bom)
1733     {
1734       int c, c1, c2;
1735
1736       src_base = src;
1737       ONE_MORE_BYTE (c1);
1738       ONE_MORE_BYTE (c2);
1739       c = (c1 << 8) | c2;
1740
1741       if (endian == utf_16_big_endian
1742           ? c != 0xFEFF : c != 0xFFFE)
1743         {
1744           /* The first two bytes are not BOM.  Treat them as bytes
1745              for a normal character.  */
               /* NOTE(review): SRC is rewound here but CONSUMED_CHARS is
                  not; if ONE_MORE_BYTE advances CONSUMED_CHARS, the
                  character count may end up too large -- confirm.  */
1746           src = src_base;
1747           coding->errors++;
1748         }
1749       CODING_UTF_16_BOM (coding) = utf_without_bom;
1750     }
1751   else if (bom == utf_detect_bom)
1752     {
1753       /* We have already tried to detect BOM and failed in
1754          detect_coding.  */
1755       CODING_UTF_16_BOM (coding) = utf_without_bom;
1756     }
1757
1758   while (1)
1759     {
1760       int c, c1, c2;
1761
1762       src_base = src;
1763       consumed_chars_base = consumed_chars;
1764
1765       if (charbuf >= charbuf_end)
1766         {
               /* Two bytes were read ahead after a CR but have not been
                  decoded yet; report them as not consumed.  */
1767           if (byte_after_cr1 >= 0)
1768             src_base -= 2;
1769           break;
1770         }
1771
1772       if (byte_after_cr1 >= 0)
1773         c1 = byte_after_cr1, byte_after_cr1 = -1;
1774       else
1775         ONE_MORE_BYTE (c1);
           /* NOTE(review): a negative value presumably is the negated
              code of a multibyte character met in the source; it is
              stored as a character of its own -- confirm against
              ONE_MORE_BYTE.  */
1776       if (c1 < 0)
1777         {
1778           *charbuf++ = -c1;
1779           continue;
1780         }
1781       if (byte_after_cr2 >= 0)
1782         c2 = byte_after_cr2, byte_after_cr2 = -1;
1783       else
1784         ONE_MORE_BYTE (c2);
1785       if (c2 < 0)
1786         {
1787           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1788           *charbuf++ = -c2;
1789           continue;
1790         }
1791       c = (endian == utf_16_big_endian
1792            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1793
1794       if (surrogate)
1795         {
               /* A high surrogate is pending.  */
1796           if (! UTF_16_LOW_SURROGATE_P (c))
1797             {
                   /* Not followed by a low surrogate: store the two bytes
                      of the pending surrogate, in source byte order, as
                      two separate values and count an error.
                      NOTE(review): when C is not a high surrogate
                      either, SURROGATE is left unchanged, so the next
                      code unit is again handled as following the same
                      surrogate -- confirm whether this is intended.  */
1798               if (endian == utf_16_big_endian)
1799                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1800               else
1801                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1802               *charbuf++ = c1;
1803               *charbuf++ = c2;
1804               coding->errors++;
1805               if (UTF_16_HIGH_SURROGATE_P (c))
1806                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1807               else
1808                 *charbuf++ = c;
1809             }
1810           else
1811             {
                   /* Combine the pair into one character.  */
1812               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1813               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1814               *charbuf++ = 0x10000 + c;
1815             }
1816         }
1817       else
1818         {
1819           if (UTF_16_HIGH_SURROGATE_P (c))
1820             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1821           else
1822             {
                   /* After a CR, read the next two bytes ahead; they are
                      handed back as C1 and C2 on the next iteration.  */
1823               if (eol_crlf && c == '\r')
1824                 {
1825                   ONE_MORE_BYTE (byte_after_cr1);
1826                   ONE_MORE_BYTE (byte_after_cr2);
1827                 }
1828               *charbuf++ = c;
1829             }
1830         }
1831     }
1832
1833  no_more_source:
1834   coding->consumed_char += consumed_chars_base;
1835   coding->consumed = src_base - coding->source;
1836   coding->charbuf_used = charbuf - coding->charbuf;
1837 }
1838
/* Encode the character codes held in CODING->charbuf (the first
   CODING->charbuf_used elements) as UTF-16 and store the bytes in
   CODING->destination, starting at offset CODING->produced.

   The byte order is CODING_UTF_16_ENDIAN (CODING).  Unless
   CODING_UTF_16_BOM (CODING) is utf_without_bom, a two-byte BOM in
   that byte order is emitted first and the setting is changed to
   utf_without_bom.

   A character above MAX_UNICODE_CHAR is replaced by
   CODING->default_char.  Characters below 0x10000 are written as one
   16-bit unit (two bytes); all others as a surrogate pair (four
   bytes).

   Before returning, CODING_RESULT_SUCCESS is recorded and
   CODING->produced and CODING->produced_char are updated.  The return
   value is always 0.  */
1839 static int
1840 encode_coding_utf_16 (coding)
1841      struct coding_system *coding;
1842 {
1843   int multibytep = coding->dst_multibyte;
1844   int *charbuf = coding->charbuf;
1845   int *charbuf_end = charbuf + coding->charbuf_used;
1846   unsigned char *dst = coding->destination + coding->produced;
1847   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before the BOM and before every
          character; one iteration emits at most four bytes through
          EMIT_FOUR_BYTES.  */
1848   int safe_room = 8;
1849   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1850   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1851   int produced_chars = 0;
1852   Lisp_Object attrs, charset_list;
1853   int c;
1854
1855   CODING_GET_INFO (coding, attrs, charset_list);
1856
1857   if (bom != utf_without_bom)
1858     {
1859       ASSURE_DESTINATION (safe_room);
1860       if (big_endian)
1861         EMIT_TWO_BYTES (0xFE, 0xFF);
1862       else
1863         EMIT_TWO_BYTES (0xFF, 0xFE);
1864       CODING_UTF_16_BOM (coding) = utf_without_bom;
1865     }
1866
1867   while (charbuf < charbuf_end)
1868     {
1869       ASSURE_DESTINATION (safe_room);
1870       c = *charbuf++;
1871       if (c > MAX_UNICODE_CHAR)
1872         c = coding->default_char;
1873
1874       if (c < 0x10000)
1875         {
1876           if (big_endian)
1877             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1878           else
1879             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1880         }
1881       else
1882         {
               /* C1 is the high surrogate and C2 the low surrogate of
                  the pair that represents C.  */
1883           int c1, c2;
1884
1885           c -= 0x10000;
1886           c1 = (c >> 10) + 0xD800;
1887           c2 = (c & 0x3FF) + 0xDC00;
1888           if (big_endian)
1889             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1890           else
1891             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1892         }
1893     }
1894   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1895   coding->produced = dst - coding->destination;
1896   coding->produced_char += produced_chars;
1897   return 0;
1898 }
1899
1900 \f
1901 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1902
1903 /* Emacs' internal format for representation of multiple character
1904    sets is a kind of multi-byte encoding, i.e. characters are
1905    represented by variable-length sequences of one-byte codes.
1906
1907    ASCII characters and control characters (e.g. `tab', `newline') are
1908    represented by one-byte sequences which are their ASCII codes, in
1909    the range 0x00 through 0x7F.
1910
1911    8-bit characters of the range 0x80..0x9F are represented by
1912    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1913    code + 0x20).
1914
1915    8-bit characters of the range 0xA0..0xFF are represented by
1916    one-byte sequences which are their 8-bit code.
1917
1918    The other characters are represented by a sequence of `base
1919    leading-code', optional `extended leading-code', and one or two
1920    `position-code's.  The length of the sequence is determined by the
1921    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1922    whereas extended leading-code and position-code take the range 0xA0
1923    through 0xFF.  See `charset.h' for more details about leading-code
1924    and position-code.
1925
1926    --- CODE RANGE of Emacs' internal format ---
1927    character set        range
1928    -------------        -----
1929    ascii                0x00..0x7F
1930    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1931    eight-bit-graphic    0xA0..0xFF
1932    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1933    ---------------------------------------------
1934
1935    As this is the internal character representation, the format is
1936    usually not used externally (i.e. in a file or in a data sent to a
1937    process).  But, it is possible to have a text externally in this
1938    format (i.e. by encoding by the coding system `emacs-mule').
1939
1940    In that case, a sequence of one-byte codes has a slightly different
1941    form.
1942
1943    At first, all characters in eight-bit-control are represented by
1944    one-byte sequences which are their 8-bit code.
1945
1946    Next, character composition data are represented by the byte
1947    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1948    where,
1949         METHOD is 0xF2 plus one of composition method (enum
1950         composition_method),
1951
1952         BYTES is 0xA0 plus a byte length of this composition data,
1953
1954         CHARS is 0xA0 plus a number of characters composed by this
1955         data,
1956
1957         COMPONENTs are characters of multibyte form or composition
1958         rules encoded as two bytes of ASCII codes.
1959
1960    In addition, for backward compatibility, the following formats are
1961    also recognized as composition data on decoding.
1962
1963    0x80 MSEQ ...
1964    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1965
1966    Here,
1967         MSEQ is a multibyte form, but in one of these special formats:
1968           ASCII: 0xA0 ASCII_CODE+0x80,
1969           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1970         RULE is a one byte code of the range 0xA0..0xF0 that
1971         represents a composition rule.
1972   */
1973
/* Table indexed by a byte value.  The code in this file reads the
   entry for a leading byte as the total number of bytes (1 to 4) in
   the emacs-mule sequence that starts with it.
   NOTE(review): the table is not filled in within this part of the
   file; confirm where it is initialized.  */
1974 char emacs_mule_bytes[256];
1975
1976
1977 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1978    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1979    else return 0.  */
/* CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
   It is added to DETECT_INFO->rejected on every 0 return.  On a 1
   return, it is added to DETECT_INFO->found only if at least one
   composition sequence or multibyte-form sequence was accepted.
   Scanning starts after the first CODING->head_ascii bytes.  */
1980
1981 static int
1982 detect_coding_emacs_mule (coding, detect_info)
1983      struct coding_system *coding;
1984      struct coding_detection_info *detect_info;
1985 {
1986   const unsigned char *src = coding->source, *src_base;
1987   const unsigned char *src_end = coding->source + coding->src_bytes;
1988   int multibytep = coding->src_multibyte;
1989   int consumed_chars = 0;
1990   int c;
1991   int found = 0;
1992
1993   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1994   /* A coding system of this category is always ASCII compatible.  */
1995   src += coding->head_ascii;
1996
       /* Leaving this loop with `break' rejects the category; the
          accepting exit is the label no_more_source below.  */
1997   while (1)
1998     {
1999       src_base = src;
2000       ONE_MORE_BYTE (c);
2001       if (c < 0)
2002         continue;
2003       if (c == 0x80)
2004         {
2005           /* Perhaps the start of composite character.  We simply skip
2006              it because analyzing it is too heavy for detecting.  But,
2007              at least, we check that the composite character
2008              consists of more than 4 bytes.  */
               /* NOTE(review): this declaration shadows the SRC_BASE of
                  the function scope.  The outer SRC_BASE, which is the
                  one used after this block, still points at the byte
                  read at the top of this iteration -- confirm that
                  this is intended.  */
2009           const unsigned char *src_base;
2010
2011         repeat:
2012           src_base = src;
2013           do
2014             {
2015               ONE_MORE_BYTE (c);
2016             }
2017           while (c >= 0xA0);
2018
2019           if (src - src_base <= 4)
2020             break;
2021           found = CATEGORY_MASK_EMACS_MULE;
2022           if (c == 0x80)
2023             goto repeat;
2024         }
2025
2026       if (c < 0x80)
2027         {
               /* ESC, SI and SO reject the category.  */
2028           if (c < 0x20
2029               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2030             break;
2031         }
2032       else
2033         {
               /* Number of bytes that must follow the leading byte; each
                  of them must be 0xA0 or above.  */
2034           int more_bytes = emacs_mule_bytes[*src_base] - 1;
2035
2036           while (more_bytes > 0)
2037             {
2038               ONE_MORE_BYTE (c);
2039               if (c < 0xA0)
2040                 {
2041                   src--;        /* Unread the last byte.  */
2042                   break;
2043                 }
2044               more_bytes--;
2045             }
2046           if (more_bytes != 0)
2047             break;
2048           found = CATEGORY_MASK_EMACS_MULE;
2049         }
2050     }
2051   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2052   return 0;
2053
2054  no_more_source:
       /* Input that ends in the middle of a sequence is rejected only
          if this is the last block.  */
2055   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2056     {
2057       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2058       return 0;
2059     }
2060   detect_info->found |= found;
2061   return 1;
2062 }
2063
2064
2065 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2066    character.  If CMP_STATUS indicates that we must expect MSEQ or
2067    RULE described above, decode it and return the negative value of
2068    the decoded character or rule.  If an invalid byte is found, return
2069    -1.  If SRC is too short, return -2.  */
/* On a successful return *NBYTES is set to the number of bytes
   consumed from SRC and *NCHARS to the local counter CONSUMED_CHARS
   (NOTE(review): presumably advanced by ONE_MORE_BYTE -- confirm).
   When a character is returned and ID is non-null, *ID is set to the
   id of the charset used to decode it; *ID is not written when a RULE
   is returned.  Nothing is stored on the -1 and -2 returns.  */
2070
2071 int
2072 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
2073      struct coding_system *coding;
2074      const unsigned char *src;
2075      int *nbytes, *nchars, *id;
2076      struct composition_status *cmp_status;
2077 {
2078   const unsigned char *src_end = coding->source + coding->src_bytes;
2079   const unsigned char *src_base = src;
2080   int multibytep = coding->src_multibyte;
2081   struct charset *charset;
2082   unsigned code;
2083   int c;
2084   int consumed_chars = 0;
2085   int mseq_found = 0;
2086
2087   ONE_MORE_BYTE (c);
2088   if (c < 0)
2089     {
           /* NOTE(review): a negative C presumably is the negated code
              of a multibyte character met in the source; it is returned
              as is, with the charset taken from emacs_mule_charset[0]
              -- confirm against ONE_MORE_BYTE.  */
2090       c = -c;
2091       charset = emacs_mule_charset[0];
2092     }
2093   else
2094     {
2095       if (c >= 0xA0)
2096         {
               /* A first byte of 0xA0 or above is accepted only inside
                  an old-form composition.  */
2097           if (cmp_status->state != COMPOSING_NO
2098               && cmp_status->old_form)
2099             {
2100               if (cmp_status->state == COMPOSING_CHAR)
2101                 {
                       /* MSEQ: 0xA0 followed by ASCII_CODE + 0x80, or
                          LEADING_CODE + 0x20.  Recover the real first
                          byte in C.  */
2102                   if (c == 0xA0)
2103                     {
2104                       ONE_MORE_BYTE (c);
2105                       c -= 0x80;
2106                       if (c < 0)
2107                         goto invalid_code;
2108                     }
2109                   else
2110                     c -= 0x20;
2111                   mseq_found = 1;
2112                 }
2113               else
2114                 {
                       /* Not expecting a character: return the byte,
                          negated, as a RULE.  */
2115                   *nbytes = src - src_base;
2116                   *nchars = consumed_chars;
2117                   return -c;
2118                 }
2119             }
2120           else
2121             goto invalid_code;
2122         }
2123
           /* Dispatch on the total length of the sequence that starts
              with leading byte C.  Every following byte must be 0xA0 or
              above.  */
2124       switch (emacs_mule_bytes[c])
2125         {
2126         case 2:
               /* Leading code and one position code.  */
2127           if (! (charset = emacs_mule_charset[c]))
2128             goto invalid_code;
2129           ONE_MORE_BYTE (c);
2130           if (c < 0xA0)
2131             goto invalid_code;
2132           code = c & 0x7F;
2133           break;
2134
2135         case 3:
               /* Either a private leading code, a second byte selecting
                  the charset and one position code, or a leading code
                  and two position codes.  */
2136           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2137               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2138             {
2139               ONE_MORE_BYTE (c);
2140               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
2141                 goto invalid_code;
2142               ONE_MORE_BYTE (c);
2143               if (c < 0xA0)
2144                 goto invalid_code;
2145               code = c & 0x7F;
2146             }
2147           else
2148             {
2149               if (! (charset = emacs_mule_charset[c]))
2150                 goto invalid_code;
2151               ONE_MORE_BYTE (c);
2152               if (c < 0xA0)
2153                 goto invalid_code;
2154               code = (c & 0x7F) << 8;
2155               ONE_MORE_BYTE (c);
2156               if (c < 0xA0)
2157                 goto invalid_code;
2158               code |= c & 0x7F;
2159             }
2160           break;
2161
2162         case 4:
               /* Leading code, a second byte selecting the charset, and
                  two position codes.  */
2163           ONE_MORE_BYTE (c);
2164           if (c < 0 || ! (charset = emacs_mule_charset[c]))
2165             goto invalid_code;
2166           ONE_MORE_BYTE (c);
2167           if (c < 0xA0)
2168             goto invalid_code;
2169           code = (c & 0x7F) << 8;
2170           ONE_MORE_BYTE (c);
2171           if (c < 0xA0)
2172             goto invalid_code;
2173           code |= c & 0x7F;
2174           break;
2175
2176         case 1:
               /* A single byte: ASCII or eight-bit.  */
2177           code = c;
2178           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2179                                      ? charset_ascii : charset_eight_bit);
2180           break;
2181
2182         default:
               /* Table entries other than 1..4 are not expected.  */
2183           abort ();
2184         }
2185       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, code, c);
2186       if (c < 0)
2187         goto invalid_code;
2188     }
2189   *nbytes = src - src_base;
2190   *nchars = consumed_chars;
2191   if (id)
2192     *id = charset->id;
2193   return (mseq_found ? -c : c);
2194
2195  no_more_source:
2196   return -2;
2197
2198  invalid_code:
2199   return -1;
2200 }
2201
2202
2203 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2204
2205 /* Handle these composition sequences ('|': the end of header elements,
2206    BYTES and CHARS >= 0xA0):
2207
2208    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2209    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2210    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2211
2212    and these old forms:
2213
2214    (4) relative composition: 0x80 | MSEQ ... MSEQ
2215    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2216
2217    When the starter 0x80 and the following header elements are found,
2218    this annotation header is produced.
2219
2220         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2221
2222    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2223    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2224
2225    Then, upon reading the following elements, these codes are produced
2226    until the composition end is found:
2227
2228    (1) CHAR ... CHAR
2229    (2) ALT ... ALT CHAR ... CHAR
2230    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2231    (4) CHAR ... CHAR
2232    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2233
2234    When the composition end is found, LENGTH and NCHARS in the
2235    annotation header are updated as below:
2236
2237    (1) LENGTH: unchanged, NCHARS: unchanged
2238    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2239    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2240    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2241    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2242
2243    If an error is found while composing, the annotation header is
2244    changed to the original composition header (plus filler -1s) as
2245    below:
2246
2247    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2248    (5)          [ 0x80 0xFF -1 -1 -1 ]
2249
2250    and the sequence [ -2 DECODED-RULE ] is changed to the original
2251    byte sequence as below:
2252         o the original byte sequence is B: [ B -1 ]
2253         o the original byte sequence is B1 B2: [ B1 B2 ]
2254
2255    Most of the routines are implemented by macros because many
2256    variables and labels in the caller decode_coding_emacs_mule must be
2257    accessible, and they are usually called just once (thus they don't
2258    increase the size of the compiled object).  */
2259
2260 /* Decode a composition rule represented by C as a component of
2261    composition sequence of Emacs 20 style.  Set RULE to the decoded
2262    rule. */
/* C is modified: 0xA0 is subtracted from it, and control jumps to the
   label invalid_code of the enclosing function unless the result is
   in 0..80.  The result is split into GREF = C / 9 and NREF = C % 9;
   a value of 4 in either is replaced by 10 before the two are
   combined with COMPOSITION_ENCODE_RULE.  */
2263
2264 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2265   do {                                                  \
2266     int gref, nref;                                     \
2267                                                         \
2268     c -= 0xA0;                                          \
2269     if (c < 0 || c >= 81)                               \
2270       goto invalid_code;                                \
2271     gref = c / 9, nref = c % 9;                         \
2272     if (gref == 4) gref = 10;                           \
2273     if (nref == 4) nref = 10;                           \
2274     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2275   } while (0)
2276
2277
2278 /* Decode a composition rule represented by C and the following byte
2279    at SRC as a component of composition sequence of Emacs 21 style.
2280    Set RULE to the decoded rule.  */
/* GREF is C - 0x20.  The following byte is read into C with
   ONE_MORE_BYTE, and NREF is that byte - 0x20.  Control jumps to the
   label invalid_code of the enclosing function unless both values
   are in 0..80.  */
2281
2282 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2283   do {                                                  \
2284     int gref, nref;                                     \
2285                                                         \
2286     gref = c - 0x20;                                    \
2287     if (gref < 0 || gref >= 81)                         \
2288       goto invalid_code;                                \
2289     ONE_MORE_BYTE (c);                                  \
2290     nref = c - 0x20;                                    \
2291     if (nref < 0 || nref >= 81)                         \
2292       goto invalid_code;                                \
2293     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2294   } while (0)
2295
2296
2297 /* Start of Emacs 21 style format.  On entry C holds the first header
2298    byte, METHOD + 0xF2; the next two bytes at SRC are BYTES + 0xA0 and
2299    CHARS + 0xA0, where BYTES is the byte length of this composition
2300    information and CHARS is the number of characters composed by it.  */
/* Control jumps to the label invalid_code of the enclosing function
   if the BYTES byte is negative, if BYTES is less than 3, if the
   method is COMPOSITION_RELATIVE and BYTES is not 4, or if CHARS is
   not in 1..MAX_COMPOSITION_COMPONENTS - 1.

   Otherwise CMP_STATUS is set up for a new-form composition: the
   state is COMPOSING_CHAR for COMPOSITION_RELATIVE and
   COMPOSING_COMPONENT_CHAR for the other methods, the length is
   MAX_ANNOTATION_LENGTH, and ncomps is BYTES - 4.  Finally the
   annotation header is added with ADD_COMPOSITION_DATA.  */
2301
2302 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2303   do {                                                                  \
2304     enum composition_method method = c - 0xF2;                          \
2305     int *charbuf_base = charbuf;                                        \
2306     int nbytes, nchars;                                                 \
2307                                                                         \
2308     ONE_MORE_BYTE (c);                                                  \
2309     if (c < 0)                                                          \
2310       goto invalid_code;                                                \
2311     nbytes = c - 0xA0;                                                  \
2312     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2313       goto invalid_code;                                                \
2314     ONE_MORE_BYTE (c);                                                  \
2315     nchars = c - 0xA0;                                                  \
2316     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2317       goto invalid_code;                                                \
2318     cmp_status->old_form = 0;                                           \
2319     cmp_status->method = method;                                        \
2320     if (method == COMPOSITION_RELATIVE)                                 \
2321       cmp_status->state = COMPOSING_CHAR;                               \
2322     else                                                                \
2323       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2324     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2325     cmp_status->nchars = nchars;                                        \
2326     cmp_status->ncomps = nbytes - 4;                                    \
2327     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2328   } while (0)
2329
2330
2331 /* Start of Emacs 20 style format for relative composition.  */
/* Mark CMP_STATUS as an old-form composition with method
   COMPOSITION_RELATIVE in state COMPOSING_CHAR, with length
   MAX_ANNOTATION_LENGTH and both nchars and ncomps reset to 0, then
   add the annotation header with ADD_COMPOSITION_DATA, passing 0 for
   both the character count and the byte count.  */
2332
2333 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2334   do {                                                          \
2335     cmp_status->old_form = 1;                                   \
2336     cmp_status->method = COMPOSITION_RELATIVE;                  \
2337     cmp_status->state = COMPOSING_CHAR;                         \
2338     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2339     cmp_status->nchars = cmp_status->ncomps = 0;                \
2340     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2341   } while (0)
2342
2343
2344 /* Start of Emacs 20 style format for rule-base composition.  */
/* Same set-up as for the Emacs 20 relative composition, except that
   the method recorded in CMP_STATUS is COMPOSITION_WITH_RULE: old
   form, state COMPOSING_CHAR, length MAX_ANNOTATION_LENGTH, nchars
   and ncomps reset to 0, and an annotation header added with
   ADD_COMPOSITION_DATA.  */
2345
2346 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2347   do {                                                          \
2348     cmp_status->old_form = 1;                                   \
2349     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2350     cmp_status->state = COMPOSING_CHAR;                         \
2351     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2352     cmp_status->nchars = cmp_status->ncomps = 0;                \
2353     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2354   } while (0)
2355
2356
/* Read the next byte into C and start the composition it selects:

     0xF2 + COMPOSITION_RELATIVE .. 0xF2 + COMPOSITION_WITH_RULE_ALTCHARS
                  Emacs 21 style header.
     0xA0..0xBF   Emacs 20 style relative composition; SRC is moved
                  back so that the byte is read again as a component.
     0xFF         Emacs 20 style rule-base composition.

   The tests are made in that order.  A negative byte or any other
   value jumps to the label invalid_code of the enclosing function.
   NOTE(review): presumably used right after the composition starter
   0x80 has been read -- confirm against the caller.  */
2357 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2358   do {                                                  \
2359     const unsigned char *current_src = src;             \
2360                                                         \
2361     ONE_MORE_BYTE (c);                                  \
2362     if (c < 0)                                          \
2363       goto invalid_code;                                \
2364     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2365         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2366       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2367     else if (c < 0xA0)                                  \
2368       goto invalid_code;                                \
2369     else if (c < 0xC0)                                  \
2370       {                                                 \
2371         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2372         /* Re-read C as a composition component.  */    \
2373         src = current_src;                              \
2374       }                                                 \
2375     else if (c == 0xFF)                                 \
2376       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2377     else                                                \
2378       goto invalid_code;                                \
2379   } while (0)
2380
/* Close the composition currently being decoded.  The composition
   data occupies the CMP_STATUS->length elements that end just before
   CHARBUF.  For an old-form composition, store the number of
   characters seen in its third element.  Otherwise, when the method
   is greater than COMPOSITION_RELATIVE, replace its first element
   with the third element minus the data length.  In all cases reset
   the state to COMPOSING_NO.  Expects `cmp_status' and `charbuf' to
   be in scope at the point of expansion.  */
#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    int *annotation = charbuf - cmp_status->length;                     \
                                                                        \
    if (cmp_status->old_form)                                           \
      annotation[2] = cmp_status->nchars;                               \
    else if (cmp_status->method > COMPOSITION_RELATIVE)                 \
      annotation[0] = annotation[2] - cmp_status->length;               \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
2391
2392
2393 static int
2394 emacs_mule_finish_composition (charbuf, cmp_status)
2395      int *charbuf;
2396      struct composition_status *cmp_status;
2397 {
2398   int idx = - cmp_status->length;
2399   int new_chars;
2400
2401   if (cmp_status->old_form && cmp_status->nchars > 0)
2402     {
2403       charbuf[idx + 2] = cmp_status->nchars;
2404       new_chars = 0;
2405       if (cmp_status->method == COMPOSITION_WITH_RULE
2406           && cmp_status->state == COMPOSING_CHAR)
2407         {
2408           /* The last rule was invalid.  */
2409           int rule = charbuf[-1] + 0xA0;
2410
2411           charbuf[-2] = BYTE8_TO_CHAR (rule);
2412           charbuf[-1] = -1;
2413           new_chars = 1;
2414         }
2415     }
2416   else
2417     {
2418       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2419
2420       if (cmp_status->method == COMPOSITION_WITH_RULE)
2421         {
2422           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2423           charbuf[idx++] = -3;
2424           charbuf[idx++] = 0;
2425           new_chars = 1;
2426         }
2427       else
2428         {
2429           int nchars = charbuf[idx + 1] + 0xA0;
2430           int nbytes = charbuf[idx + 2] + 0xA0;
2431
2432           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2433           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2434           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2435           charbuf[idx++] = -1;
2436           new_chars = 4;
2437         }
2438     }
2439   cmp_status->state = COMPOSING_NO;
2440   return new_chars;
2441 }
2442
/* If a composition is in progress (state other than COMPOSING_NO),
   finish it with emacs_mule_finish_composition and add the returned
   count to CHAR_OFFSET.  Expects `cmp_status', `charbuf' and
   `char_offset' to be in scope at the point of expansion.  */
#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state != COMPOSING_NO)                                \
      {                                                                   \
        int added_chars                                                   \
          = emacs_mule_finish_composition (charbuf, cmp_status);          \
                                                                          \
        char_offset += added_chars;                                       \
      }                                                                   \
  } while (0)
2448
2449
/* Decode the emacs-mule encoded bytes of CODING, starting at
   CODING->source + CODING->consumed, into the integer buffer
   CODING->charbuf (appending after CODING->charbuf_used).

   Besides characters, the buffer receives charset annotations (one
   for each run of characters of the same non-ASCII charset) and
   composition data.  A composition that is still open when decoding
   stops is closed right away if CODING_MODE_LAST_BLOCK is set;
   otherwise its data is moved to the carryover area of
   CODING->spec.emacs_mule.cmp_status and replayed by the next call.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  A byte that cannot be decoded is
   stored by itself (as an ASCII or raw-byte character) and counted in
   CODING->errors.  */
2450 static void
2451 decode_coding_emacs_mule (coding)
2452      struct coding_system *coding;
2453 {
2454   const unsigned char *src = coding->source + coding->consumed;
2455   const unsigned char *src_end = coding->source + coding->src_bytes;
2456   const unsigned char *src_base;
2457   int *charbuf = coding->charbuf + coding->charbuf_used;
2458   /* We may produce two annotations (charset and composition) in one
2459      loop and one more charset annotation at the end.  */
2460   int *charbuf_end
2461     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2462   int consumed_chars = 0, consumed_chars_base;
2463   int multibytep = coding->src_multibyte;
2464   Lisp_Object attrs, charset_list;
2465   int char_offset = coding->produced_char;
2466   int last_offset = char_offset;
2467   int last_id = charset_ascii;
2468   int eol_crlf =
2469     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2470   int byte_after_cr = -1;
2471   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2472
2473   CODING_GET_INFO (coding, attrs, charset_list);
2474
       /* Resume a composition that was left open: put the data saved
          in the carryover area back at the head of the output.  */
2475   if (cmp_status->state != COMPOSING_NO)
2476     {
2477       int i;
2478
2479       for (i = 0; i < cmp_status->length; i++)
2480         *charbuf++ = cmp_status->carryover[i];
2481       coding->annotated = 1;
2482     }
2483
2484   while (1)
2485     {
2486       int c, id;
2487
           /* Remember where this iteration starts so that it can be
              undone (see `retry', `invalid_code' and the bookkeeping
              after the loop).  */
2488       src_base = src;
2489       consumed_chars_base = consumed_chars;
2490
2491       if (charbuf >= charbuf_end)
2492         {
               /* The byte read ahead after a CR has not been processed
                  yet; do not count it as consumed.  */
2493           if (byte_after_cr >= 0)
2494             src_base--;
2495           break;
2496         }
2497
2498       if (byte_after_cr >= 0)
2499         c = byte_after_cr, byte_after_cr = -1;
2500       else
2501         ONE_MORE_BYTE (c);
2502
           /* A negative C is stored as the character -C (presumably a
              character found as is in a multibyte source -- see
              ONE_MORE_BYTE); 0x80 introduces a composition.  Both end
              any composition in progress.  */
2503       if (c < 0 || c == 0x80)
2504         {
2505           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2506           if (c < 0)
2507             {
2508               *charbuf++ = -c;
2509               char_offset++;
2510             }
2511           else
2512             DECODE_EMACS_MULE_COMPOSITION_START ();
2513           continue;
2514         }
2515
2516       if (c < 0x80)
2517         {
               /* With DOS end-of-line conversion, fetch the byte after
                  a CR now; it is processed by the next iteration.  */
2518           if (eol_crlf && c == '\r')
2519             ONE_MORE_BYTE (byte_after_cr);
2520           id = charset_ascii;
2521           if (cmp_status->state != COMPOSING_NO)
2522             {
2523               if (cmp_status->old_form)
2524                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2525               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2526                 cmp_status->ncomps--;
2527             }
2528         }
2529       else
2530         {
2531           int nchars, nbytes;
2532           /* emacs_mule_char can load a charset map from a file, which
2533              allocates a large structure and might cause buffer text
2534              to be relocated as result.  Thus, we need to remember the
2535              original pointer to buffer text, and fixup all related
2536              pointers after the call.  */
2537           const unsigned char *orig = coding->source;
2538           EMACS_INT offset;
2539
2540           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2541                                cmp_status);
2542           offset = coding->source - orig;
2543           if (offset)
2544             {
2545               src += offset;
2546               src_base += offset;
2547               src_end += offset;
2548             }
               /* -1 is handled as an invalid sequence; -2 stops
                  decoding (presumably the sequence is incomplete --
                  confirm against emacs_mule_char).  Other negative
                  values are processed below.  */
2549           if (c < 0)
2550             {
2551               if (c == -1)
2552                 goto invalid_code;
2553               if (c == -2)
2554                 break;
2555             }
2556           src = src_base + nbytes;
2557           consumed_chars = consumed_chars_base + nchars;
2558           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2559             cmp_status->ncomps -= nchars;
2560         }
2561
2562       /* Now if C >= 0, we found a normally encoded character, if C <
2563          0, we found an old-style composition component character or
2564          rule.  */
2565
2566       if (cmp_status->state == COMPOSING_NO)
2567         {
               /* The charset changed: annotate the run of the previous
                  charset unless it was ASCII, and start a new run.  */
2568           if (last_id != id)
2569             {
2570               if (last_id != charset_ascii)
2571                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2572                                   last_id);
2573               last_id = id;
2574               last_offset = char_offset;
2575             }
2576           *charbuf++ = c;
2577           char_offset++;
2578         }
2579       else if (cmp_status->state == COMPOSING_CHAR)
2580         {
2581           if (cmp_status->old_form)
2582             {
                   /* In the old form a normally encoded character ends
                      the composition.  */
2583               if (c >= 0)
2584                 {
2585                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2586                   *charbuf++ = c;
2587                   char_offset++;
2588                 }
2589               else
2590                 {
2591                   *charbuf++ = -c;
2592                   cmp_status->nchars++;
2593                   cmp_status->length++;
2594                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2595                     EMACS_MULE_COMPOSITION_END ();
2596                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2597                     cmp_status->state = COMPOSING_RULE;
2598                 }
2599             }
2600           else
2601             {
                   /* In the new form NCHARS counts down the characters
                      that remain to be read.  */
2602               *charbuf++ = c;
2603               cmp_status->length++;
2604               cmp_status->nchars--;
2605               if (cmp_status->nchars == 0)
2606                 EMACS_MULE_COMPOSITION_END ();
2607             }
2608         }
2609       else if (cmp_status->state == COMPOSING_RULE)
2610         {
2611           int rule;
2612
2613           if (c >= 0)
2614             {
2615               EMACS_MULE_COMPOSITION_END ();
2616               *charbuf++ = c;
2617               char_offset++;
2618             }
2619           else
2620             {
2621               c = -c;
2622               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2623               if (rule < 0)
2624                 goto invalid_code;
                   /* A rule is stored as the marker -2 followed by the
                      rule value.  */
2625               *charbuf++ = -2;
2626               *charbuf++ = rule;
2627               cmp_status->length += 2;
2628               cmp_status->state = COMPOSING_CHAR;
2629             }
2630         }
2631       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2632         {
2633           *charbuf++ = c;
2634           cmp_status->length++;
2635           if (cmp_status->ncomps == 0)
2636             cmp_status->state = COMPOSING_CHAR;
2637           else if (cmp_status->ncomps > 0)
2638             {
2639               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2640                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2641             }
2642           else
2643             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2644         }
2645       else                      /* COMPOSING_COMPONENT_RULE */
2646         {
2647           int rule;
2648
2649           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2650           if (rule < 0)
2651             goto invalid_code;
2652           *charbuf++ = -2;
2653           *charbuf++ = rule;
2654           cmp_status->length += 2;
2655           cmp_status->ncomps--;
2656           if (cmp_status->ncomps > 0)
2657             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2658           else
2659             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2660         }
2661       continue;
2662
           /* Restart the iteration from the position saved at its
              beginning.  */
2663     retry:
2664       src = src_base;
2665       consumed_chars = consumed_chars_base;
2666       continue;
2667
           /* Give up any composition in progress, go back to the start
              of the offending sequence and emit its first byte alone,
              as an ASCII or a raw-byte character.  */
2668     invalid_code:
2669       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2670       src = src_base;
2671       consumed_chars = consumed_chars_base;
2672       ONE_MORE_BYTE (c);
2673       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2674       char_offset++;
2675       coding->errors++;
2676     }
2677
      /* Reached when the loop above breaks, and presumably by a jump
         from ONE_MORE_BYTE when the source is exhausted.  */
2678  no_more_source:
2679   if (cmp_status->state != COMPOSING_NO)
2680     {
2681       if (coding->mode & CODING_MODE_LAST_BLOCK)
2682         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2683       else
2684         {
2685           int i;
2686
               /* Take the unfinished composition data out of the
                  output and save it for the next call.  */
2687           charbuf -= cmp_status->length;
2688           for (i = 0; i < cmp_status->length; i++)
2689             cmp_status->carryover[i] = charbuf[i];
2690         }
2691     }
2692   if (last_id != charset_ascii)
2693     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Only what was consumed up to the start of the last iteration
          is reported as consumed.  */
2694   coding->consumed_char += consumed_chars_base;
2695   coding->consumed = src_base - coding->source;
2696   coding->charbuf_used = charbuf - coding->charbuf;
2697 }
2698
2699
/* Store in CODES[0] and CODES[1] the leading code(s) that introduce a
   character of the charset whose emacs-mule id is ID.

   An ID below 0xA0 is its own, single leading code, and CODES[1] is
   set to 0.  A larger ID is stored in CODES[1], preceded in CODES[0]
   by a leading code selected by the range of ID:
     0xA0..0xDF -> 0x9A,  0xE0..0xEF -> 0x9B,
     0xF0..0xF4 -> 0x9C,  0xF5 and above -> 0x9D.

   ID is evaluated exactly once.  The expansion is a single statement
   without a trailing semicolon, so the macro can be used anywhere a
   statement can, including before `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)                     \
  do {                                                          \
    int leading_code_id_ = (id);                                \
                                                                \
    if (leading_code_id_ < 0xA0)                                \
      (codes)[0] = leading_code_id_, (codes)[1] = 0;            \
    else if (leading_code_id_ < 0xE0)                           \
      (codes)[0] = 0x9A, (codes)[1] = leading_code_id_;         \
    else if (leading_code_id_ < 0xF0)                           \
      (codes)[0] = 0x9B, (codes)[1] = leading_code_id_;         \
    else if (leading_code_id_ < 0xF5)                           \
      (codes)[0] = 0x9C, (codes)[1] = leading_code_id_;         \
    else                                                        \
      (codes)[0] = 0x9D, (codes)[1] = leading_code_id_;         \
  } while (0)
2713
2714
/* Encode the CODING->charbuf_used elements of CODING->charbuf in the
   emacs-mule format, writing the bytes at CODING->destination +
   CODING->produced.

   The charset list of the coding system is forced to
   Vemacs_mule_charset_list.  A negative element of the buffer starts
   an annotation: a charset annotation selects the charset to try
   first for the following characters, a composition annotation is
   skipped (not yet implemented), anything else aborts.  ASCII and
   raw-byte characters are emitted as single bytes; any other
   character is emitted as the leading code(s) of its charset followed
   by its one or two byte code with the 0x80 bit of each byte set.  A
   character that belongs to no charset of the list is replaced by
   CODING->default_char.

   Record CODING_RESULT_SUCCESS, update CODING->produced_char and
   CODING->produced, and return 0.  */
2715 static int
2716 encode_coding_emacs_mule (coding)
2717      struct coding_system *coding;
2718 {
2719   int multibytep = coding->dst_multibyte;
2720   int *charbuf = coding->charbuf;
2721   int *charbuf_end = charbuf + coding->charbuf_used;
2722   unsigned char *dst = coding->destination + coding->produced;
2723   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2724   int safe_room = 8;
2725   int produced_chars = 0;
2726   Lisp_Object attrs, charset_list;
2727   int c;
2728   int preferred_charset_id = -1;
2729
2730   CODING_GET_INFO (coding, attrs, charset_list);
2731   if (! EQ (charset_list, Vemacs_mule_charset_list))
2732     {
2733       CODING_ATTR_CHARSET_LIST (attrs)
2734         = charset_list = Vemacs_mule_charset_list;
2735     }
2736
2737   while (charbuf < charbuf_end)
2738     {
2739       ASSURE_DESTINATION (safe_room);
2740       c = *charbuf++;
2741
2742       if (c < 0)
2743         {
2744           /* Handle an annotation.  */
               /* C is minus the length of the annotation and CHARBUF
                  now points to its second element, the annotation
                  mask.  */
2745           switch (*charbuf)
2746             {
2747             case CODING_ANNOTATE_COMPOSITION_MASK:
2748               /* Not yet implemented.  */
2749               break;
2750             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Honor the requested charset only if it belongs to
                      the charset list of this coding system.  */
2751               preferred_charset_id = charbuf[3];
2752               if (preferred_charset_id >= 0
2753                   && NILP (Fmemq (make_number (preferred_charset_id),
2754                                   charset_list)))
2755                 preferred_charset_id = -1;
2756               break;
2757             default:
2758               abort ();
2759             }
               /* Skip the rest of the annotation; its first element
                  has already been consumed.  */
2760           charbuf += -c - 1;
2761           continue;
2762         }
2763
2764       if (ASCII_CHAR_P (c))
2765         EMIT_ONE_ASCII_BYTE (c);
2766       else if (CHAR_BYTE8_P (c))
2767         {
2768           c = CHAR_TO_BYTE8 (c);
2769           EMIT_ONE_BYTE (c);
2770         }
2771       else
2772         {
2773           struct charset *charset;
2774           unsigned code;
2775           int dimension;
2776           int emacs_mule_id;
2777           unsigned char leading_codes[2];
2778
               /* Try the preferred charset first, then fall back to
                  searching the whole charset list.  */
2779           if (preferred_charset_id >= 0)
2780             {
2781               charset = CHARSET_FROM_ID (preferred_charset_id);
2782               if (CHAR_CHARSET_P (c, charset))
2783                 code = ENCODE_CHAR (charset, c);
2784               else
2785                 charset = char_charset (c, charset_list, &code);
2786             }
2787           else
2788             charset = char_charset (c, charset_list, &code);
2789           if (! charset)
2790             {
2791               c = coding->default_char;
2792               if (ASCII_CHAR_P (c))
2793                 {
2794                   EMIT_ONE_ASCII_BYTE (c);
2795                   continue;
2796                 }
                   /* NOTE(review): the result is not checked again
                      before it is dereferenced below; presumably the
                      default character always belongs to a charset of
                      the list -- confirm.  */
2797               charset = char_charset (c, charset_list, &code);
2798             }
2799           dimension = CHARSET_DIMENSION (charset);
2800           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2801           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2802           EMIT_ONE_BYTE (leading_codes[0]);
2803           if (leading_codes[1])
2804             EMIT_ONE_BYTE (leading_codes[1]);
2805           if (dimension == 1)
2806             EMIT_ONE_BYTE (code | 0x80);
2807           else
2808             {
2809               code |= 0x8080;
2810               EMIT_ONE_BYTE (code >> 8);
2811               EMIT_ONE_BYTE (code & 0xFF);
2812             }
2813         }
2814     }
2815   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2816   coding->produced_char += produced_chars;
2817   coding->produced = dst - coding->destination;
2818   return 0;
2819 }
2820
2821 \f
2822 /*** 7. ISO2022 handlers ***/
2823
2824 /* The following note describes the coding system ISO2022 briefly.
2825    Since the intention of this note is to help understand the
2826    functions in this file, some parts are NOT ACCURATE or are OVERLY
2827    SIMPLIFIED.  For thorough understanding, please refer to the
2828    original document of ISO2022.  This is equivalent to the standard
2829    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2830
2831    ISO2022 provides many mechanisms to encode several character sets
2832    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2833    is encoded using bytes less than 128.  This may make the encoded
2834    text a little bit longer, but the text passes more easily through
2835    several types of gateway, some of which strip off the MSB (Most
2836    Significant Bit).
2837
2838    There are two kinds of character sets: control character sets and
2839    graphic character sets.  The former contain control characters such
2840    as `newline' and `escape' to provide control functions (control
2841    functions are also provided by escape sequences).  The latter
2842    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2843    two control character sets and many graphic character sets.
2844
2845    Graphic character sets are classified into one of the following
2846    four classes, according to the number of bytes (DIMENSION) and
2847    number of characters in one dimension (CHARS) of the set:
2848    - DIMENSION1_CHARS94
2849    - DIMENSION1_CHARS96
2850    - DIMENSION2_CHARS94
2851    - DIMENSION2_CHARS96
2852
2853    In addition, each character set is assigned an identification tag,
2854    unique for each set, called the "final character" (denoted as <F>
2855    hereafter).  The <F> of each character set is decided by ECMA(*)
2856    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2857    (0x30..0x3F are for private use only).
2858
2859    Note (*): ECMA = European Computer Manufacturers Association
2860
2861    Here are examples of graphic character sets [NAME(<F>)]:
2862         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2863         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2864         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2865         o DIMENSION2_CHARS96 -- none for the moment
2866
2867    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2868         C0 [0x00..0x1F] -- control character plane 0
2869         GL [0x20..0x7F] -- graphic character plane 0
2870         C1 [0x80..0x9F] -- control character plane 1
2871         GR [0xA0..0xFF] -- graphic character plane 1
2872
2873    A control character set is directly designated and invoked to C0 or
2874    C1 by an escape sequence.  The most common case is that:
2875    - ISO646's  control character set is designated/invoked to C0, and
2876    - ISO6429's control character set is designated/invoked to C1,
2877    and usually these designations/invocations are omitted in encoded
2878    text.  In a 7-bit environment, only C0 can be used, and a control
2879    character for C1 is encoded by an appropriate escape sequence to
2880    fit into the environment.  All control characters for C1 are
2881    defined to have corresponding escape sequences.
2882
2883    A graphic character set is at first designated to one of four
2884    graphic registers (G0 through G3), then these graphic registers are
2885    invoked to GL or GR.  These designations and invocations can be
2886    done independently.  The most common case is that G0 is invoked to
2887    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2888    these invocations and designations are omitted in encoded text.
2889    In a 7-bit environment, only GL can be used.
2890
2891    When a graphic character set of CHARS94 is invoked to GL, codes
2892    0x20 and 0x7F of the GL area work as control characters SPACE and
2893    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2894    be used.
2895
2896    There are two ways of invocation: locking-shift and single-shift.
2897    With locking-shift, the invocation lasts until the next different
2898    invocation, whereas with single-shift, the invocation affects the
2899    following character only and doesn't affect the locking-shift
2900    state.  Invocations are done by the following control characters or
2901    escape sequences:
2902
2903    ----------------------------------------------------------------------
2904    abbrev  function                  cntrl escape seq   description
2905    ----------------------------------------------------------------------
2906    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2907    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2908    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2909    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2910    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2911    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2912    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2913    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2914    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2915    ----------------------------------------------------------------------
2916    (*) These are not used by any known coding system.
2917
2918    Control characters for these functions are defined by macros
2919    ISO_CODE_XXX in `coding.h'.
2920
2921    Designations are done by the following escape sequences:
2922    ----------------------------------------------------------------------
2923    escape sequence      description
2924    ----------------------------------------------------------------------
2925    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2926    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2927    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2928    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2929    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2930    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2931    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2932    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2933    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2934    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2935    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2936    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2937    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2938    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2939    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2940    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2941    ----------------------------------------------------------------------
2942
2943    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2944    of dimension 1, chars 94, and final character <F>, etc...
2945
2946    Note (*): Although these designations are not allowed in ISO2022,
2947    Emacs accepts them on decoding, and produces them on encoding
2948    CHARS96 character sets in a coding system which is characterized as
2949    7-bit environment, non-locking-shift, and non-single-shift.
2950
2951    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2952    '(' must be omitted.  We refer to this as "short-form" hereafter.
2953
2954    Now you may notice that there are a lot of ways of encoding the
2955    same multilingual text in ISO2022.  Actually, there exist many
2956    coding systems such as Compound Text (used in X11's inter client
2957    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2958    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2959    localized platforms), and all of these are variants of ISO2022.
2960
2961    In addition to the above, Emacs handles two more kinds of escape
2962    sequences: ISO6429's direction specification and Emacs' private
2963    sequence for specifying character composition.
2964
2965    ISO6429's direction specification takes the following form:
2966         o CSI ']'      -- end of the current direction
2967         o CSI '0' ']'  -- end of the current direction
2968         o CSI '1' ']'  -- start of left-to-right text
2969         o CSI '2' ']'  -- start of right-to-left text
2970    The control character CSI (0x9B: control sequence introducer) is
2971    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2972
2973    Character composition specification takes the following form:
2974         o ESC '0' -- start relative composition
2975         o ESC '1' -- end composition
2976         o ESC '2' -- start rule-base composition (*)
2977         o ESC '3' -- start relative composition with alternate chars  (**)
2978         o ESC '4' -- start rule-base composition with alternate chars  (**)
2979   Since these are not standard escape sequences of any ISO standard,
2980   the use of them with these meanings is restricted to Emacs only.
2981
2982   (*) This form is used only in Emacs 20.7 and older versions,
2983   but newer versions can safely decode it.
2984   (**) This form is used only in Emacs 21.1 and newer versions,
2985   and older versions can't decode it.
2986
2987   Here's a list of example usages of these composition escape
2988   sequences (categorized by `enum composition_method').
2989
2990   COMPOSITION_RELATIVE:
2991         ESC 0 CHAR [ CHAR ] ESC 1
2992   COMPOSITION_WITH_RULE:
2993         ESC 2 CHAR [ RULE CHAR ] ESC 1
2994   COMPOSITION_WITH_ALTCHARS:
2995         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2996   COMPOSITION_WITH_RULE_ALTCHARS:
2997         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2998
/* Table of 256 `enum iso_code_class_type' values -- presumably the
   ISO code class of each possible byte value (TODO: confirm against
   the code that fills and reads it).  */
2999 enum iso_code_class_type iso_code_class[256];
3000
/* Return nonzero if the charset whose id is ID is safe for CODING,
   i.e. ID lies within 0 .. CODING->max_charset_id and its entry in
   CODING->safe_charsets is not 255 (the value with which the table is
   filled before the usable charsets are recorded in it).

   A negative ID is never safe.  The lower bound matters because an
   id taken from a charset lookup table can be negative when no
   charset is registered, and must not be used to index the table.
   ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
3004
3005
/* Nonzero if CODING_ISO_INITIAL is non-negative for register 1 of the
   coding system registered in coding_categories for CATEGORY
   (presumably: a charset is initially designated to G1, so a
   shift-out makes sense -- confirm against CODING_ISO_INITIAL).  */
#define SHIFT_OUT_OK(category)  \
  (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
3008
3009 static void
3010 setup_iso_safe_charsets (attrs)
3011      Lisp_Object attrs;
3012 {
3013   Lisp_Object charset_list, safe_charsets;
3014   Lisp_Object request;
3015   Lisp_Object reg_usage;
3016   Lisp_Object tail;
3017   int reg94, reg96;
3018   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
3019   int max_charset_id;
3020
3021   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3022   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
3023       && ! EQ (charset_list, Viso_2022_charset_list))
3024     {
3025       CODING_ATTR_CHARSET_LIST (attrs)
3026         = charset_list = Viso_2022_charset_list;
3027       ASET (attrs, coding_attr_safe_charsets, Qnil);
3028     }
3029
3030   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3031     return;
3032
3033   max_charset_id = 0;
3034   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3035     {
3036       int id = XINT (XCAR (tail));
3037       if (max_charset_id < id)
3038         max_charset_id = id;
3039     }
3040
3041   safe_charsets = make_uninit_string (max_charset_id + 1);
3042   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3043   request = AREF (attrs, coding_attr_iso_request);
3044   reg_usage = AREF (attrs, coding_attr_iso_usage);
3045   reg94 = XINT (XCAR (reg_usage));
3046   reg96 = XINT (XCDR (reg_usage));
3047
3048   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3049     {
3050       Lisp_Object id;
3051       Lisp_Object reg;
3052       struct charset *charset;
3053
3054       id = XCAR (tail);
3055       charset = CHARSET_FROM_ID (XINT (id));
3056       reg = Fcdr (Fassq (id, request));
3057       if (! NILP (reg))
3058         SSET (safe_charsets, XINT (id), XINT (reg));
3059       else if (charset->iso_chars_96)
3060         {
3061           if (reg96 < 4)
3062             SSET (safe_charsets, XINT (id), reg96);
3063         }
3064       else
3065         {
3066           if (reg94 < 4)
3067             SSET (safe_charsets, XINT (id), reg94);
3068         }
3069     }
3070   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3071 }
3072
3073
3074 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3075    Check if a text is encoded in one of ISO-2022 based coding systems.
3076    If it is, return 1, else return 0.  */
3077
3078 static int
3079 detect_coding_iso_2022 (coding, detect_info)
3080      struct coding_system *coding;
3081      struct coding_detection_info *detect_info;
3082 {
3083   const unsigned char *src = coding->source, *src_base = src;
3084   const unsigned char *src_end = coding->source + coding->src_bytes;
3085   int multibytep = coding->src_multibyte;
3086   int single_shifting = 0;
3087   int id;
3088   int c, c1;
3089   int consumed_chars = 0;
3090   int i;
3091   int rejected = 0;
3092   int found = 0;
3093   int composition_count = -1;
3094
3095   detect_info->checked |= CATEGORY_MASK_ISO;
3096
3097   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3098     {
3099       struct coding_system *this = &(coding_categories[i]);
3100       Lisp_Object attrs, val;
3101
3102       if (this->id < 0)
3103         continue;
3104       attrs = CODING_ID_ATTRS (this->id);
3105       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3106           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3107         setup_iso_safe_charsets (attrs);
3108       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3109       this->max_charset_id = SCHARS (val) - 1;
3110       this->safe_charsets = SDATA (val);
3111     }
3112
3113   /* A coding system of this category is always ASCII compatible.  */
3114   src += coding->head_ascii;
3115
3116   while (rejected != CATEGORY_MASK_ISO)
3117     {
3118       src_base = src;
3119       ONE_MORE_BYTE (c);
3120       switch (c)
3121         {
3122         case ISO_CODE_ESC:
3123           if (inhibit_iso_escape_detection)
3124             break;
3125           single_shifting = 0;
3126           ONE_MORE_BYTE (c);
3127           if (c >= '(' && c <= '/')
3128             {
3129               /* Designation sequence for a charset of dimension 1.  */
3130               ONE_MORE_BYTE (c1);
3131               if (c1 < ' ' || c1 >= 0x80
3132                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3133                 /* Invalid designation sequence.  Just ignore.  */
3134                 break;
3135             }
3136           else if (c == '$')
3137             {
3138               /* Designation sequence for a charset of dimension 2.  */
3139               ONE_MORE_BYTE (c);
3140               if (c >= '@' && c <= 'B')
3141                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3142                 id = iso_charset_table[1][0][c];
3143               else if (c >= '(' && c <= '/')
3144                 {
3145                   ONE_MORE_BYTE (c1);
3146                   if (c1 < ' ' || c1 >= 0x80
3147                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3148                     /* Invalid designation sequence.  Just ignore.  */
3149                     break;
3150                 }
3151               else
3152                 /* Invalid designation sequence.  Just ignore it.  */
3153                 break;
3154             }
3155           else if (c == 'N' || c == 'O')
3156             {
3157               /* ESC <Fe> for SS2 or SS3.  */
3158               single_shifting = 1;
3159               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3160               break;
3161             }
3162           else if (c == '1')
3163             {
3164               /* End of composition.  */
3165               if (composition_count < 0
3166                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3167                 /* Invalid */
3168                 break;
3169               composition_count = -1;
3170               found |= CATEGORY_MASK_ISO;
3171             }
3172           else if (c >= '0' && c <= '4')
3173             {
3174               /* ESC <Fp> for start/end composition.  */
3175               composition_count = 0;
3176               break;
3177             }
3178           else
3179             {
3180               /* Invalid escape sequence.  Just ignore it.  */
3181               break;
3182             }
3183
3184           /* We found a valid designation sequence for CHARSET.  */
3185           rejected |= CATEGORY_MASK_ISO_8BIT;
3186           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3187                               id))
3188             found |= CATEGORY_MASK_ISO_7;
3189           else
3190             rejected |= CATEGORY_MASK_ISO_7;
3191           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3192                               id))
3193             found |= CATEGORY_MASK_ISO_7_TIGHT;
3194           else
3195             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3196           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3197                               id))
3198             found |= CATEGORY_MASK_ISO_7_ELSE;
3199           else
3200             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3201           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3202                               id))
3203             found |= CATEGORY_MASK_ISO_8_ELSE;
3204           else
3205             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3206           break;
3207
3208         case ISO_CODE_SO:
3209         case ISO_CODE_SI:
3210           /* Locking shift out/in.  */
3211           if (inhibit_iso_escape_detection)
3212             break;
3213           single_shifting = 0;
3214           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3215           break;
3216
3217         case ISO_CODE_CSI:
3218           /* Control sequence introducer.  */
3219           single_shifting = 0;
3220           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3221           found |= CATEGORY_MASK_ISO_8_ELSE;
3222           goto check_extra_latin;
3223
3224         case ISO_CODE_SS2:
3225         case ISO_CODE_SS3:
3226           /* Single shift.   */
3227           if (inhibit_iso_escape_detection)
3228             break;
3229           single_shifting = 0;
3230           rejected |= CATEGORY_MASK_ISO_7BIT;
3231           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3232               & CODING_ISO_FLAG_SINGLE_SHIFT)
3233             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3234           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3235               & CODING_ISO_FLAG_SINGLE_SHIFT)
3236             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3237           if (single_shifting)
3238             break;
3239           goto check_extra_latin;
3240
3241         default:
3242           if (c < 0)
3243             continue;
3244           if (c < 0x80)
3245             {
3246               if (composition_count >= 0)
3247                 composition_count++;
3248               single_shifting = 0;
3249               break;
3250             }
3251           if (c >= 0xA0)
3252             {
3253               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3254               found |= CATEGORY_MASK_ISO_8_1;
3255               /* Check the length of succeeding codes of the range
3256                  0xA0..0xFF.  If the byte length is even, we include
3257                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3258                  only when we are not single shifting.  */
3259               if (! single_shifting
3260                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3261                 {
3262                   int i = 1;
3263                   while (src < src_end)
3264                     {
3265                       src_base = src;
3266                       ONE_MORE_BYTE (c);
3267                       if (c < 0xA0)
3268                         {
3269                           src = src_base;
3270                           break;
3271                         }
3272                       i++;
3273                     }
3274
3275                   if (i & 1 && src < src_end)
3276                     {
3277                       rejected |= CATEGORY_MASK_ISO_8_2;
3278                       if (composition_count >= 0)
3279                         composition_count += i;
3280                     }
3281                   else
3282                     {
3283                       found |= CATEGORY_MASK_ISO_8_2;
3284                       if (composition_count >= 0)
3285                         composition_count += i / 2;
3286                     }
3287                 }
3288               break;
3289             }
3290         check_extra_latin:
3291           single_shifting = 0;
3292           if (! VECTORP (Vlatin_extra_code_table)
3293               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3294             {
3295               rejected = CATEGORY_MASK_ISO;
3296               break;
3297             }
3298           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3299               & CODING_ISO_FLAG_LATIN_EXTRA)
3300             found |= CATEGORY_MASK_ISO_8_1;
3301           else
3302             rejected |= CATEGORY_MASK_ISO_8_1;
3303           rejected |= CATEGORY_MASK_ISO_8_2;
3304         }
3305     }
3306   detect_info->rejected |= CATEGORY_MASK_ISO;
3307   return 0;
3308
3309  no_more_source:
3310   detect_info->rejected |= rejected;
3311   detect_info->found |= (found & ~rejected);
3312   return 1;
3313 }
3314
3315
3316 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3317    escape sequence should be kept.

        The charset designated to graphic register REG is looked up with
        ISO_CHARSET_TABLE (DIM, CHARS_96, FINAL).  If FINAL is outside
        '0'..127, the lookup yields a negative id, or the charset is not
        safe for CODING (SAFE_CHARSET_P), the designation of REG is set to
        -2, CHARS_96 is set to -1, and the `break' leaves the macro.
        Otherwise the charset id is stored as the designation of REG,
        after replacing charset_jisx0201_roman with charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 with
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        `coding' is taken from the expansion site, and CHARS_96 must be
        an lvalue because it is assigned to.  */
3318 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3319   do {                                                                  \
3320     int id, prev;                                                       \
3321                                                                         \
3322     if (final < '0' || final >= 128                                     \
3323         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3324         || !SAFE_CHARSET_P (coding, id))                                \
3325       {                                                                 \
3326         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3327         chars_96 = -1;                                                  \
3328         break;                                                          \
3329       }                                                                 \
3330     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3331     if (id == charset_jisx0201_roman)                                   \
3332       {                                                                 \
3333         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3334           id = charset_ascii;                                           \
3335       }                                                                 \
3336     else if (id == charset_jisx0208_1978)                               \
3337       {                                                                 \
3338         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3339           id = charset_jisx0208;                                        \
3340       }                                                                 \
3341     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3342     /* If there was an invalid designation to REG previously, and this  \
3343        designation is ASCII to REG, we should keep this designation     \
3344        sequence.  */                                                    \
3345     if (prev == -2 && id == charset_ascii)                              \
3346       chars_96 = -1;                                                    \
3347   } while (0)
3348
3349
3350 /* Handle these composition sequences (ALT: alternate char):
3351
3352    (1) relative composition: ESC 0 CHAR ... ESC 1
3353    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3354    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3355    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3356
3357    When the start sequence (ESC 0/2/3/4) is found, this annotation
3358    header is produced.
3359
3360         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3361
3362    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3363    produced until the end sequence (ESC 1) is found:
3364
3365    (1) CHAR ... CHAR
3366    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3367    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3368    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3369
3370    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3371    annotation header are updated as below:
3372
3373    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3374    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3375    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3376    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3377
3378    If an error is found while composing, the annotation header is
3379    changed to:
3380
3381         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3382
3383    and the sequence [ -2 DECODED-RULE ] is changed to the original
3384    byte sequence as below:
3385         o the original byte sequence is B: [ B -1 ]
3386         o the original byte sequence is B1 B2: [ B1 B2 ]
3387    and the sequence [ -1 -1 ] is changed to the original byte
3388    sequence:
3389         [ ESC '0' ]
3390 */
3391
3392 /* Decode a composition rule C1 and maybe one more byte from the
3393    source, and set RULE to the encoded composition rule, NBYTES to the
3394    length of the composition rule.  If the rule is invalid, set RULE
3395    to some negative value.

        `c1' is read from the expansion site.  When C1 - 32 is negative,
        RULE keeps that negative value and NBYTES is not assigned.  A
        value below 81 is the one-byte old format: it is split into
        GREF = value / 9 and NREF = value % 9, with 4 mapped to 10 in
        each.  Any other value is the two-byte new format: the second
        byte is fetched with ONE_MORE_BYTE, and a non-negative result of
        COMPOSITION_ENCODE_RULE gets 0x100 added.  */
3396
3397 #define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
3398   do {                                                                  \
3399     rule = c1 - 32;                                                     \
3400     if (rule < 0)                                                       \
3401       break;                                                            \
3402     if (rule < 81)              /* old format (before ver.21) */        \
3403       {                                                                 \
3404         int gref = (rule) / 9;                                          \
3405         int nref = (rule) % 9;                                          \
3406         if (gref == 4) gref = 10;                                       \
3407         if (nref == 4) nref = 10;                                       \
3408         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3409         nbytes = 1;                                                     \
3410       }                                                                 \
3411     else                        /* new format (after ver.21) */         \
3412       {                                                                 \
3413         int c;                                                          \
3414                                                                         \
3415         ONE_MORE_BYTE (c);                                              \
3416         rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32);             \
3417         if (rule >= 0)                                                  \
3418           rule += 0x100;   /* to distinguish it from the old format */  \
3419         nbytes = 2;                                                     \
3420       }                                                                 \
3421   } while (0)
3422
     /* Turn the encoded composition rule RULE back into the byte(s) it
        was decoded from, storing them at charbuf[idx] and
        charbuf[idx + 1].  GREF and NREF are (RULE % 0x100) / 12 and
        (RULE % 0x100) % 12.  A RULE below 0x100 is the old one-byte
        format: the byte 32 + GREF * 9 + NREF (with 10 mapped back to 4
        in each) is followed by -1 and new_chars is incremented by 1.
        Otherwise the new two-byte format 32 + 81 + GREF, 32 + NREF is
        stored and new_chars is incremented by 2.

        `charbuf', `idx' and `new_chars' come from the expansion site.
        RULE is evaluated several times, always before charbuf[idx + 1]
        is overwritten, so it may name that element.  */
3423 #define ENCODE_COMPOSITION_RULE(rule)                           \
3424   do {                                                          \
3425     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3426                                                                 \
3427     if (rule < 0x100)           /* old format */                \
3428       {                                                         \
3429         if (gref == 10) gref = 4;                               \
3430         if (nref == 10) nref = 4;                               \
3431         charbuf[idx] = 32 + gref * 9 + nref;                    \
3432         charbuf[idx + 1] = -1;                                  \
3433         new_chars++;                                            \
3434       }                                                         \
3435     else                                /* new format */        \
3436       {                                                         \
3437         charbuf[idx] = 32 + 81 + gref;                          \
3438         charbuf[idx + 1] = 32 + nref;                           \
3439         new_chars += 2;                                         \
3440       }                                                         \
3441   } while (0)
3442
3443 /* Finish the current composition as invalid.

        CHARBUF points just past the elements produced so far for the
        composition described by CMP_STATUS; those elements are the
        CMP_STATUS->length ones that precede CHARBUF, starting with the
        annotation header.  The first five elements are overwritten with
        the original escape sequence (ESC followed by '0', '2', '3' or
        '4' according to CMP_STATUS->method) and then -2, 0, -1.  When
        the method is at least COMPOSITION_WITH_RULE, the remaining
        elements are rescanned: each [ -2 RULE ] pair is turned back into
        the rule's byte(s) by ENCODE_COMPOSITION_RULE, and each -1 marker
        and the element after it are replaced by ESC '0'.
        CMP_STATUS->state is set to COMPOSING_NO.

        Return CMP_STATUS->nchars, plus 1 or 2 for each restored rule and
        2 for each restored ESC '0'; MAYBE_FINISH_COMPOSITION adds this
        value to char_offset.  NOTE(review): the two characters restored
        in the header are not included in the count -- confirm that this
        is intended.  */
3444
3445 static int finish_composition P_ ((int *, struct composition_status *));
3446
3447 static int
3448 finish_composition (charbuf, cmp_status)
3449      int *charbuf;
3450      struct composition_status *cmp_status;
3451 {
3452   int idx = - cmp_status->length;  /* negative index of the header start */
3453   int new_chars;
3454
3455   /* Recover the original ESC sequence in the first two header slots,
          then overwrite the three remaining header slots.  */
3456   charbuf[idx++] = ISO_CODE_ESC;
3457   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3458                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3459                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3460                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3461                     : '4');
3462   charbuf[idx++] = -2;
3463   charbuf[idx++] = 0;
3464   charbuf[idx++] = -1;
3465   new_chars = cmp_status->nchars;
       /* NOTE(review): the rescan is skipped for methods below
          COMPOSITION_WITH_RULE, presumably because COMPOSITION_RELATIVE
          never stores rules or -1 markers -- confirm against the enum
          order.  ENCODE_COMPOSITION_RULE below refers to `charbuf', `idx'
          and `new_chars' by name.  */
3466   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3467     for (; idx < 0; idx++)
3468       {
3469         int elt = charbuf[idx];
3470
3471         if (elt == -2)
3472           {
3473             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3474             idx++;  /* skip the second slot, just rewritten by the macro */
3475           }
3476         else if (elt == -1)
3477           {
3478             charbuf[idx++] = ISO_CODE_ESC;
3479             charbuf[idx] = '0';
3480             new_chars += 2;
3481           }
3482       }
3483   cmp_status->state = COMPOSING_NO;
3484   return new_chars;
3485 }
3486
3487 /* If characters are under composition (the state is not
        COMPOSING_NO), finish the composition as invalid and add the
        count returned by finish_composition to char_offset.
        `cmp_status', `charbuf' and `char_offset' come from the expansion
        site.  */
3488 #define MAYBE_FINISH_COMPOSITION()                              \
3489   do {                                                          \
3490     if (cmp_status->state != COMPOSING_NO)                      \
3491       char_offset += finish_composition (charbuf, cmp_status);  \
3492   } while (0)
3493
3494 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3495
3496    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3497    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3498    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3499    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3500
3501    Produce this annotation sequence now:
3502
3503    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]

        NOTE(review): finish_composition rewrites five header elements and
        ADD_COMPOSITION_DATA is passed two zeros before METHOD here, so
        the header is presumably
        [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ];
        confirm against the definition of ADD_COMPOSITION_DATA.

        C1 is the byte after ESC.  When C1 is '0' and the state is
        COMPOSING_COMPONENT_CHAR with COMPOSITION_WITH_ALTCHARS, or
        COMPOSING_COMPONENT_RULE with COMPOSITION_WITH_RULE_ALTCHARS, no
        new header is produced: two -1 elements are stored, the state
        becomes COMPOSING_CHAR, and the length grows by 2.  In every
        other case any composition in progress is finished as invalid,
        the method is chosen from C1, the state becomes COMPOSING_CHAR
        for C1 <= '2' and COMPOSING_COMPONENT_CHAR otherwise, the header
        is emitted, the length is set to MAX_ANNOTATION_LENGTH, the
        counters nchars and ncomps are cleared, and coding->annotated is
        set.

        `cmp_status', `charbuf', `char_offset' and `coding' come from the
        expansion site.
3504 */
3505
3506 #define DECODE_COMPOSITION_START(c1)                                       \
3507   do {                                                                     \
3508     if (c1 == '0'                                                          \
3509         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3510              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3511             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3512                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3513       {                                                                    \
3514         *charbuf++ = -1;                                                   \
3515         *charbuf++= -1;                                                    \
3516         cmp_status->state = COMPOSING_CHAR;                                \
3517         cmp_status->length += 2;                                           \
3518       }                                                                    \
3519     else                                                                   \
3520       {                                                                    \
3521         MAYBE_FINISH_COMPOSITION ();                                       \
3522         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3523                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3524                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3525                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3526         cmp_status->state                                                  \
3527           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3528         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3529         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3530         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3531         coding->annotated = 1;                                             \
3532       }                                                                    \
3533   } while (0)
3534
3535
3536 /* Handle composition end sequence ESC 1.

        The sequence is rejected when no composed character has been
        stored (nchars == 0) or when the state does not fit the method:
        for COMPOSITION_WITH_RULE the state must not be COMPOSING_CHAR,
        for every other method it must be COMPOSING_CHAR.  On rejection
        the composition is finished as invalid and control jumps to the
        label `invalid_code' at the expansion site.

        Otherwise the annotation header, which starts at
        charbuf[- cmp_status->length], is completed: its first element
        has NCOMPS + 2 (COMPOSITION_WITH_ALTCHARS) or NCOMPS * 3
        (COMPOSITION_WITH_RULE_ALTCHARS) subtracted from it, its third
        element is set to nchars, char_offset is advanced by nchars, and
        the state is set to COMPOSING_NO.

        `cmp_status', `charbuf' and `char_offset' come from the expansion
        site.  */
3537
3538 #define DECODE_COMPOSITION_END()                                        \
3539   do {                                                                  \
3540     if (cmp_status->nchars == 0                                         \
3541         || ((cmp_status->state == COMPOSING_CHAR)                       \
3542             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3543       {                                                                 \
3544         MAYBE_FINISH_COMPOSITION ();                                    \
3545         goto invalid_code;                                              \
3546       }                                                                 \
3547     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3548       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3549     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3550       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3551     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3552     char_offset += cmp_status->nchars;                                  \
3553     cmp_status->state = COMPOSING_NO;                                   \
3554   } while (0)
3555
3556 /* Store a composition rule RULE in charbuf, and update cmp_status.

        The rule is stored as the pair [ -2 RULE ], which adds 2 to
        cmp_status->length.  The state is then decremented.
        NOTE(review): this presumably steps from COMPOSING_RULE or
        COMPOSING_COMPONENT_RULE back to the matching *_CHAR state, which
        requires each *_RULE enumerator to be one greater than its *_CHAR
        counterpart -- confirm against the enum declaration.

        `charbuf' and `cmp_status' come from the expansion site.  */
3557
3558 #define STORE_COMPOSITION_RULE(rule)    \
3559   do {                                  \
3560     *charbuf++ = -2;                    \
3561     *charbuf++ = rule;                  \
3562     cmp_status->length += 2;            \
3563     cmp_status->state--;                \
3564   } while (0)
3565
3566 /* Store a composed char or a component char C in charbuf, and update
3567    cmp_status.

        The length grows by 1.  In state COMPOSING_CHAR the character is
        counted in nchars, in any other state in ncomps.  The state is
        then incremented for COMPOSITION_WITH_RULE, and for
        COMPOSITION_WITH_RULE_ALTCHARS only while the state is
        COMPOSING_COMPONENT_CHAR.  NOTE(review): the increment presumably
        selects the matching *_RULE state so that a rule is read next --
        confirm against the enum declaration.

        `charbuf' and `cmp_status' come from the expansion site.  */
3568
3569 #define STORE_COMPOSITION_CHAR(c)                                       \
3570   do {                                                                  \
3571     *charbuf++ = (c);                                                   \
3572     cmp_status->length++;                                               \
3573     if (cmp_status->state == COMPOSING_CHAR)                            \
3574       cmp_status->nchars++;                                             \
3575     else                                                                \
3576       cmp_status->ncomps++;                                             \
3577     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3578         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3579             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3580       cmp_status->state++;                                              \
3581   } while (0)
3582
3583
3584 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3585
3586 static void
3587 decode_coding_iso_2022 (coding)
3588      struct coding_system *coding;
3589 {
3590   const unsigned char *src = coding->source + coding->consumed;
3591   const unsigned char *src_end = coding->source + coding->src_bytes;
3592   const unsigned char *src_base;
3593   int *charbuf = coding->charbuf + coding->charbuf_used;
3594   /* We may produce two annotations (charset and composition) in one
3595      loop and one more charset annotation at the end.  */
3596   int *charbuf_end
3597     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3598   int consumed_chars = 0, consumed_chars_base;
3599   int multibytep = coding->src_multibyte;
3600   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3601   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3602   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3603   int charset_id_2, charset_id_3;
3604   struct charset *charset;
3605   int c;
3606   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3607   Lisp_Object attrs, charset_list;
3608   int char_offset = coding->produced_char;
3609   int last_offset = char_offset;
3610   int last_id = charset_ascii;
3611   int eol_crlf =
3612     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3613   int byte_after_cr = -1;
3614   int i;
3615
3616   CODING_GET_INFO (coding, attrs, charset_list);
3617   setup_iso_safe_charsets (attrs);
3618   /* Charset list may have been changed.  */
3619   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3620   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3621
3622   if (cmp_status->state != COMPOSING_NO)
3623     {
3624       for (i = 0; i < cmp_status->length; i++)
3625         *charbuf++ = cmp_status->carryover[i];
3626       coding->annotated = 1;
3627     }
3628
3629   while (1)
3630     {
3631       int c1, c2, c3;
3632
3633       src_base = src;
3634       consumed_chars_base = consumed_chars;
3635
3636       if (charbuf >= charbuf_end)
3637         {
3638           if (byte_after_cr >= 0)
3639             src_base--;
3640           break;
3641         }
3642
3643       if (byte_after_cr >= 0)
3644         c1 = byte_after_cr, byte_after_cr = -1;
3645       else
3646         ONE_MORE_BYTE (c1);
3647       if (c1 < 0)
3648         goto invalid_code;
3649
3650       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3651         {
3652           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3653           char_offset++;
3654           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3655           continue;
3656         }
3657
3658       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3659         {
3660           if (c1 == ISO_CODE_ESC)
3661             {
3662               if (src + 1 >= src_end)
3663                 goto no_more_source;
3664               *charbuf++ = ISO_CODE_ESC;
3665               char_offset++;
3666               if (src[0] == '%' && src[1] == '@')
3667                 {
3668                   src += 2;
3669                   consumed_chars += 2;
3670                   char_offset += 2;
3671                   /* We are sure charbuf can contain two more chars. */
3672                   *charbuf++ = '%';
3673                   *charbuf++ = '@';
3674                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3675                 }
3676             }
3677           else
3678             {
3679               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3680               char_offset++;
3681             }
3682           continue;
3683         }
3684
3685       if ((cmp_status->state == COMPOSING_RULE
3686            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3687           && c1 != ISO_CODE_ESC)
3688         {
3689           int rule, nbytes;
3690
3691           DECODE_COMPOSITION_RULE (rule, nbytes);
3692           if (rule < 0)
3693             goto invalid_code;
3694           STORE_COMPOSITION_RULE (rule);
3695           continue;
3696         }
3697
3698       /* We produce at most one character.  */
3699       switch (iso_code_class [c1])
3700         {
3701         case ISO_0x20_or_0x7F:
3702           if (charset_id_0 < 0
3703               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3704             /* This is SPACE or DEL.  */
3705             charset = CHARSET_FROM_ID (charset_ascii);
3706           else
3707             charset = CHARSET_FROM_ID (charset_id_0);
3708           break;
3709
3710         case ISO_graphic_plane_0:
3711           if (charset_id_0 < 0)
3712             charset = CHARSET_FROM_ID (charset_ascii);
3713           else
3714             charset = CHARSET_FROM_ID (charset_id_0);
3715           break;
3716
3717         case ISO_0xA0_or_0xFF:
3718           if (charset_id_1 < 0
3719               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3720               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3721             goto invalid_code;
3722           /* This is a graphic character, we fall down ... */
3723
3724         case ISO_graphic_plane_1:
3725           if (charset_id_1 < 0)
3726             goto invalid_code;
3727           charset = CHARSET_FROM_ID (charset_id_1);
3728           break;
3729
3730         case ISO_control_0:
3731           if (eol_crlf && c1 == '\r')
3732             ONE_MORE_BYTE (byte_after_cr);
3733           MAYBE_FINISH_COMPOSITION ();
3734           charset = CHARSET_FROM_ID (charset_ascii);
3735           break;
3736
3737         case ISO_control_1:
3738           goto invalid_code;
3739
3740         case ISO_shift_out:
3741           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3742               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3743             goto invalid_code;
3744           CODING_ISO_INVOCATION (coding, 0) = 1;
3745           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3746           continue;
3747
3748         case ISO_shift_in:
3749           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3750             goto invalid_code;
3751           CODING_ISO_INVOCATION (coding, 0) = 0;
3752           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3753           continue;
3754
3755         case ISO_single_shift_2_7:
3756           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3757             goto invalid_code;
3758         case ISO_single_shift_2:
3759           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3760             goto invalid_code;
3761           /* SS2 is handled as an escape sequence of ESC 'N' */
3762           c1 = 'N';
3763           goto label_escape_sequence;
3764
3765         case ISO_single_shift_3:
3766           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3767             goto invalid_code;
3768           /* SS3 is handled as an escape sequence of ESC 'O' */
3769           c1 = 'O';
3770           goto label_escape_sequence;
3771
3772         case ISO_control_sequence_introducer:
3773           /* CSI is handled as an escape sequence of ESC '[' ...  */
3774           c1 = '[';
3775           goto label_escape_sequence;
3776
3777         case ISO_escape:
3778           ONE_MORE_BYTE (c1);
3779         label_escape_sequence:
3780           /* Escape sequences handled here are invocation,
3781              designation, direction specification, and character
3782              composition specification.  */
3783           switch (c1)
3784             {
3785             case '&':           /* revision of following character set */
3786               ONE_MORE_BYTE (c1);
3787               if (!(c1 >= '@' && c1 <= '~'))
3788                 goto invalid_code;
3789               ONE_MORE_BYTE (c1);
3790               if (c1 != ISO_CODE_ESC)
3791                 goto invalid_code;
3792               ONE_MORE_BYTE (c1);
3793               goto label_escape_sequence;
3794
3795             case '$':           /* designation of 2-byte character set */
3796               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3797                 goto invalid_code;
3798               {
3799                 int reg, chars96;
3800
3801                 ONE_MORE_BYTE (c1);
3802                 if (c1 >= '@' && c1 <= 'B')
3803                   {     /* designation of JISX0208.1978, GB2312.1980,
3804                            or JISX0208.1980 */
3805                     reg = 0, chars96 = 0;
3806                   }
3807                 else if (c1 >= 0x28 && c1 <= 0x2B)
3808                   { /* designation of DIMENSION2_CHARS94 character set */
3809                     reg = c1 - 0x28, chars96 = 0;
3810                     ONE_MORE_BYTE (c1);
3811                   }
3812                 else if (c1 >= 0x2C && c1 <= 0x2F)
3813                   { /* designation of DIMENSION2_CHARS96 character set */
3814                     reg = c1 - 0x2C, chars96 = 1;
3815                     ONE_MORE_BYTE (c1);
3816                   }
3817                 else
3818                   goto invalid_code;
3819                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3820                 /* We must update these variables now.  */
3821                 if (reg == 0)
3822                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3823                 else if (reg == 1)
3824                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3825                 if (chars96 < 0)
3826                   goto invalid_code;
3827               }
3828               continue;
3829
3830             case 'n':           /* invocation of locking-shift-2 */
3831               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3832                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3833                 goto invalid_code;
3834               CODING_ISO_INVOCATION (coding, 0) = 2;
3835               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3836               continue;
3837
3838             case 'o':           /* invocation of locking-shift-3 */
3839               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3840                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3841                 goto invalid_code;
3842               CODING_ISO_INVOCATION (coding, 0) = 3;
3843               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3844               continue;
3845
3846             case 'N':           /* invocation of single-shift-2 */
3847               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3848                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3849                 goto invalid_code;
3850               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3851               if (charset_id_2 < 0)
3852                 charset = CHARSET_FROM_ID (charset_ascii);
3853               else
3854                 charset = CHARSET_FROM_ID (charset_id_2);
3855               ONE_MORE_BYTE (c1);
3856               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3857                 goto invalid_code;
3858               break;
3859
3860             case 'O':           /* invocation of single-shift-3 */
3861               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3862                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3863                 goto invalid_code;
3864               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3865               if (charset_id_3 < 0)
3866                 charset = CHARSET_FROM_ID (charset_ascii);
3867               else
3868                 charset = CHARSET_FROM_ID (charset_id_3);
3869               ONE_MORE_BYTE (c1);
3870               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3871                 goto invalid_code;
3872               break;
3873
3874             case '0': case '2': case '3': case '4': /* start composition */
3875               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3876                 goto invalid_code;
3877               if (last_id != charset_ascii)
3878                 {
3879                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3880                   last_id = charset_ascii;
3881                   last_offset = char_offset;
3882                 }
3883               DECODE_COMPOSITION_START (c1);
3884               continue;
3885
3886             case '1':           /* end composition */
3887               if (cmp_status->state == COMPOSING_NO)
3888                 goto invalid_code;
3889               DECODE_COMPOSITION_END ();
3890               continue;
3891
3892             case '[':           /* specification of direction */
3893               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3894                 goto invalid_code;
3895               /* For the moment, nested direction is not supported.
3896                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3897                  left-to-right, and nozero means right-to-left.  */
3898               ONE_MORE_BYTE (c1);
3899               switch (c1)
3900                 {
3901                 case ']':       /* end of the current direction */
3902                   coding->mode &= ~CODING_MODE_DIRECTION;
3903
3904                 case '0':       /* end of the current direction */
3905                 case '1':       /* start of left-to-right direction */
3906                   ONE_MORE_BYTE (c1);
3907                   if (c1 == ']')
3908                     coding->mode &= ~CODING_MODE_DIRECTION;
3909                   else
3910                     goto invalid_code;
3911                   break;
3912
3913                 case '2':       /* start of right-to-left direction */
3914                   ONE_MORE_BYTE (c1);
3915                   if (c1 == ']')
3916                     coding->mode |= CODING_MODE_DIRECTION;
3917                   else
3918                     goto invalid_code;
3919                   break;
3920
3921                 default:
3922                   goto invalid_code;
3923                 }
3924               continue;
3925
3926             case '%':
3927               ONE_MORE_BYTE (c1);
3928               if (c1 == '/')
3929                 {
3930                   /* CTEXT extended segment:
3931                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3932                      We keep these bytes as is for the moment.
3933                      They may be decoded by post-read-conversion.  */
3934                   int dim, M, L;
3935                   int size;
3936
3937                   ONE_MORE_BYTE (dim);
3938                   if (dim < 0 || dim > 4)
3939                     goto invalid_code;
3940                   ONE_MORE_BYTE (M);
3941                   if (M < 128)
3942                     goto invalid_code;
3943                   ONE_MORE_BYTE (L);
3944                   if (L < 128)
3945                     goto invalid_code;
3946                   size = ((M - 128) * 128) + (L - 128);
3947                   if (charbuf + 6 > charbuf_end)
3948                     goto break_loop;
3949                   *charbuf++ = ISO_CODE_ESC;
3950                   *charbuf++ = '%';
3951                   *charbuf++ = '/';
3952                   *charbuf++ = dim;
3953                   *charbuf++ = BYTE8_TO_CHAR (M);
3954                   *charbuf++ = BYTE8_TO_CHAR (L);
3955                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3956                 }
3957               else if (c1 == 'G')
3958                 {
3959                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3960                      ESC % G --UTF-8-BYTES-- ESC % @
3961                      We keep these bytes as is for the moment.
3962                      They may be decoded by post-read-conversion.  */
3963                   if (charbuf + 3 > charbuf_end)
3964                     goto break_loop;
3965                   *charbuf++ = ISO_CODE_ESC;
3966                   *charbuf++ = '%';
3967                   *charbuf++ = 'G';
3968                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3969                 }
3970               else
3971                 goto invalid_code;
3972               continue;
3973               break;
3974
3975             default:
3976               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3977                 goto invalid_code;
3978               {
3979                 int reg, chars96;
3980
3981                 if (c1 >= 0x28 && c1 <= 0x2B)
3982                   { /* designation of DIMENSION1_CHARS94 character set */
3983                     reg = c1 - 0x28, chars96 = 0;
3984                     ONE_MORE_BYTE (c1);
3985                   }
3986                 else if (c1 >= 0x2C && c1 <= 0x2F)
3987                   { /* designation of DIMENSION1_CHARS96 character set */
3988                     reg = c1 - 0x2C, chars96 = 1;
3989                     ONE_MORE_BYTE (c1);
3990                   }
3991                 else
3992                   goto invalid_code;
3993                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3994                 /* We must update these variables now.  */
3995                 if (reg == 0)
3996                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3997                 else if (reg == 1)
3998                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3999                 if (chars96 < 0)
4000                   goto invalid_code;
4001               }
4002               continue;
4003             }
4004         }
4005
4006       if (cmp_status->state == COMPOSING_NO
4007           && charset->id != charset_ascii
4008           && last_id != charset->id)
4009         {
4010           if (last_id != charset_ascii)
4011             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4012           last_id = charset->id;
4013           last_offset = char_offset;
4014         }
4015
4016       /* Now we know CHARSET and 1st position code C1 of a character.
4017          Produce a decoded character while getting 2nd and 3rd
4018          position codes C2, C3 if necessary.  */
4019       if (CHARSET_DIMENSION (charset) > 1)
4020         {
4021           ONE_MORE_BYTE (c2);
4022           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
4023               || ((c1 & 0x80) != (c2 & 0x80)))
4024             /* C2 is not in a valid range.  */
4025             goto invalid_code;
4026           if (CHARSET_DIMENSION (charset) == 2)
4027             c1 = (c1 << 8) | c2;
4028           else
4029             {
4030               ONE_MORE_BYTE (c3);
4031               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
4032                   || ((c1 & 0x80) != (c3 & 0x80)))
4033                 /* C3 is not in a valid range.  */
4034                 goto invalid_code;
4035               c1 = (c1 << 16) | (c2 << 8) | c2;
4036             }
4037         }
4038       c1 &= 0x7F7F7F;
4039       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4040       if (c < 0)
4041         {
4042           MAYBE_FINISH_COMPOSITION ();
4043           for (; src_base < src; src_base++, char_offset++)
4044             {
4045               if (ASCII_BYTE_P (*src_base))
4046                 *charbuf++ = *src_base;
4047               else
4048                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4049             }
4050         }
4051       else if (cmp_status->state == COMPOSING_NO)
4052         {
4053           *charbuf++ = c;
4054           char_offset++;
4055         }
4056       else if ((cmp_status->state == COMPOSING_CHAR
4057                 ? cmp_status->nchars
4058                 : cmp_status->ncomps)
4059                >= MAX_COMPOSITION_COMPONENTS)
4060         {
4061           /* Too long composition.  */
4062           MAYBE_FINISH_COMPOSITION ();
4063           *charbuf++ = c;
4064           char_offset++;
4065         }
4066       else
4067         STORE_COMPOSITION_CHAR (c);
4068       continue;
4069
4070     invalid_code:
4071       MAYBE_FINISH_COMPOSITION ();
4072       src = src_base;
4073       consumed_chars = consumed_chars_base;
4074       ONE_MORE_BYTE (c);
4075       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4076       char_offset++;
4077       coding->errors++;
4078       continue;
4079
4080     break_loop:
4081       break;
4082     }
4083
4084  no_more_source:
4085   if (cmp_status->state != COMPOSING_NO)
4086     {
4087       if (coding->mode & CODING_MODE_LAST_BLOCK)
4088         MAYBE_FINISH_COMPOSITION ();
4089       else
4090         {
4091           charbuf -= cmp_status->length;
4092           for (i = 0; i < cmp_status->length; i++)
4093             cmp_status->carryover[i] = charbuf[i];
4094         }
4095     }
4096   else if (last_id != charset_ascii)
4097     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4098   coding->consumed_char += consumed_chars_base;
4099   coding->consumed = src_base - coding->source;
4100   coding->charbuf_used = charbuf - coding->charbuf;
4101 }
4102
4103
4104 /* ISO2022 encoding stuff.  */
4105
4106 /*
4107    It is not enough to say just "ISO2022" on encoding, we have to
4108    specify more details.  In Emacs, each coding system of ISO2022
4109    variant has the following specifications:
4110         1. Initial designation to G0 thru G3.
4111         2. Allows short-form designation?
4112         3. ASCII should be designated to G0 before control characters?
4113         4. ASCII should be designated to G0 at end of line?
4114         5. 7-bit environment or 8-bit environment?
4115         6. Use locking-shift?
4116         7. Use Single-shift?
4117    And the following two are only for Japanese:
4118         8. Use ASCII in place of JIS0201-1976-Roman?
4119         9. Use JISX0208-1983 in place of JISX0208-1978?
4120    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4121    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4122    details.
4123 */
4124
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST; then record the designation
   in CODING.  If CODING has CODING_ISO_FLAG_REVISION set and CHARSET
   has a non-negative revision number, a revision sequence (ESC & and
   one more byte) is produced first.  For a multibyte 94-charset whose
   <final-char> is '@', 'A', or 'B', designated to register 0 while
   CODING_ISO_FLAG_LONG_FORM is not set, the short-form designation
   (without the intermediate byte) is produced.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    const char *desig_intermediates                                     \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int desig_multibyte = CHARSET_DIMENSION (charset) != 1;             \
    int desig_revision                                                  \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    /* Announce the revision number first, when one applies.  */        \
    if (desig_revision >= 0)                                            \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_revision);                           \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (desig_multibyte)                                                \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
                                                                        \
    /* The intermediate byte is omitted only in the short form: a       \
       multibyte 94-charset whose final byte is @, A or B, sent to     \
       register 0 while CODING_ISO_FLAG_LONG_FORM is not set.  */       \
    if (! desig_multibyte                                               \
        || CHARSET_ISO_CHARS_96 (charset)                               \
        || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
        || reg != 0                                                     \
        || desig_final < '@' || desig_final > 'B')                      \
      EMIT_ONE_ASCII_BYTE (desig_intermediates[reg]);                   \
                                                                        \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4172
4173
/* The two macros below emit the ISO2022 single-shift functions
   single-shift-2 and single-shift-3.  When CODING_ISO_FLAG_SEVEN_BITS
   is set in the flags of `coding' they emit the escape sequence
   (ESC N or ESC O); otherwise they emit the single control byte
   ISO_CODE_SS2 or ISO_CODE_SS3.  Either way the single-shifting
   state of `coding' is then set to 1.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4196
4197
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3), and record in
   `coding' which graphic register is now invoked to graphic plane 0.
   Locking-shift-2 is ESC n and locking-shift-3 is ESC o; these are
   the same final bytes that the ISO2022 decoder in this file accepts
   for the two functions.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* The final byte must be 'o' here: 'n' would be read back as
   locking-shift-2 and invoke the wrong graphic register.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4228
4229
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING_ISO_FLAG_USE_ROMAN
   is set and CHARSET is charset_ascii, it is replaced by
   charset_jisx0201_roman.  C1 may be any integer expression; it is
   parenthesized wherever it is used.  The macro refers to `coding',
   `dst' and `produced_chars' at the point of use.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4272
4273
/* Emit a two-byte character of CHARSET whose position codes are C1
   and C2, first emitting whatever designation or invocation codes
   are still needed.

   CHARSET must be a modifiable lvalue: when CODING_ISO_FLAG_USE_OLDJIS
   is set and CHARSET is charset_jisx0208, it is replaced by
   charset_jisx0208_1978.  The macro refers to `coding', `dst' and
   `produced_chars' at the point of use.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* A pending single shift applies to this character only.  */       \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked on either graphic plane yet: emit the     \
       designation and/or invocation, then go round the loop again      \
       to produce the character itself.  */                             \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4316
4317
/* Encode character C of CHARSET.  The code point is obtained with
   ENCODE_CHAR; it is emitted as a single position code when CHARSET
   has dimension 1, and otherwise as two position codes, namely the
   code point shifted right by 8 bits and its low 8 bits.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int iso_position_code = ENCODE_CHAR ((charset), (c));                  \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                          \
                                       iso_position_code >> 8,             \
                                       iso_position_code & 0xFF);          \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_position_code);      \
  } while (0)
4327
4328
4329 /* Produce designation and invocation codes at a place pointed by DST
4330    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4331    Return new DST.  */
4332
4333 unsigned char *
4334 encode_invocation_designation (charset, coding, dst, p_nchars)
4335      struct charset *charset;
4336      struct coding_system *coding;
4337      unsigned char *dst;
4338      int *p_nchars;
4339 {
4340   int multibytep = coding->dst_multibyte;
4341   int produced_chars = *p_nchars;
4342   int reg;                      /* graphic register number */
4343   int id = CHARSET_ID (charset);
4344
4345   /* At first, check designations.  */
4346   for (reg = 0; reg < 4; reg++)
4347     if (id == CODING_ISO_DESIGNATION (coding, reg))
4348       break;
4349
4350   if (reg >= 4)
4351     {
4352       /* CHARSET is not yet designated to any graphic registers.  */
4353       /* At first check the requested designation.  */
4354       reg = CODING_ISO_REQUEST (coding, id);
4355       if (reg < 0)
4356         /* Since CHARSET requests no special designation, designate it
4357            to graphic register 0.  */
4358         reg = 0;
4359
4360       ENCODE_DESIGNATION (charset, reg, coding);
4361     }
4362
4363   if (CODING_ISO_INVOCATION (coding, 0) != reg
4364       && CODING_ISO_INVOCATION (coding, 1) != reg)
4365     {
4366       /* Since the graphic register REG is not invoked to any graphic
4367          planes, invoke it to graphic plane 0.  */
4368       switch (reg)
4369         {
4370         case 0:                 /* graphic register 0 */
4371           ENCODE_SHIFT_IN;
4372           break;
4373
4374         case 1:                 /* graphic register 1 */
4375           ENCODE_SHIFT_OUT;
4376           break;
4377
4378         case 2:                 /* graphic register 2 */
4379           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4380             ENCODE_SINGLE_SHIFT_2;
4381           else
4382             ENCODE_LOCKING_SHIFT_2;
4383           break;
4384
4385         case 3:                 /* graphic register 3 */
4386           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4387             ENCODE_SINGLE_SHIFT_3;
4388           else
4389             ENCODE_LOCKING_SHIFT_3;
4390           break;
4391         }
4392     }
4393
4394   *p_nchars = produced_chars;
4395   return dst;
4396 }
4397
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits ESC [ when the
   CODING_ISO_FLAG_SEVEN_BITS bit is set in the flags of `coding',
   and the single byte ISO_CODE_CSI otherwise.  The bit is tested
   with `&', as in the single-shift macros, because other flag bits
   may be set at the same time.  It is an object-like macro and takes
   no argument.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R emit the introducer
   followed by the two bytes `2' `]' and `0' `]' respectively.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)


#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)


#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4421
4422
/* Bring the graphic planes and registers of `coding' back to their
   initial state: emit shift-in if graphic plane 0 does not currently
   invoke register 0, then, for each of the four registers that has a
   non-negative initial charset different from its current
   designation, emit the sequence designating that initial charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_reg;                                                      \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                     \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reset_reg);        \
        struct charset *initial_charset;                                \
                                                                        \
        if (initial_id < 0                                              \
            || (initial_id                                              \
                == CODING_ISO_DESIGNATION (coding, reset_reg)))         \
          continue;                                                     \
        initial_charset = CHARSET_FROM_ID (initial_id);                 \
        ENCODE_DESIGNATION (initial_charset, reset_reg, coding);        \
      }                                                                 \
  } while (0)
4441
4442
4443 /* Produce designation sequences of charsets in the line started from
4444    SRC to a place pointed by DST, and return updated DST.
4445
4446    If the current block ends before any end-of-line, we may fail to
4447    find all the necessary designations.  */
4448
4449 static unsigned char *
4450 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4451      struct coding_system *coding;
4452      int *charbuf, *charbuf_end;
4453      unsigned char *dst;
4454 {
4455   struct charset *charset;
4456   /* Table of charsets to be designated to each graphic register.  */
4457   int r[4];
4458   int c, found = 0, reg;
4459   int produced_chars = 0;
4460   int multibytep = coding->dst_multibyte;
4461   Lisp_Object attrs;
4462   Lisp_Object charset_list;
4463
4464   attrs = CODING_ID_ATTRS (coding->id);
4465   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4466   if (EQ (charset_list, Qiso_2022))
4467     charset_list = Viso_2022_charset_list;
4468
4469   for (reg = 0; reg < 4; reg++)
4470     r[reg] = -1;
4471
4472   while (found < 4)
4473     {
4474       int id;
4475
4476       c = *charbuf++;
4477       if (c == '\n')
4478         break;
4479       charset = char_charset (c, charset_list, NULL);
4480       id = CHARSET_ID (charset);
4481       reg = CODING_ISO_REQUEST (coding, id);
4482       if (reg >= 0 && r[reg] < 0)
4483         {
4484           found++;
4485           r[reg] = id;
4486         }
4487     }
4488
4489   if (found)
4490     {
4491       for (reg = 0; reg < 4; reg++)
4492         if (r[reg] >= 0
4493             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4494           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4495     }
4496
4497   return dst;
4498 }
4499
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the characters in CODING->charbuf with an ISO 2022 based
   coding system and store the resulting bytes in CODING->destination,
   after the CODING->produced bytes already there.  On return,
   CODING->produced and CODING->produced_char are updated, and the
   pending "designate at beginning of line" state is saved with
   CODING_ISO_BOL for the next call.  Always return 0.  */

static int
encode_coding_iso_2022 (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Number of bytes requested from ASSURE_DESTINATION before each
     element of CHARBUF is processed.  */
  int safe_room = 16;
  /* Nonzero if designation sequences must be produced before the
     next character: the coding system designates at beginning of
     line and we are at the beginning of a line.  */
  int bol_designation
    = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
       && CODING_ISO_BOL (coding));
  int produced_chars = 0;
  Lisp_Object attrs, eol_type, charset_list;
  int ascii_compatible;
  int c;
  /* Charset ID taken from the latest charset annotation, or -1 if
     there is none or it is not in CHARSET_LIST.  */
  int preferred_charset_id = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  /* An EOL type that is still a vector is treated as Unix style.  */
  if (VECTORP (eol_type))
    eol_type = Qunix;

  setup_iso_safe_charsets (attrs);
  /* Charset list may have been changed.  */
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));

  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);

      if (bol_designation)
        {
          unsigned char *dst_prev = dst;

          /* We have to produce designation sequences if any now.  */
          dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
          bol_designation = 0;
          /* We are sure that designation sequences are all ASCII bytes.  */
          produced_chars += dst - dst_prev;
        }

      c = *charbuf++;

      if (c < 0)
        {
          /* Handle an annotation.  A negative element starts one, and
             the element following it tells the kind.  */
          switch (*charbuf)
            {
            case CODING_ANNOTATE_COMPOSITION_MASK:
              /* Not yet implemented.  */
              break;
            case CODING_ANNOTATE_CHARSET_MASK:
              /* Remember the annotated charset, but only if this
                 coding system supports it.  */
              preferred_charset_id = charbuf[2];
              if (preferred_charset_id >= 0
                  && NILP (Fmemq (make_number (preferred_charset_id),
                                  charset_list)))
                preferred_charset_id = -1;
              break;
            default:
              abort ();
            }
          /* Skip the rest of the annotation: together with C itself,
             -C elements of CHARBUF are consumed.  */
          charbuf += -c - 1;
          continue;
        }

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          /* Control characters.  An end-of-line character may reset
             the ISO 2022 state, depending on the coding system flags.  */
          if (c == '\n'
              || (c == '\r' && EQ (eol_type, Qmac)))
            {
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER ();
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
                {
                  int i;

                  /* Record the initial designations as current without
                     emitting anything.  */
                  for (i = 0; i < 4; i++)
                    CODING_ISO_DESIGNATION (coding, i)
                      = CODING_ISO_INITIAL (coding, i);
                }
              /* The next character starts a new line.  */
              bol_designation
                = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
            }
          else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER ();
          EMIT_ONE_ASCII_BYTE (c);
        }
      else if (ASCII_CHAR_P (c))
        {
          if (ascii_compatible)
            EMIT_ONE_ASCII_BYTE (c);
          else
            {
              struct charset *charset = CHARSET_FROM_ID (charset_ascii);
              ENCODE_ISO_CHARACTER (charset, c);
            }
        }
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is emitted as is.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          struct charset *charset;

          /* Use the annotated charset if it contains C; otherwise
             search CHARSET_LIST.  */
          if (preferred_charset_id >= 0)
            {
              charset = CHARSET_FROM_ID (preferred_charset_id);
              if (! CHAR_CHARSET_P (c, charset))
                charset = char_charset (c, charset_list, NULL);
            }
          else
            charset = char_charset (c, charset_list, NULL);
          if (!charset)
            {
              /* C is not encodable.  Substitute a marker character in
                 safe-encoding mode, else the default character of
                 CODING.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  charset = char_charset (c, charset_list, NULL);
                }
            }
          ENCODE_ISO_CHARACTER (charset, c);
        }
    }

  /* At the end of the last block, reset the state the same way as at
     an end of line, if the coding system asks for that.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
    {
      ASSURE_DESTINATION (safe_room);
      ENCODE_RESET_PLANE_AND_REGISTER ();
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  CODING_ISO_BOL (coding) = bol_designation;
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
4651
4652 \f
/*** 8. SJIS and BIG5 handlers ***/
4654
4655 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4656    quite widely.  So, for the moment, Emacs supports them in the bare
4657    C code.  But, in the future, they may be supported only by CCL.  */
4658
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.

   --- CODE RANGE of SJIS ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   KATAKANA-JISX0201    0xA1 .. 0xDF
   JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

*/
4675
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and each of its characters is encoded in two bytes.

   --- CODE RANGE of BIG5 ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   Big5 (1st byte)      0xA1 .. 0xFE
        (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
   --------------------------

  */
4688
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in SJIS.  CATEGORY_MASK_SJIS is always
   recorded in DETECT_INFO->checked.  If a byte sequence invalid for
   SJIS is found, or the last block runs out of source after SRC has
   advanced past SRC_BASE, record CATEGORY_MASK_SJIS in
   DETECT_INFO->rejected and return 0.  Otherwise return 1, and also
   record CATEGORY_MASK_SJIS in DETECT_INFO->found if at least one
   non-ASCII SJIS code was seen.  */

static int
detect_coding_sjis (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int found = 0;
  int c;
  Lisp_Object attrs, charset_list;
  int max_first_byte_of_2_byte_code;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* When the charset list has more than three elements, first bytes
     up to 0xFC are accepted; otherwise only up to 0xEF.  */
  max_first_byte_of_2_byte_code
    = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);

  detect_info->checked |= CATEGORY_MASK_SJIS;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      /* Remember where the current code starts.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0x80)
        continue;
      if ((c >= 0x81 && c <= 0x9F)
          || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
        {
          /* First byte of a two-byte code; validate the second.  */
          ONE_MORE_BYTE (c);
          if (c < 0x40 || c == 0x7F || c > 0xFC)
            break;
          found = CATEGORY_MASK_SJIS;
        }
      else if (c >= 0xA0 && c < 0xE0)
        /* One-byte code in the kana range.  */
        found = CATEGORY_MASK_SJIS;
      else
        break;
    }
  /* We get here only by `break', i.e. on an invalid byte.  */
  detect_info->rejected |= CATEGORY_MASK_SJIS;
  return 0;

  /* NOTE(review): no `goto' to this label is visible in this function;
     presumably ONE_MORE_BYTE jumps here when the source is exhausted --
     confirm against the macro definition.  */
 no_more_source:
  /* SRC_BASE < SRC means the source ran out after part of the current
     code was read.  That is an error only if no more data follows.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_SJIS;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
4746
4747 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4748    Check if a text is encoded in BIG5.  If it is, return
4749    CATEGORY_MASK_BIG5, else return 0.  */
4750
4751 static int
4752 detect_coding_big5 (coding, detect_info)
4753      struct coding_system *coding;
4754      struct coding_detection_info *detect_info;
4755 {
4756   const unsigned char *src = coding->source, *src_base;
4757   const unsigned char *src_end = coding->source + coding->src_bytes;
4758   int multibytep = coding->src_multibyte;
4759   int consumed_chars = 0;
4760   int found = 0;
4761   int c;
4762
4763   detect_info->checked |= CATEGORY_MASK_BIG5;
4764   /* A coding system of this category is always ASCII compatible.  */
4765   src += coding->head_ascii;
4766
4767   while (1)
4768     {
4769       src_base = src;
4770       ONE_MORE_BYTE (c);
4771       if (c < 0x80)
4772         continue;
4773       if (c >= 0xA1)
4774         {
4775           ONE_MORE_BYTE (c);
4776           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4777             return 0;
4778           found = CATEGORY_MASK_BIG5;
4779         }
4780       else
4781         break;
4782     }
4783   detect_info->rejected |= CATEGORY_MASK_BIG5;
4784   return 0;
4785
4786  no_more_source:
4787   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4788     {
4789       detect_info->rejected |= CATEGORY_MASK_BIG5;
4790       return 0;
4791     }
4792   detect_info->found |= found;
4793   return 1;
4794 }
4795
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   Decode SJIS text from CODING->source, starting after the
   CODING->consumed bytes already handled, and append the decoded
   characters (plus charset annotations) to CODING->charbuf.  A byte
   that does not start a valid SJIS code is stored as a raw 8-bit
   character and counted in CODING->errors.  On return,
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  */

static void
decode_coding_sjis (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  int char_offset = coding->produced_char;
  /* LAST_ID is the charset of the current run of characters and
     LAST_OFFSET is the character offset at which that run started.  */
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_CRLF is nonzero, or -1 if
     there is none pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* The charset list holds, in this order, the charsets for roman,
     kana and kanji, optionally followed by a second kanji charset.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Remember the state before this code, so that it can be
         restored if the code turns out to be invalid or incomplete.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* CHARBUF is full.  Report a pending read-ahead byte as not
             consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* Read the byte after a CR right away, so that the CR is
             consumed only together with the byte following it.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* When a run of a different non-ASCII charset starts, close the
         previous run with a charset annotation.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the invalid code and produce just its
         first byte as a raw 8-bit character.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached by `break' above.  NOTE(review): presumably also the
     target of a `goto' inside ONE_MORE_BYTE when the source is
     exhausted -- confirm against the macro definition.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Only what precedes the last complete code counts as consumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4917
/* Decode BIG5 text from CODING->source, starting after the
   CODING->consumed bytes already handled, and append the decoded
   characters (plus charset annotations) to CODING->charbuf.  A byte
   that does not start a valid BIG5 code is stored as a raw 8-bit
   character and counted in CODING->errors.  On return,
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  */

static void
decode_coding_big5 (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  int char_offset = coding->produced_char;
  /* LAST_ID is the charset of the current run of characters and
     LAST_OFFSET is the character offset at which that run started.  */
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_CRLF is nonzero, or -1 if
     there is none pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* The charset list holds the roman charset followed by the Big5
     charset.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Remember the state before this code, so that it can be
         restored if the code turns out to be invalid or incomplete.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* CHARBUF is full.  Report a pending read-ahead byte as not
             consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* Read the byte after a CR right away, so that the CR is
             consumed only together with the byte following it.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* When a run of a different non-ASCII charset starts, close the
         previous run with a charset annotation.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the invalid code and produce just its
         first byte as a raw 8-bit character.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached by `break' above.  NOTE(review): presumably also the
     target of a `goto' inside ONE_MORE_BYTE when the source is
     exhausted -- confirm against the macro definition.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Only what precedes the last complete code counts as consumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5014
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
   Encode the characters in CODING->charbuf in SJIS and append the
   result to CODING->destination.  The charset list of CODING holds,
   in this order, the charsets for roman, kana and kanji, optionally
   followed by a second kanji charset.  A character of the kanji
   charset is produced as a two-byte code and one of the kana charset
   as a single byte with the high bit set.  A character found in no
   charset of the list is replaced by
   CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding mode, and by
   CODING->default_char otherwise.  Always return 0.  */

static int
encode_coding_sjis (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Number of bytes requested from ASSURE_DESTINATION before each
     character is encoded.  */
  int safe_room = 4;
  int produced_chars = 0;
  Lisp_Object attrs, charset_list, val;
  int ascii_compatible;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  int c;

  CODING_GET_INFO (coding, attrs, charset_list);
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      /* Now encode the character C.  */
      if (ASCII_CHAR_P (c) && ascii_compatible)
        EMIT_ONE_ASCII_BYTE (c);
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is emitted as is.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          unsigned code;
          struct charset *charset = char_charset (c, charset_list, &code);

          if (!charset)
            {
              /* C is not encodable; use a substitute.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  charset = char_charset (c, charset_list, &code);
                }
            }
          if (code == CHARSET_INVALID_CODE (charset))
            abort ();
          if (charset == charset_kanji)
            {
              int c1, c2;
              JIS_TO_SJIS (code);
              c1 = code >> 8, c2 = code & 0xFF;
              EMIT_TWO_BYTES (c1, c2);
            }
          else if (charset == charset_kana)
            EMIT_ONE_BYTE (code | 0x80);
          else if (charset_kanji2 && charset == charset_kanji2)
            {
              int c1, c2;

              /* Only codes whose high byte passes the test below are
                 converted with JIS_TO_SJIS2; any other code is
                 emitted as a single 7-bit byte.
                 NOTE(review): presumably these are the rows of the
                 second kanji charset that SJIS can represent --
                 confirm against the charset definition.  */
              c1 = code >> 8;
              if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
                  || c1 == 0x28
                  || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
                {
                  JIS_TO_SJIS2 (code);
                  c1 = code >> 8, c2 = code & 0xFF;
                  EMIT_TWO_BYTES (c1, c2);
                }
              else
                EMIT_ONE_ASCII_BYTE (code & 0x7F);
            }
          else
            /* Any other charset (this includes the substitution made
               in safe-encoding mode): emit the low 7 bits of CODE.  */
            EMIT_ONE_ASCII_BYTE (code & 0x7F);
        }
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5115
/* Encode the characters in CODING->charbuf in BIG5 and append the
   result to CODING->destination.  The charset list of CODING holds
   the roman charset followed by the Big5 charset.  A character of the
   Big5 charset is produced as a two-byte code.  A character found in
   no charset of the list is replaced by
   CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding mode, and by
   CODING->default_char otherwise.  Always return 0.  */

static int
encode_coding_big5 (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Number of bytes requested from ASSURE_DESTINATION before each
     character is encoded.  */
  int safe_room = 4;
  int produced_chars = 0;
  Lisp_Object attrs, charset_list, val;
  int ascii_compatible;
  struct charset *charset_roman, *charset_big5;
  int c;

  CODING_GET_INFO (coding, attrs, charset_list);
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      /* Now encode the character C.  */
      if (ASCII_CHAR_P (c) && ascii_compatible)
        EMIT_ONE_ASCII_BYTE (c);
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is emitted as is.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          unsigned code;
          struct charset *charset = char_charset (c, charset_list, &code);

          if (! charset)
            {
              /* C is not encodable; use a substitute.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  charset = char_charset (c, charset_list, &code);
                }
            }
          if (code == CHARSET_INVALID_CODE (charset))
            abort ();
          if (charset == charset_big5)
            {
              int c1, c2;

              /* The code point already is the two-byte BIG5 code.  */
              c1 = code >> 8, c2 = code & 0xFF;
              EMIT_TWO_BYTES (c1, c2);
            }
          else
            /* Any other charset (this includes the substitution made
               in safe-encoding mode): emit the low 7 bits of CODE.  */
            EMIT_ONE_ASCII_BYTE (code & 0x7F);
        }
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5186
5187 \f
/*** 9. CCL handlers ***/
5189
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in a coding system of which
   encoder/decoder are written in CCL program.  CATEGORY_MASK_CCL is
   always recorded in DETECT_INFO->checked.  If a byte not marked
   valid for the CCL coding category is found, record
   CATEGORY_MASK_CCL in DETECT_INFO->rejected and return 0.
   Otherwise return 1, and also record CATEGORY_MASK_CCL in
   DETECT_INFO->found if some byte had a validity value greater than
   1.  */

static int
detect_coding_ccl (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int found = 0;
  unsigned char *valids;
  /* Taken from the CODING passed by the caller, before CODING is
     redirected below.  */
  int head_ascii = coding->head_ascii;
  Lisp_Object attrs;

  detect_info->checked |= CATEGORY_MASK_CCL;

  /* From here on, CODING is the coding system registered for the CCL
     category; the source pointers above were already taken from the
     CODING passed by the caller.  */
  coding = &coding_categories[coding_category_ccl];
  valids = CODING_CCL_VALIDS (coding);
  attrs = CODING_ID_ATTRS (coding->id);
  /* The leading ASCII part can be skipped only if that coding system
     is ASCII compatible.  */
  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
    src += head_ascii;

  while (1)
    {
      int c;

      src_base = src;
      ONE_MORE_BYTE (c);
      /* VALIDS is indexed by byte value; zero means invalid.  */
      if (c < 0 || ! valids[c])
        break;
      if ((valids[c] > 1))
        found = CATEGORY_MASK_CCL;
    }
  /* We get here only by `break', i.e. on an invalid byte.  */
  detect_info->rejected |= CATEGORY_MASK_CCL;
  return 0;

  /* NOTE(review): no `goto' to this label is visible in this function;
     presumably ONE_MORE_BYTE jumps here when the source is exhausted --
     confirm against the macro definition.  */
 no_more_source:
  detect_info->found |= found;
  return 1;
}
5235
/* Decode CODING->source, starting after the CODING->consumed bytes
   already handled, by running the CCL program in CODING->spec.ccl,
   and append the result to CODING->charbuf.  The source is fed to the
   program in chunks of at most 1024 elements.  The conversion result
   recorded in CODING reflects the final status of the CCL program.
   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  */

static void
decode_coding_ccl (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int consumed_chars = 0;
  int multibytep = coding->src_multibyte;
  struct ccl_program *ccl = &coding->spec.ccl->ccl;
  /* One chunk of input for the CCL program.  */
  int source_charbuf[1024];
  /* For a multibyte source, SOURCE_BYTEIDX[I] is the byte offset from
     SRC of the I-th element of SOURCE_CHARBUF.  One extra slot holds
     the offset just after the last element.  */
  int source_byteidx[1025];
  Lisp_Object attrs, charset_list;

  CODING_GET_INFO (coding, attrs, charset_list);

  while (1)
    {
      const unsigned char *p = src;
      int i = 0;

      /* Fill SOURCE_CHARBUF with up to 1024 elements starting at SRC:
         characters for a multibyte source, bytes otherwise.  */
      if (multibytep)
        {
          while (i < 1024 && p < src_end)
            {
              source_byteidx[i] = p - src;
              source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
            }
          source_byteidx[i] = p - src;
        }
      else
        while (i < 1024 && p < src_end)
          source_charbuf[i++] = *p++;

      /* Tell the program when this chunk ends the whole input.  */
      if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
        ccl->last_block = 1;
      ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
                  charset_list);
      charbuf += ccl->produced;
      /* Advance SRC by what the program consumed; for a multibyte
         source the element count is converted to a byte count.  */
      if (multibytep)
        src += source_byteidx[ccl->consumed];
      else
        src += ccl->consumed;
      consumed_chars += ccl->consumed;
      /* Go on only if more source remains and the program stopped
         merely because this chunk ran out.  */
      if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
        break;
    }

  /* Translate the final CCL status into a conversion result.  */
  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      record_conversion_result (coding, CODING_RESULT_INTERRUPT);
      break;
    default:
      record_conversion_result (coding, CODING_RESULT_SUCCESS);
      break;
    }
  coding->consumed_char += consumed_chars;
  coding->consumed = src - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5305
5306 static int
5307 encode_coding_ccl (coding)
5308      struct coding_system *coding;
5309 {
5310   struct ccl_program ccl;
5311   int multibytep = coding->dst_multibyte;
5312   int *charbuf = coding->charbuf;
5313   int *charbuf_end = charbuf + coding->charbuf_used;
5314   unsigned char *dst = coding->destination + coding->produced;
5315   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5316   int destination_charbuf[1024];
5317   int i, produced_chars = 0;
5318   Lisp_Object attrs, charset_list;
5319
5320   CODING_GET_INFO (coding, attrs, charset_list);
5321   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
5322
5323   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
5324   ccl.dst_multibyte = coding->dst_multibyte;
5325
5326   while (charbuf < charbuf_end)
5327     {
5328       ccl_driver (&ccl, charbuf, destination_charbuf,
5329                   charbuf_end - charbuf, 1024, charset_list);
5330       if (multibytep)
5331         {
5332           ASSURE_DESTINATION (ccl.produced * 2);
5333           for (i = 0; i < ccl.produced; i++)
5334             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5335         }
5336       else
5337         {
5338           ASSURE_DESTINATION (ccl.produced);
5339           for (i = 0; i < ccl.produced; i++)
5340             *dst++ = destination_charbuf[i] & 0xFF;
5341           produced_chars += ccl.produced;
5342         }
5343       charbuf += ccl.consumed;
5344       if (ccl.status == CCL_STAT_QUIT
5345           || ccl.status == CCL_STAT_INVALID_CMD)
5346         break;
5347     }
5348
5349   switch (ccl.status)
5350     {
5351     case CCL_STAT_SUSPEND_BY_SRC:
5352       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5353       break;
5354     case CCL_STAT_SUSPEND_BY_DST:
5355       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5356       break;
5357     case CCL_STAT_QUIT:
5358     case CCL_STAT_INVALID_CMD:
5359       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5360       break;
5361     default:
5362       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5363       break;
5364     }
5365
5366   coding->produced_char += produced_chars;
5367   coding->produced = dst - coding->destination;
5368   return 0;
5369 }
5370
5371
5372 \f
5373 /*** 10, 11. no-conversion handlers ***/
5374
5375 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5376
5377 static void
5378 decode_coding_raw_text (coding)
5379      struct coding_system *coding;
5380 {
5381   int eol_crlf =
5382     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5383
5384   coding->chars_at_source = 1;
5385   coding->consumed_char = coding->src_chars;
5386   coding->consumed = coding->src_bytes;
5387   if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5388     {
5389       coding->consumed_char--;
5390       coding->consumed--;
5391       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5392     }
5393   else
5394     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5395 }
5396
5397 static int
5398 encode_coding_raw_text (coding)
5399      struct coding_system *coding;
5400 {
5401   int multibytep = coding->dst_multibyte;
5402   int *charbuf = coding->charbuf;
5403   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5404   unsigned char *dst = coding->destination + coding->produced;
5405   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5406   int produced_chars = 0;
5407   int c;
5408
5409   if (multibytep)
5410     {
5411       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5412
5413       if (coding->src_multibyte)
5414         while (charbuf < charbuf_end)
5415           {
5416             ASSURE_DESTINATION (safe_room);
5417             c = *charbuf++;
5418             if (ASCII_CHAR_P (c))
5419               EMIT_ONE_ASCII_BYTE (c);
5420             else if (CHAR_BYTE8_P (c))
5421               {
5422                 c = CHAR_TO_BYTE8 (c);
5423                 EMIT_ONE_BYTE (c);
5424               }
5425             else
5426               {
5427                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5428
5429                 CHAR_STRING_ADVANCE (c, p1);
5430                 while (p0 < p1)
5431                   {
5432                     EMIT_ONE_BYTE (*p0);
5433                     p0++;
5434                   }
5435               }
5436           }
5437       else
5438         while (charbuf < charbuf_end)
5439           {
5440             ASSURE_DESTINATION (safe_room);
5441             c = *charbuf++;
5442             EMIT_ONE_BYTE (c);
5443           }
5444     }
5445   else
5446     {
5447       if (coding->src_multibyte)
5448         {
5449           int safe_room = MAX_MULTIBYTE_LENGTH;
5450
5451           while (charbuf < charbuf_end)
5452             {
5453               ASSURE_DESTINATION (safe_room);
5454               c = *charbuf++;
5455               if (ASCII_CHAR_P (c))
5456                 *dst++ = c;
5457               else if (CHAR_BYTE8_P (c))
5458                 *dst++ = CHAR_TO_BYTE8 (c);
5459               else
5460                 CHAR_STRING_ADVANCE (c, dst);
5461             }
5462         }
5463       else
5464         {
5465           ASSURE_DESTINATION (charbuf_end - charbuf);
5466           while (charbuf < charbuf_end && dst < dst_end)
5467             *dst++ = *charbuf++;
5468         }
5469       produced_chars = dst - (coding->destination + coding->produced);
5470     }
5471   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5472   coding->produced_char += produced_chars;
5473   coding->produced = dst - coding->destination;
5474   return 0;
5475 }
5476
5477 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5478    Check if a text is encoded in a charset-based coding system.  If it
5479    is, return 1, else return 0.  */
5480
5481 static int
5482 detect_coding_charset (coding, detect_info)
5483      struct coding_system *coding;
5484      struct coding_detection_info *detect_info;
5485 {
5486   const unsigned char *src = coding->source, *src_base;
5487   const unsigned char *src_end = coding->source + coding->src_bytes;
5488   int multibytep = coding->src_multibyte;
5489   int consumed_chars = 0;
5490   Lisp_Object attrs, valids, name;
5491   int found = 0;
5492   int head_ascii = coding->head_ascii;
5493   int check_latin_extra = 0;
5494
5495   detect_info->checked |= CATEGORY_MASK_CHARSET;
5496
5497   coding = &coding_categories[coding_category_charset];
5498   attrs = CODING_ID_ATTRS (coding->id);
5499   valids = AREF (attrs, coding_attr_charset_valids);
5500   name = CODING_ID_NAME (coding->id);
5501   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5502                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5503       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5504                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5505     check_latin_extra = 1;
5506
5507   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5508     src += head_ascii;
5509
5510   while (1)
5511     {
5512       int c;
5513       Lisp_Object val;
5514       struct charset *charset;
5515       int dim, idx;
5516
5517       src_base = src;
5518       ONE_MORE_BYTE (c);
5519       if (c < 0)
5520         continue;
5521       val = AREF (valids, c);
5522       if (NILP (val))
5523         break;
5524       if (c >= 0x80)
5525         {
5526           if (c < 0xA0
5527               && check_latin_extra
5528               && (!VECTORP (Vlatin_extra_code_table)
5529                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5530             break;
5531           found = CATEGORY_MASK_CHARSET;
5532         }
5533       if (INTEGERP (val))
5534         {
5535           charset = CHARSET_FROM_ID (XFASTINT (val));
5536           dim = CHARSET_DIMENSION (charset);
5537           for (idx = 1; idx < dim; idx++)
5538             {
5539               if (src == src_end)
5540                 goto too_short;
5541               ONE_MORE_BYTE (c);
5542               if (c < charset->code_space[(dim - 1 - idx) * 2]
5543                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5544                 break;
5545             }
5546           if (idx < dim)
5547             break;
5548         }
5549       else
5550         {
5551           idx = 1;
5552           for (; CONSP (val); val = XCDR (val))
5553             {
5554               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5555               dim = CHARSET_DIMENSION (charset);
5556               while (idx < dim)
5557                 {
5558                   if (src == src_end)
5559                     goto too_short;
5560                   ONE_MORE_BYTE (c);
5561                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5562                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5563                     break;
5564                   idx++;
5565                 }
5566               if (idx == dim)
5567                 {
5568                   val = Qnil;
5569                   break;
5570                 }
5571             }
5572           if (CONSP (val))
5573             break;
5574         }
5575     }
5576  too_short:
5577   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5578   return 0;
5579
5580  no_more_source:
5581   detect_info->found |= found;
5582   return 1;
5583 }
5584
5585 static void
5586 decode_coding_charset (coding)
5587      struct coding_system *coding;
5588 {
5589   const unsigned char *src = coding->source + coding->consumed;
5590   const unsigned char *src_end = coding->source + coding->src_bytes;
5591   const unsigned char *src_base;
5592   int *charbuf = coding->charbuf + coding->charbuf_used;
5593   /* We may produce one charset annocation in one loop and one more at
5594      the end.  */
5595   int *charbuf_end
5596     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5597   int consumed_chars = 0, consumed_chars_base;
5598   int multibytep = coding->src_multibyte;
5599   Lisp_Object attrs, charset_list, valids;
5600   int char_offset = coding->produced_char;
5601   int last_offset = char_offset;
5602   int last_id = charset_ascii;
5603   int eol_crlf =
5604     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5605   int byte_after_cr = -1;
5606
5607   CODING_GET_INFO (coding, attrs, charset_list);
5608   valids = AREF (attrs, coding_attr_charset_valids);
5609
5610   while (1)
5611     {
5612       int c;
5613       Lisp_Object val;
5614       struct charset *charset;
5615       int dim;
5616       int len = 1;
5617       unsigned code;
5618
5619       src_base = src;
5620       consumed_chars_base = consumed_chars;
5621
5622       if (charbuf >= charbuf_end)
5623         {
5624           if (byte_after_cr >= 0)
5625             src_base--;
5626           break;
5627         }
5628
5629       if (byte_after_cr >= 0)
5630         {
5631           c = byte_after_cr;
5632           byte_after_cr = -1;
5633         }
5634       else
5635         {
5636           ONE_MORE_BYTE (c);
5637           if (eol_crlf && c == '\r')
5638             ONE_MORE_BYTE (byte_after_cr);
5639         }
5640       if (c < 0)
5641         goto invalid_code;
5642       code = c;
5643
5644       val = AREF (valids, c);
5645       if (! INTEGERP (val) && ! CONSP (val))
5646         goto invalid_code;
5647       if (INTEGERP (val))
5648         {
5649           charset = CHARSET_FROM_ID (XFASTINT (val));
5650           dim = CHARSET_DIMENSION (charset);
5651           while (len < dim)
5652             {
5653               ONE_MORE_BYTE (c);
5654               code = (code << 8) | c;
5655               len++;
5656             }
5657           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5658                               charset, code, c);
5659         }
5660       else
5661         {
5662           /* VAL is a list of charset IDs.  It is assured that the
5663              list is sorted by charset dimensions (smaller one
5664              comes first).  */
5665           while (CONSP (val))
5666             {
5667               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5668               dim = CHARSET_DIMENSION (charset);
5669               while (len < dim)
5670                 {
5671                   ONE_MORE_BYTE (c);
5672                   code = (code << 8) | c;
5673                   len++;
5674                 }
5675               CODING_DECODE_CHAR (coding, src, src_base,
5676                                   src_end, charset, code, c);
5677               if (c >= 0)
5678                 break;
5679               val = XCDR (val);
5680             }
5681         }
5682       if (c < 0)
5683         goto invalid_code;
5684       if (charset->id != charset_ascii
5685           && last_id != charset->id)
5686         {
5687           if (last_id != charset_ascii)
5688             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5689           last_id = charset->id;
5690           last_offset = char_offset;
5691         }
5692
5693       *charbuf++ = c;
5694       char_offset++;
5695       continue;
5696
5697     invalid_code:
5698       src = src_base;
5699       consumed_chars = consumed_chars_base;
5700       ONE_MORE_BYTE (c);
5701       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5702       char_offset++;
5703       coding->errors++;
5704     }
5705
5706  no_more_source:
5707   if (last_id != charset_ascii)
5708     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5709   coding->consumed_char += consumed_chars_base;
5710   coding->consumed = src_base - coding->source;
5711   coding->charbuf_used = charbuf - coding->charbuf;
5712 }
5713
5714 static int
5715 encode_coding_charset (coding)
5716      struct coding_system *coding;
5717 {
5718   int multibytep = coding->dst_multibyte;
5719   int *charbuf = coding->charbuf;
5720   int *charbuf_end = charbuf + coding->charbuf_used;
5721   unsigned char *dst = coding->destination + coding->produced;
5722   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5723   int safe_room = MAX_MULTIBYTE_LENGTH;
5724   int produced_chars = 0;
5725   Lisp_Object attrs, charset_list;
5726   int ascii_compatible;
5727   int c;
5728
5729   CODING_GET_INFO (coding, attrs, charset_list);
5730   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5731
5732   while (charbuf < charbuf_end)
5733     {
5734       struct charset *charset;
5735       unsigned code;
5736
5737       ASSURE_DESTINATION (safe_room);
5738       c = *charbuf++;
5739       if (ascii_compatible && ASCII_CHAR_P (c))
5740         EMIT_ONE_ASCII_BYTE (c);
5741       else if (CHAR_BYTE8_P (c))
5742         {
5743           c = CHAR_TO_BYTE8 (c);
5744           EMIT_ONE_BYTE (c);
5745         }
5746       else
5747         {
5748           charset = char_charset (c, charset_list, &code);
5749           if (charset)
5750             {
5751               if (CHARSET_DIMENSION (charset) == 1)
5752                 EMIT_ONE_BYTE (code);
5753               else if (CHARSET_DIMENSION (charset) == 2)
5754                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5755               else if (CHARSET_DIMENSION (charset) == 3)
5756                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5757               else
5758                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5759                                  (code >> 8) & 0xFF, code & 0xFF);
5760             }
5761           else
5762             {
5763               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5764                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5765               else
5766                 c = coding->default_char;
5767               EMIT_ONE_BYTE (c);
5768             }
5769         }
5770     }
5771
5772   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5773   coding->produced_char += produced_chars;
5774   coding->produced = dst - coding->destination;
5775   return 0;
5776 }
5777
5778 \f
5779 /*** 10. C library functions ***/
5780
5781 /* Setup coding context CODING from information about CODING_SYSTEM.
5782    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5783    CODING_SYSTEM is invalid, signal an error.  */
5784
5785 void
5786 setup_coding_system (coding_system, coding)
5787      Lisp_Object coding_system;
5788      struct coding_system *coding;
5789 {
5790   Lisp_Object attrs;
5791   Lisp_Object eol_type;
5792   Lisp_Object coding_type;
5793   Lisp_Object val;
5794
5795   if (NILP (coding_system))
5796     coding_system = Qundecided;
5797
5798   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5799
5800   attrs = CODING_ID_ATTRS (coding->id);
5801   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5802
5803   coding->mode = 0;
5804   coding->head_ascii = -1;
5805   if (VECTORP (eol_type))
5806     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5807                             | CODING_REQUIRE_DETECTION_MASK);
5808   else if (! EQ (eol_type, Qunix))
5809     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5810                             | CODING_REQUIRE_ENCODING_MASK);
5811   else
5812     coding->common_flags = 0;
5813   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5814     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5815   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5816     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5817   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5818     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5819
5820   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5821   coding->max_charset_id = SCHARS (val) - 1;
5822   coding->safe_charsets = SDATA (val);
5823   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5824   coding->carryover_bytes = 0;
5825
5826   coding_type = CODING_ATTR_TYPE (attrs);
5827   if (EQ (coding_type, Qundecided))
5828     {
5829       coding->detector = NULL;
5830       coding->decoder = decode_coding_raw_text;
5831       coding->encoder = encode_coding_raw_text;
5832       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5833     }
5834   else if (EQ (coding_type, Qiso_2022))
5835     {
5836       int i;
5837       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5838
5839       /* Invoke graphic register 0 to plane 0.  */
5840       CODING_ISO_INVOCATION (coding, 0) = 0;
5841       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5842       CODING_ISO_INVOCATION (coding, 1)
5843         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5844       /* Setup the initial status of designation.  */
5845       for (i = 0; i < 4; i++)
5846         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5847       /* Not single shifting initially.  */
5848       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5849       /* Beginning of buffer should also be regarded as bol. */
5850       CODING_ISO_BOL (coding) = 1;
5851       coding->detector = detect_coding_iso_2022;
5852       coding->decoder = decode_coding_iso_2022;
5853       coding->encoder = encode_coding_iso_2022;
5854       if (flags & CODING_ISO_FLAG_SAFE)
5855         coding->mode |= CODING_MODE_SAFE_ENCODING;
5856       coding->common_flags
5857         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5858             | CODING_REQUIRE_FLUSHING_MASK);
5859       if (flags & CODING_ISO_FLAG_COMPOSITION)
5860         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5861       if (flags & CODING_ISO_FLAG_DESIGNATION)
5862         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5863       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5864         {
5865           setup_iso_safe_charsets (attrs);
5866           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5867           coding->max_charset_id = SCHARS (val) - 1;
5868           coding->safe_charsets = SDATA (val);
5869         }
5870       CODING_ISO_FLAGS (coding) = flags;
5871       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5872       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5873       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5874       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5875     }
5876   else if (EQ (coding_type, Qcharset))
5877     {
5878       coding->detector = detect_coding_charset;
5879       coding->decoder = decode_coding_charset;
5880       coding->encoder = encode_coding_charset;
5881       coding->common_flags
5882         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5883     }
5884   else if (EQ (coding_type, Qutf_8))
5885     {
5886       val = AREF (attrs, coding_attr_utf_bom);
5887       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5888                                    : EQ (val, Qt) ? utf_with_bom
5889                                    : utf_without_bom);
5890       coding->detector = detect_coding_utf_8;
5891       coding->decoder = decode_coding_utf_8;
5892       coding->encoder = encode_coding_utf_8;
5893       coding->common_flags
5894         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5895       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5896         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5897     }
5898   else if (EQ (coding_type, Qutf_16))
5899     {
5900       val = AREF (attrs, coding_attr_utf_bom);
5901       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5902                                     : EQ (val, Qt) ? utf_with_bom
5903                                     : utf_without_bom);
5904       val = AREF (attrs, coding_attr_utf_16_endian);
5905       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5906                                        : utf_16_little_endian);
5907       CODING_UTF_16_SURROGATE (coding) = 0;
5908       coding->detector = detect_coding_utf_16;
5909       coding->decoder = decode_coding_utf_16;
5910       coding->encoder = encode_coding_utf_16;
5911       coding->common_flags
5912         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5913       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5914         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5915     }
5916   else if (EQ (coding_type, Qccl))
5917     {
5918       coding->detector = detect_coding_ccl;
5919       coding->decoder = decode_coding_ccl;
5920       coding->encoder = encode_coding_ccl;
5921       coding->common_flags
5922         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5923             | CODING_REQUIRE_FLUSHING_MASK);
5924     }
5925   else if (EQ (coding_type, Qemacs_mule))
5926     {
5927       coding->detector = detect_coding_emacs_mule;
5928       coding->decoder = decode_coding_emacs_mule;
5929       coding->encoder = encode_coding_emacs_mule;
5930       coding->common_flags
5931         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5932       coding->spec.emacs_mule.full_support = 1;
5933       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5934           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5935         {
5936           Lisp_Object tail, safe_charsets;
5937           int max_charset_id = 0;
5938
5939           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5940                tail = XCDR (tail))
5941             if (max_charset_id < XFASTINT (XCAR (tail)))
5942               max_charset_id = XFASTINT (XCAR (tail));
5943           safe_charsets = make_uninit_string (max_charset_id + 1);
5944           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5945           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5946                tail = XCDR (tail))
5947             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5948           coding->max_charset_id = max_charset_id;
5949           coding->safe_charsets = SDATA (safe_charsets);
5950           coding->spec.emacs_mule.full_support = 1;
5951         }
5952       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5953       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5954     }
5955   else if (EQ (coding_type, Qshift_jis))
5956     {
5957       coding->detector = detect_coding_sjis;
5958       coding->decoder = decode_coding_sjis;
5959       coding->encoder = encode_coding_sjis;
5960       coding->common_flags
5961         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5962     }
5963   else if (EQ (coding_type, Qbig5))
5964     {
5965       coding->detector = detect_coding_big5;
5966       coding->decoder = decode_coding_big5;
5967       coding->encoder = encode_coding_big5;
5968       coding->common_flags
5969         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5970     }
5971   else                          /* EQ (coding_type, Qraw_text) */
5972     {
5973       coding->detector = NULL;
5974       coding->decoder = decode_coding_raw_text;
5975       coding->encoder = encode_coding_raw_text;
5976       if (! EQ (eol_type, Qunix))
5977         {
5978           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5979           if (! VECTORP (eol_type))
5980             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5981         }
5982
5983     }
5984
5985   return;
5986 }
5987
5988 /* Return a list of charsets supported by CODING.  */
5989
5990 Lisp_Object
5991 coding_charset_list (coding)
5992      struct coding_system *coding;
5993 {
5994   Lisp_Object attrs, charset_list;
5995
5996   CODING_GET_INFO (coding, attrs, charset_list);
5997   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5998     {
5999       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6000
6001       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6002         charset_list = Viso_2022_charset_list;
6003     }
6004   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6005     {
6006       charset_list = Vemacs_mule_charset_list;
6007     }
6008   return charset_list;
6009 }
6010
6011
6012 /* Return a list of charsets supported by CODING-SYSTEM.  */
6013
6014 Lisp_Object
6015 coding_system_charset_list (coding_system)
6016      Lisp_Object coding_system;
6017 {
6018   int id;
6019   Lisp_Object attrs, charset_list;
6020
6021   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
6022   attrs = CODING_ID_ATTRS (id);
6023
6024   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6025     {
6026       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6027
6028       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6029         charset_list = Viso_2022_charset_list;
6030       else
6031         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6032     }
6033   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6034     {
6035       charset_list = Vemacs_mule_charset_list;
6036     }
6037   else
6038     {
6039       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6040     }
6041   return charset_list;
6042 }
6043
6044
6045 /* Return raw-text or one of its subsidiaries that has the same
6046    eol_type as CODING-SYSTEM.  */
6047
6048 Lisp_Object
6049 raw_text_coding_system (coding_system)
6050      Lisp_Object coding_system;
6051 {
6052   Lisp_Object spec, attrs;
6053   Lisp_Object eol_type, raw_text_eol_type;
6054
6055   if (NILP (coding_system))
6056     return Qraw_text;
6057   spec = CODING_SYSTEM_SPEC (coding_system);
6058   attrs = AREF (spec, 0);
6059
6060   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6061     return coding_system;
6062
6063   eol_type = AREF (spec, 2);
6064   if (VECTORP (eol_type))
6065     return Qraw_text;
6066   spec = CODING_SYSTEM_SPEC (Qraw_text);
6067   raw_text_eol_type = AREF (spec, 2);
6068   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6069           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6070           : AREF (raw_text_eol_type, 2));
6071 }
6072
6073
6074 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
6075    does, return one of the subsidiary that has the same eol-spec as
6076    PARENT.  Otherwise, return CODING_SYSTEM.  If PARENT is nil,
6077    inherit end-of-line format from the system's setting
6078    (system_eol_type).  */
6079
6080 Lisp_Object
6081 coding_inherit_eol_type (coding_system, parent)
6082      Lisp_Object coding_system, parent;
6083 {
6084   Lisp_Object spec, eol_type;
6085
6086   if (NILP (coding_system))
6087     coding_system = Qraw_text;
6088   spec = CODING_SYSTEM_SPEC (coding_system);
6089   eol_type = AREF (spec, 2);
6090   if (VECTORP (eol_type))
6091     {
6092       Lisp_Object parent_eol_type;
6093
6094       if (! NILP (parent))
6095         {
6096           Lisp_Object parent_spec;
6097
6098           parent_spec = CODING_SYSTEM_SPEC (parent);
6099           parent_eol_type = AREF (parent_spec, 2);
6100         }
6101       else
6102         parent_eol_type = system_eol_type;
6103       if (EQ (parent_eol_type, Qunix))
6104         coding_system = AREF (eol_type, 0);
6105       else if (EQ (parent_eol_type, Qdos))
6106         coding_system = AREF (eol_type, 1);
6107       else if (EQ (parent_eol_type, Qmac))
6108         coding_system = AREF (eol_type, 2);
6109     }
6110   return coding_system;
6111 }
6112
6113 /* Emacs has a mechanism to automatically detect a coding system if it
6114    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6115    it's impossible to distinguish some coding systems accurately
6116    because they use the same range of codes.  So, at first, coding
6117    systems are categorized into the following categories:
6118
6119    o coding-category-emacs-mule
6120
6121         The category for a coding system which has the same code range
6122         as Emacs' internal format.  Assigned the coding-system (Lisp
6123         symbol) `emacs-mule' by default.
6124
6125    o coding-category-sjis
6126
6127         The category for a coding system which has the same code range
6128         as SJIS.  Assigned the coding-system (Lisp
6129         symbol) `japanese-shift-jis' by default.
6130
6131    o coding-category-iso-7
6132
6133         The category for a coding system which has the same code range
6134         as ISO2022 of 7-bit environment.  This doesn't use any locking
6135         shift and single shift functions.  This can encode/decode all
6136         charsets.  Assigned the coding-system (Lisp symbol)
6137         `iso-2022-7bit' by default.
6138
6139    o coding-category-iso-7-tight
6140
6141         Same as coding-category-iso-7 except that this can
6142         encode/decode only the specified charsets.
6143
6144    o coding-category-iso-8-1
6145
6146         The category for a coding system which has the same code range
6147         as ISO2022 of 8-bit environment and graphic plane 1 used only
6148         for DIMENSION1 charset.  This doesn't use any locking shift
6149         and single shift functions.  Assigned the coding-system (Lisp
6150         symbol) `iso-latin-1' by default.
6151
6152    o coding-category-iso-8-2
6153
6154         The category for a coding system which has the same code range
6155         as ISO2022 of 8-bit environment and graphic plane 1 used only
6156         for DIMENSION2 charset.  This doesn't use any locking shift
6157         and single shift functions.  Assigned the coding-system (Lisp
6158         symbol) `japanese-iso-8bit' by default.
6159
6160    o coding-category-iso-7-else
6161
6162         The category for a coding system which has the same code range
6163         as ISO2022 of 7-bit environment but uses locking shift or
6164         single shift functions.  Assigned the coding-system (Lisp
6165         symbol) `iso-2022-7bit-lock' by default.
6166
6167    o coding-category-iso-8-else
6168
6169         The category for a coding system which has the same code range
6170         as ISO2022 of 8-bit environment but uses locking shift or
6171         single shift functions.  Assigned the coding-system (Lisp
6172         symbol) `iso-2022-8bit-ss2' by default.
6173
6174    o coding-category-big5
6175
6176         The category for a coding system which has the same code range
6177         as BIG5.  Assigned the coding-system (Lisp symbol)
6178         `cn-big5' by default.
6179
6180    o coding-category-utf-8
6181
6182         The category for a coding system which has the same code range
6183         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6184         symbol) `utf-8' by default.
6185
6186    o coding-category-utf-16-be
6187
6188         The category for a coding system in which a text has an
6189         Unicode signature (cf. Unicode Standard) in the order of BIG
6190         endian at the head.  Assigned the coding-system (Lisp symbol)
6191         `utf-16-be' by default.
6192
6193    o coding-category-utf-16-le
6194
6195         The category for a coding system in which a text has an
6196         Unicode signature (cf. Unicode Standard) in the order of
6197         LITTLE endian at the head.  Assigned the coding-system (Lisp
6198         symbol) `utf-16-le' by default.
6199
6200    o coding-category-ccl
6201
6202         The category for a coding system of which encoder/decoder is
6203         written in CCL programs.  The default value is nil, i.e., no
6204         coding system is assigned.
6205
6206    o coding-category-binary
6207
6208         The category for a coding system not categorized in any of the
6209         above.  Assigned the coding-system (Lisp symbol)
6210         `no-conversion' by default.
6211
6212    Each of them is a Lisp symbol and the value is an actual
6213    `coding-system's (this is also a Lisp symbol) assigned by a user.
6214    What Emacs does actually is to detect a category of coding system.
6215    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6216    decide only one possible category, it selects a category of the
6217    highest priority.  Priorities of categories are also specified by a
6218    user in a Lisp variable `coding-category-list'.
6219
6220 */
6221
6222 #define EOL_SEEN_NONE   0 /* No end-of-line has been seen yet.  */
6223 #define EOL_SEEN_LF     1 /* A '\n' was seen.  */
6224 #define EOL_SEEN_CR     2 /* A '\r' not followed by '\n' was seen.  */
6225 #define EOL_SEEN_CRLF   4 /* A '\r' followed by '\n' was seen.  */
6226
6227 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6228    SOURCE is encoded.  If CATEGORY is one of
6229    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6230    two-byte, else they are encoded by one-byte.
6231
6232    Return one of EOL_SEEN_XXX.  */
6233
6234 #define MAX_EOL_CHECK_COUNT 3 /* detect_eol stops scanning once it has counted this many end-of-lines.  */
6235
6236 static int
6237 detect_eol (source, src_bytes, category)
6238      const unsigned char *source;
6239      EMACS_INT src_bytes;
6240      enum coding_category category;
6241 {
6242   const unsigned char *src = source, *src_end = src + src_bytes;
6243   unsigned char c;
6244   int total  = 0;
6245   int eol_seen = EOL_SEEN_NONE;
6246
6247   if ((1 << category) & CATEGORY_MASK_UTF_16)
6248     {
6249       int msb, lsb;
6250
6251       msb = category == (coding_category_utf_16_le
6252                          | coding_category_utf_16_le_nosig);
6253       lsb = 1 - msb;
6254
6255       while (src + 1 < src_end)
6256         {
6257           c = src[lsb];
6258           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6259             {
6260               int this_eol;
6261
6262               if (c == '\n')
6263                 this_eol = EOL_SEEN_LF;
6264               else if (src + 3 >= src_end
6265                        || src[msb + 2] != 0
6266                        || src[lsb + 2] != '\n')
6267                 this_eol = EOL_SEEN_CR;
6268               else
6269                 {
6270                   this_eol = EOL_SEEN_CRLF;
6271                   src += 2;
6272                 }
6273
6274               if (eol_seen == EOL_SEEN_NONE)
6275                 /* This is the first end-of-line.  */
6276                 eol_seen = this_eol;
6277               else if (eol_seen != this_eol)
6278                 {
6279                   /* The found type is different from what found before.
6280                      Allow for stray ^M characters in DOS EOL files.  */
6281                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6282                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6283                     eol_seen = EOL_SEEN_CRLF;
6284                   else
6285                     {
6286                       eol_seen = EOL_SEEN_LF;
6287                       break;
6288                     }
6289                 }
6290               if (++total == MAX_EOL_CHECK_COUNT)
6291                 break;
6292             }
6293           src += 2;
6294         }
6295     }
6296   else
6297     {
6298       while (src < src_end)
6299         {
6300           c = *src++;
6301           if (c == '\n' || c == '\r')
6302             {
6303               int this_eol;
6304
6305               if (c == '\n')
6306                 this_eol = EOL_SEEN_LF;
6307               else if (src >= src_end || *src != '\n')
6308                 this_eol = EOL_SEEN_CR;
6309               else
6310                 this_eol = EOL_SEEN_CRLF, src++;
6311
6312               if (eol_seen == EOL_SEEN_NONE)
6313                 /* This is the first end-of-line.  */
6314                 eol_seen = this_eol;
6315               else if (eol_seen != this_eol)
6316                 {
6317                   /* The found type is different from what found before.
6318                      Allow for stray ^M characters in DOS EOL files.  */
6319                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6320                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6321                     eol_seen = EOL_SEEN_CRLF;
6322                   else
6323                     {
6324                       eol_seen = EOL_SEEN_LF;
6325                       break;
6326                     }
6327                 }
6328               if (++total == MAX_EOL_CHECK_COUNT)
6329                 break;
6330             }
6331         }
6332     }
6333   return eol_seen;
6334 }
6335
6336
6337 static Lisp_Object
6338 adjust_coding_eol_type (coding, eol_seen)
6339      struct coding_system *coding;
6340      int eol_seen;
6341 {
6342   Lisp_Object eol_type;
6343
6344   eol_type = CODING_ID_EOL_TYPE (coding->id);
6345   if (eol_seen & EOL_SEEN_LF)
6346     {
6347       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6348       eol_type = Qunix;
6349     }
6350   else if (eol_seen & EOL_SEEN_CRLF)
6351     {
6352       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6353       eol_type = Qdos;
6354     }
6355   else if (eol_seen & EOL_SEEN_CR)
6356     {
6357       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6358       eol_type = Qmac;
6359     }
6360   return eol_type;
6361 }
6362
6363 /* Detect how a text specified in CODING is encoded.  If a coding
6364    system is detected, update fields of CODING by the detected coding
6365    system.  */
6366
6367 void
6368 detect_coding (coding)
6369      struct coding_system *coding;
6370 {
6371   const unsigned char *src, *src_end;
6372   int saved_mode = coding->mode;
6373
6374   coding->consumed = coding->consumed_char = 0;
6375   coding->produced = coding->produced_char = 0;
6376   coding_set_source (coding);
6377
6378   src_end = coding->source + coding->src_bytes;
6379   coding->head_ascii = 0;
6380
6381   /* If we have not yet decided the text encoding type, detect it
6382      now.  */
6383   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6384     {
6385       int c, i;
6386       struct coding_detection_info detect_info;
6387       int null_byte_found = 0, eight_bit_found = 0;
6388
6389       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6390       for (src = coding->source; src < src_end; src++)
6391         {
6392           c = *src;
6393           if (c & 0x80)
6394             {
6395               eight_bit_found = 1;
6396               if (null_byte_found)
6397                 break;
6398             }
6399           else if (c < 0x20)
6400             {
6401               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6402                   && ! inhibit_iso_escape_detection
6403                   && ! detect_info.checked)
6404                 {
6405                   if (detect_coding_iso_2022 (coding, &detect_info))
6406                     {
6407                       /* We have scanned the whole data.  */
6408                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6409                         {
6410                           /* We didn't find an 8-bit code.  We may
6411                              have found a null-byte, but it's very
6412                              rare that a binary file confirm to
6413                              ISO-2022.  */
6414                           src = src_end;
6415                           coding->head_ascii = src - coding->source;
6416                         }
6417                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6418                       break;
6419                     }
6420                 }
6421               else if (! c && !inhibit_null_byte_detection)
6422                 {
6423                   null_byte_found = 1;
6424                   if (eight_bit_found)
6425                     break;
6426                 }
6427               if (! eight_bit_found)
6428                 coding->head_ascii++;
6429             }
6430           else if (! eight_bit_found)
6431             coding->head_ascii++;
6432         }
6433
6434       if (null_byte_found || eight_bit_found
6435           || coding->head_ascii < coding->src_bytes
6436           || detect_info.found)
6437         {
6438           enum coding_category category;
6439           struct coding_system *this;
6440
6441           if (coding->head_ascii == coding->src_bytes)
6442             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6443             for (i = 0; i < coding_category_raw_text; i++)
6444               {
6445                 category = coding_priorities[i];
6446                 this = coding_categories + category;
6447                 if (detect_info.found & (1 << category))
6448                   break;
6449               }
6450           else
6451             {
6452               if (null_byte_found)
6453                 {
6454                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6455                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6456                 }
6457               for (i = 0; i < coding_category_raw_text; i++)
6458                 {
6459                   category = coding_priorities[i];
6460                   this = coding_categories + category;
6461                   if (this->id < 0)
6462                     {
6463                       /* No coding system of this category is defined.  */
6464                       detect_info.rejected |= (1 << category);
6465                     }
6466                   else if (category >= coding_category_raw_text)
6467                     continue;
6468                   else if (detect_info.checked & (1 << category))
6469                     {
6470                       if (detect_info.found & (1 << category))
6471                         break;
6472                     }
6473                   else if ((*(this->detector)) (coding, &detect_info)
6474                            && detect_info.found & (1 << category))
6475                     {
6476                       if (category == coding_category_utf_16_auto)
6477                         {
6478                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6479                             category = coding_category_utf_16_le;
6480                           else
6481                             category = coding_category_utf_16_be;
6482                         }
6483                       break;
6484                     }
6485                 }
6486             }
6487
6488           if (i < coding_category_raw_text)
6489             setup_coding_system (CODING_ID_NAME (this->id), coding);
6490           else if (null_byte_found)
6491             setup_coding_system (Qno_conversion, coding);
6492           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6493                    == CATEGORY_MASK_ANY)
6494             setup_coding_system (Qraw_text, coding);
6495           else if (detect_info.rejected)
6496             for (i = 0; i < coding_category_raw_text; i++)
6497               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6498                 {
6499                   this = coding_categories + coding_priorities[i];
6500                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6501                   break;
6502                 }
6503         }
6504     }
6505   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6506            == coding_category_utf_8_auto)
6507     {
6508       Lisp_Object coding_systems;
6509       struct coding_detection_info detect_info;
6510
6511       coding_systems
6512         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6513       detect_info.found = detect_info.rejected = 0;
6514       coding->head_ascii = 0;
6515       if (CONSP (coding_systems)
6516           && detect_coding_utf_8 (coding, &detect_info))
6517         {
6518           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6519             setup_coding_system (XCAR (coding_systems), coding);
6520           else
6521             setup_coding_system (XCDR (coding_systems), coding);
6522         }
6523     }
6524   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6525            == coding_category_utf_16_auto)
6526     {
6527       Lisp_Object coding_systems;
6528       struct coding_detection_info detect_info;
6529
6530       coding_systems
6531         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6532       detect_info.found = detect_info.rejected = 0;
6533       coding->head_ascii = 0;
6534       if (CONSP (coding_systems)
6535           && detect_coding_utf_16 (coding, &detect_info))
6536         {
6537           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6538             setup_coding_system (XCAR (coding_systems), coding);
6539           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6540             setup_coding_system (XCDR (coding_systems), coding);
6541         }
6542     }
6543   coding->mode = saved_mode;
6544 }
6545
6546
6547 static void
6548 decode_eol (coding)
6549      struct coding_system *coding;
6550 {
6551   Lisp_Object eol_type;
6552   unsigned char *p, *pbeg, *pend;
6553
6554   eol_type = CODING_ID_EOL_TYPE (coding->id);
6555   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6556     return;
6557
6558   if (NILP (coding->dst_object))
6559     pbeg = coding->destination;
6560   else
6561     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6562   pend = pbeg + coding->produced;
6563
6564   if (VECTORP (eol_type))
6565     {
6566       int eol_seen = EOL_SEEN_NONE;
6567
6568       for (p = pbeg; p < pend; p++)
6569         {
6570           if (*p == '\n')
6571             eol_seen |= EOL_SEEN_LF;
6572           else if (*p == '\r')
6573             {
6574               if (p + 1 < pend && *(p + 1) == '\n')
6575                 {
6576                   eol_seen |= EOL_SEEN_CRLF;
6577                   p++;
6578                 }
6579               else
6580                 eol_seen |= EOL_SEEN_CR;
6581             }
6582         }
6583       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6584       if ((eol_seen & EOL_SEEN_CRLF) != 0
6585           && (eol_seen & EOL_SEEN_CR) != 0
6586           && (eol_seen & EOL_SEEN_LF) == 0)
6587         eol_seen = EOL_SEEN_CRLF;
6588       else if (eol_seen != EOL_SEEN_NONE
6589           && eol_seen != EOL_SEEN_LF
6590           && eol_seen != EOL_SEEN_CRLF
6591           && eol_seen != EOL_SEEN_CR)
6592         eol_seen = EOL_SEEN_LF;
6593       if (eol_seen != EOL_SEEN_NONE)
6594         eol_type = adjust_coding_eol_type (coding, eol_seen);
6595     }
6596
6597   if (EQ (eol_type, Qmac))
6598     {
6599       for (p = pbeg; p < pend; p++)
6600         if (*p == '\r')
6601           *p = '\n';
6602     }
6603   else if (EQ (eol_type, Qdos))
6604     {
6605       int n = 0;
6606
6607       if (NILP (coding->dst_object))
6608         {
6609           /* Start deleting '\r' from the tail to minimize the memory
6610              movement.  */
6611           for (p = pend - 2; p >= pbeg; p--)
6612             if (*p == '\r')
6613               {
6614                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6615                 n++;
6616               }
6617         }
6618       else
6619         {
6620           int pos_byte = coding->dst_pos_byte;
6621           int pos = coding->dst_pos;
6622           int pos_end = pos + coding->produced_char - 1;
6623
6624           while (pos < pos_end)
6625             {
6626               p = BYTE_POS_ADDR (pos_byte);
6627               if (*p == '\r' && p[1] == '\n')
6628                 {
6629                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6630                   n++;
6631                   pos_end--;
6632                 }
6633               pos++;
6634               if (coding->dst_multibyte)
6635                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6636               else
6637                 pos_byte++;
6638             }
6639         }
6640       coding->produced -= n;
6641       coding->produced_char -= n;
6642     }
6643 }
6644
6645
6646 /* Return a translation table (or list of them) from coding system
6647    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6648    decoding (ENCODEP is zero). */
6649
6650 static Lisp_Object
6651 get_translation_table (attrs, encodep, max_lookup)
6652      Lisp_Object attrs;
6653      int encodep, *max_lookup;
6654 {
6655   Lisp_Object standard, translation_table;
6656   Lisp_Object val;
6657
6658   if (NILP (Venable_character_translation))
6659     {
6660       if (max_lookup)
6661         *max_lookup = 0;
6662       return Qnil;
6663     }
6664   if (encodep)
6665     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6666       standard = Vstandard_translation_table_for_encode;
6667   else
6668     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6669       standard = Vstandard_translation_table_for_decode;
6670   if (NILP (translation_table))
6671     translation_table = standard;
6672   else
6673     {
6674       if (SYMBOLP (translation_table))
6675         translation_table = Fget (translation_table, Qtranslation_table);
6676       else if (CONSP (translation_table))
6677         {
6678           translation_table = Fcopy_sequence (translation_table);
6679           for (val = translation_table; CONSP (val); val = XCDR (val))
6680             if (SYMBOLP (XCAR (val)))
6681               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6682         }
6683       if (CHAR_TABLE_P (standard))
6684         {
6685           if (CONSP (translation_table))
6686             translation_table = nconc2 (translation_table,
6687                                         Fcons (standard, Qnil));
6688           else
6689             translation_table = Fcons (translation_table,
6690                                        Fcons (standard, Qnil));
6691         }
6692     }
6693
6694   if (max_lookup)
6695     {
6696       *max_lookup = 1;
6697       if (CHAR_TABLE_P (translation_table)
6698           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6699         {
6700           val = XCHAR_TABLE (translation_table)->extras[1];
6701           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6702             *max_lookup = XFASTINT (val);
6703         }
6704       else if (CONSP (translation_table))
6705         {
6706           Lisp_Object tail, val;
6707
6708           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6709             if (CHAR_TABLE_P (XCAR (tail))
6710                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6711               {
6712                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6713                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6714                   *max_lookup = XFASTINT (val);
6715               }
6716         }
6717     }
6718   return translation_table;
6719 }
6720
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables (anything else leaves C unchanged and TRANS nil).

   C and TRANS must be lvalues; both are assigned, and every argument
   is evaluated more than once.

   When the value found for C is itself a character, C is replaced by
   it and TRANS is reset to Qnil.  With a list, the tables are tried
   in order, each one seeing the C left by the previous ones, and the
   search stops at the first table giving a non-nil value that is not
   a character; that value is left in TRANS.  So on exit TRANS is
   either Qnil or a non-character translation entry that the caller
   still has to interpret.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6745
6746
6747 /* Return a translation of character(s) at BUF according to TRANS.
6748    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6749    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6750    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6751    translation is found, and Qnil if not found..
6752    If BUF is too short to lookup characters in FROM, return Qt.  */
6753
6754 static Lisp_Object
6755 get_translation (trans, buf, buf_end)
6756      Lisp_Object trans;
6757      int *buf, *buf_end;
6758 {
6759
6760   if (INTEGERP (trans))
6761     return trans;
6762   for (; CONSP (trans); trans = XCDR (trans))
6763     {
6764       Lisp_Object val = XCAR (trans);
6765       Lisp_Object from = XCAR (val);
6766       int len = ASIZE (from);
6767       int i;
6768
6769       for (i = 0; i < len; i++)
6770         {
6771           if (buf + i == buf_end)
6772             return Qt;
6773           if (XINT (AREF (from, i)) != buf[i])
6774             break;
6775         }
6776       if (i == len)
6777         return val;
6778     }
6779   return Qnil;
6780 }
6781
6782
/* Store the decoded text of CODING at CODING->destination, starting
   CODING->produced bytes into it, growing the destination with
   alloc_destination when it runs short.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf and translated through TRANSLATION_TABLE; negative
   elements of CODING->charbuf are annotation data and are skipped.
   Otherwise the bytes at CODING->source (CODING->consumed of them)
   are copied, converting between unibyte and multibyte form when the
   source and the destination differ in that respect.

   LAST_BLOCK nonzero means that no more source will follow, so a
   multi-character translation that cannot be completed must not be
   postponed.

   CODING->produced and CODING->produced_char are updated, and if the
   destination is a buffer the new text is made part of it with
   insert_from_gap.  The value is the number of elements left
   unprocessed at the end of CODING->charbuf (always 0 when
   CODING->chars_at_source is nonzero).  */

static int
produce_chars (coding, translation_table, last_block)
     struct coding_system *coding;
     Lisp_Object translation_table;
     int last_block;
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  EMACS_INT produced;
  EMACS_INT produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      if (EQ (coding->src_object, coding->dst_object))
        {
          /* Source and destination are the same object: the output
             may only extend up to the end of the consumed source.  */
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf, i;

          if (c >= 0)
            {
              /* Number of characters consumed from BUF and number of
                 characters they are translated to.  */
              int from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      /* TRANS is ([FROM-CHAR ...] . TO).  */
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  else if (EQ (trans, Qt) && ! last_block)
                    /* Not enough characters to decide; leave the
                       rest of BUF to be carried over.  */
                    break;
                }

              if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
                {
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              for (i = 0; i < to_nchars; i++)
                {
                  /* C already holds the first target character; the
                     following ones come from the vector TRANS.  */
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* NOTE(review): multibytep, consumed_chars and src_base
                 are not referenced by name below; presumably the
                 ONE_MORE_BYTE macro (defined elsewhere in this file)
                 uses them -- confirm before renaming or removing.  */
              int multibytep = 1;
              EMACS_INT consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  /* NOTE(review): this loop has no exit of its own;
                     presumably ONE_MORE_BYTE jumps to the label
                     no_more_source when the source is exhausted --
                     confirm against the macro definition.  */
                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          /* Remember the position as an offset, and
                             recompute the pointers after the
                             destination has been grown.  */
                          EMACS_INT offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->src_bytes;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            while (src < src_end)
              {
                /* NOTE(review): multibytep is presumably read by the
                   EMIT_ONE_BYTE macro -- confirm.  */
                int multibytep = 1;
                int c = *src++;

                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        EMACS_INT offset = src - coding->source;
                        EMACS_INT more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->src_bytes;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Source and destination agree on multibyteness, so the
             bytes are copied unchanged.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              EMACS_INT require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  EMACS_INT offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->src_bytes;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes stored by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
6978
6979 /* Compose text in CODING->object according to the annotation data at
6980    CHARBUF.  CHARBUF is an array:
6981      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6982  */
6983
6984 static INLINE void
6985 produce_composition (coding, charbuf, pos)
6986      struct coding_system *coding;
6987      int *charbuf;
6988      EMACS_INT pos;
6989 {
6990   int len;
6991   EMACS_INT to;
6992   enum composition_method method;
6993   Lisp_Object components;
6994
6995   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6996   to = pos + charbuf[2];
6997   method = (enum composition_method) (charbuf[4]);
6998
6999   if (method == COMPOSITION_RELATIVE)
7000     components = Qnil;
7001   else
7002     {
7003       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7004       int i, j;
7005
7006       if (method == COMPOSITION_WITH_RULE)
7007         len = charbuf[2] * 3 - 2;
7008       charbuf += MAX_ANNOTATION_LENGTH;
7009       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7010       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7011         {
7012           if (charbuf[i] >= 0)
7013             args[j] = make_number (charbuf[i]);
7014           else
7015             {
7016               i++;
7017               args[j] = make_number (charbuf[i] % 0x100);
7018             }
7019         }
7020       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7021     }
7022   compose_text (pos, to, components, Qnil, coding->dst_object);
7023 }
7024
7025
7026 /* Put `charset' property on text in CODING->object according to
7027    the annotation data at CHARBUF.  CHARBUF is an array:
7028      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7029  */
7030
7031 static INLINE void
7032 produce_charset (coding, charbuf, pos)
7033      struct coding_system *coding;
7034      int *charbuf;
7035      EMACS_INT pos;
7036 {
7037   EMACS_INT from = pos - charbuf[2];
7038   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7039
7040   Fput_text_property (make_number (from), make_number (pos),
7041                       Qcharset, CHARSET_NAME (charset),
7042                       coding->dst_object);
7043 }
7044
7045
/* Number of `int' elements first requested for the work area
   CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca and record its
   size, in elements, in CODING->charbuf_size.

   CHARBUF_SIZE elements are requested first; each time alloca
   returns NULL the request is halved, as long as it stays above 1024
   elements.  If every attempt fails, CODING_RESULT_INSUFFICIENT_MEM
   is recorded and the macro executes `return coding->result', i.e.
   it returns from the function in which it is expanded.

   Because the memory comes from alloca in the expanding function, it
   is only valid until that function returns.  CODING is used without
   parentheses, so it must be a plain identifier.

   NOTE(review): many alloca implementations never return NULL, in
   which case the retry loop and the failure path are never taken --
   confirm for the supported platforms.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    coding->charbuf = NULL;                                             \
    while (size > 1024)                                                 \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
7067
7068
7069 static void
7070 produce_annotation (coding, pos)
7071      struct coding_system *coding;
7072      EMACS_INT pos;
7073 {
7074   int *charbuf = coding->charbuf;
7075   int *charbuf_end = charbuf + coding->charbuf_used;
7076
7077   if (NILP (coding->dst_object))
7078     return;
7079
7080   while (charbuf < charbuf_end)
7081     {
7082       if (*charbuf >= 0)
7083         pos++, charbuf++;
7084       else
7085         {
7086           int len = -*charbuf;
7087
7088           if (len > 2)
7089             switch (charbuf[1])
7090               {
7091               case CODING_ANNOTATE_COMPOSITION_MASK:
7092                 produce_composition (coding, charbuf, pos);
7093                 break;
7094               case CODING_ANNOTATE_CHARSET_MASK:
7095                 produce_charset (coding, charbuf, pos);
7096                 break;
7097               }
7098           charbuf += len;
7099         }
7100     }
7101 }
7102
/* Decode the data at CODING->src_object into CODING->dst_object.
   CODING->src_object is a buffer, a string, or nil.
   CODING->dst_object is a buffer.

   If CODING->src_object is a buffer, it must be the current buffer.
   In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer, otherwise, the source text is in the
   gap area of the buffer, and CODING->src_pos specifies the offset of
   the text from GPT (which must be the same as PT).  If this is the
   same buffer as CODING->dst_object, CODING->src_pos must be
   negative.

   If CODING->src_object is a string, CODING->src_pos is an index to
   that string.

   If CODING->src_object is nil, CODING->source must already point to
   the non-relocatable memory area.  In this case, CODING->src_pos is
   an offset from CODING->source.

   The decoded data is inserted at the current point of the buffer
   CODING->dst_object.

   The consumed/produced counters, the error count and the conversion
   result of CODING are reset before decoding starts.  The value
   returned is CODING->result.
*/

static int
decode_coding (coding)
     struct coding_system *coding;
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  struct ccl_spec cclspec;
  int carryover;
  int i;

  /* If the source text of a buffer straddles the gap, move the gap to
     the start of the source so that the text does not span it.  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  /* Make the destination buffer current, put the gap at point, and
     turn off undo recording while decoding.  The saved undo list is
     put back at the end of this function, where one record_insert
     call covers the whole produced text.  */
  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      if (current_buffer != XBUFFER (coding->dst_object))
        set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);
      undo_list = current_buffer->undo_list;
      current_buffer->undo_list = Qt;
    }

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->errors = 0;

  /* NOTE(review): this macro is defined earlier in the file; the
     macro text just before produce_annotation contains a `return
     coding->result;', so presumably this line can leave the function
     early -- confirm.  */
  ALLOC_CONVERSION_WORK_AREA (coding);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  carryover = 0;
  /* The CCL decoder gets its program state from CCLSPEC, which lives
     on this function's stack.  */
  if (coding->decoder == decode_coding_ccl)
    {
      coding->spec.ccl = &cclspec;
      setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
    }
  /* Decode one charbuf-full at a time.  The value returned by
     produce_chars is used as the count of trailing charbuf elements
     to keep: they are moved to the front of the charbuf and the next
     round starts filling after them.  */
  do
    {
      /* Destination position matching the first charbuf element of
         this round; used when applying annotations.  */
      EMACS_INT pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  while (coding->result == CODING_RESULT_INSUFFICIENT_DST
         || (coding->consumed < coding->src_bytes
             && (coding->result == CODING_RESULT_SUCCESS
                 || coding->result == CODING_RESULT_INVALID_SRC)));

  /* Produce whatever is still held back, this time with the last
     argument of produce_chars set to 1.  */
  if (carryover > 0)
    {
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  /* Deal with source bytes the decoder left unconsumed.  */
  coding->carryover_bytes = 0;
  if (coding->consumed < coding->src_bytes)
    {
      int nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  */
          coding->charbuf_used = 0;
          coding->chars_at_source = 0;

          while (nbytes-- > 0)
            {
              int c = *src++;

              /* Bytes with the high bit set become the corresponding
                 eight-bit (raw byte) characters.  */
              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  */
          unsigned char *p = coding->carryover;

          /* Clamp anyway, so that the copy below can never write
             past the end of coding->carryover.  */
          if (nbytes > sizeof coding->carryover)
            nbytes = sizeof coding->carryover;
          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      /* Either way, all source bytes now count as consumed.  */
      coding->consumed = coding->src_bytes;
    }

  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
      && !inhibit_eol_conversion)
    decode_eol (coding);
  if (BUFFERP (coding->dst_object))
    {
      /* Restore undo recording and record the insertion as a whole.  */
      current_buffer->undo_list = undo_list;
      record_insert (coding->dst_pos, coding->produced_char);
    }
  return coding->result;
}
7253
7254
7255 /* Extract an annotation datum from a composition starting at POS and
7256    ending before LIMIT of CODING->src_object (buffer or string), store
7257    the data in BUF, set *STOP to a starting position of the next
7258    composition (if any) or to LIMIT, and return the address of the
7259    next element of BUF.
7260
7261    If such an annotation is not found, set *STOP to a starting
7262    position of a composition after POS (if any) or to LIMIT, and
7263    return BUF.  */
7264
7265 static INLINE int *
7266 handle_composition_annotation (pos, limit, coding, buf, stop)
7267      EMACS_INT pos, limit;
7268      struct coding_system *coding;
7269      int *buf;
7270      EMACS_INT *stop;
7271 {
7272   EMACS_INT start, end;
7273   Lisp_Object prop;
7274
7275   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7276       || end > limit)
7277     *stop = limit;
7278   else if (start > pos)
7279     *stop = start;
7280   else
7281     {
7282       if (start == pos)
7283         {
7284           /* We found a composition.  Store the corresponding
7285              annotation data in BUF.  */
7286           int *head = buf;
7287           enum composition_method method = COMPOSITION_METHOD (prop);
7288           int nchars = COMPOSITION_LENGTH (prop);
7289
7290           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7291           if (method != COMPOSITION_RELATIVE)
7292             {
7293               Lisp_Object components;
7294               int len, i, i_byte;
7295
7296               components = COMPOSITION_COMPONENTS (prop);
7297               if (VECTORP (components))
7298                 {
7299                   len = XVECTOR (components)->size;
7300                   for (i = 0; i < len; i++)
7301                     *buf++ = XINT (AREF (components, i));
7302                 }
7303               else if (STRINGP (components))
7304                 {
7305                   len = SCHARS (components);
7306                   i = i_byte = 0;
7307                   while (i < len)
7308                     {
7309                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7310                       buf++;
7311                     }
7312                 }
7313               else if (INTEGERP (components))
7314                 {
7315                   len = 1;
7316                   *buf++ = XINT (components);
7317                 }
7318               else if (CONSP (components))
7319                 {
7320                   for (len = 0; CONSP (components);
7321                        len++, components = XCDR (components))
7322                     *buf++ = XINT (XCAR (components));
7323                 }
7324               else
7325                 abort ();
7326               *head -= len;
7327             }
7328         }
7329
7330       if (find_composition (end, limit, &start, &end, &prop,
7331                             coding->src_object)
7332           && end <= limit)
7333         *stop = start;
7334       else
7335         *stop = limit;
7336     }
7337   return buf;
7338 }
7339
7340
7341 /* Extract an annotation datum from a text property `charset' at POS of
7342    CODING->src_object (buffer of string), store the data in BUF, set
7343    *STOP to the position where the value of `charset' property changes
7344    (limiting by LIMIT), and return the address of the next element of
7345    BUF.
7346
7347    If the property value is nil, set *STOP to the position where the
7348    property value is non-nil (limiting by LIMIT), and return BUF.  */
7349
7350 static INLINE int *
7351 handle_charset_annotation (pos, limit, coding, buf, stop)
7352      EMACS_INT pos, limit;
7353      struct coding_system *coding;
7354      int *buf;
7355      EMACS_INT *stop;
7356 {
7357   Lisp_Object val, next;
7358   int id;
7359
7360   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7361   if (! NILP (val) && CHARSETP (val))
7362     id = XINT (CHARSET_SYMBOL_ID (val));
7363   else
7364     id = -1;
7365   ADD_CHARSET_DATA (buf, 0, id);
7366   next = Fnext_single_property_change (make_number (pos), Qcharset,
7367                                        coding->src_object,
7368                                        make_number (limit));
7369   *stop = XINT (next);
7370   return buf;
7371 }
7372
7373
/* Read characters from CODING->source, starting at byte offset
   CODING->consumed, and store them in CODING->charbuf; encode_coding
   calls this before each run of the encoder.

   While copying, CR is turned into LF when CODING->mode has
   CODING_MODE_SELECTIVE_DISPLAY set, LF is converted according to the
   eol type of CODING (unless inhibit_eol_conversion is set), charset
   annotations are added when requested by CODING->common_flags, and
   each character is looked up in TRANSLATION_TABLE when that is
   non-nil.  MAX_LOOKUP is the number of characters that may be
   collected for one translation lookup.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe what was read and stored, and
   CODING->chars_at_source is 0.  */

static void
consume_chars (coding, translation_table, max_lookup)
     struct coding_system *coding;
     Lisp_Object translation_table;
     int max_lookup;
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  EMACS_INT pos = coding->src_pos + coding->consumed_char;
  EMACS_INT end_pos = coding->src_pos + coding->src_chars;
  int multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  EMACS_INT stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  /* The lookup buffer is needed only when translating.  */
  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  /* A vector eol type is handled like `unix', i.e. no conversion.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  /* STOP is the next position at which annotations must be looked
     for.  With no source object there are none, so it is END_POS.  */
  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          if (pos == end_pos)
            break;
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          /* The next stop is whichever annotation boundary comes
             first.  */
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      if (! multibytep)
        {
          EMACS_INT bytes;

          /* Unibyte source: the raw-text and CCL encoders take every
             byte as is.  For other encoders a valid multibyte
             sequence is read as one character (POS advances by its
             byte length); any other byte becomes an eight-bit
             character.  */
          if (coding->encoder == encode_coding_raw_text
              || coding->encoder == encode_coding_ccl)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              /* For `dos' an extra CR is stored in front of the LF;
                 for any other non-unix eol type the LF itself is
                 replaced by CR.  */
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          int from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C and up to MAX_LOOKUP - 1 following characters,
             reading ahead through P so that SRC is not moved.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          /* An integer result replaces C alone.  A cons result is
             (FROM . TO): the size of FROM is taken as the number of
             source characters replaced, and TO is either one integer
             or a vector of replacement characters.  */
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  /* NOTE(review): on this `break' (and the one
                     below) SRC and POS have already been advanced
                     past C, which has not been stored -- confirm
                     that this is intended.  */
                  if (buf + to_nchars > buf_end)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the remaining source characters covered by the
             translation; the first one was consumed above.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7510
7511
7512 /* Encode the text at CODING->src_object into CODING->dst_object.
7513    CODING->src_object is a buffer or a string.
7514    CODING->dst_object is a buffer or nil.
7515
7516    If CODING->src_object is a buffer, it must be the current buffer.
7517    In this case, if CODING->src_pos is positive, it is a position of
7518    the source text in the buffer, otherwise. the source text is in the
7519    gap area of the buffer, and coding->src_pos specifies the offset of
7520    the text from GPT (which must be the same as PT).  If this is the
7521    same buffer as CODING->dst_object, CODING->src_pos must be
7522    negative and CODING should not have `pre-write-conversion'.
7523
7524    If CODING->src_object is a string, CODING should not have
7525    `pre-write-conversion'.
7526
7527    If CODING->dst_object is a buffer, the encoded data is inserted at
7528    the current point of that buffer.
7529
7530    If CODING->dst_object is nil, the encoded data is placed at the
7531    memory area specified by CODING->destination.  */
7532
7533 static int
7534 encode_coding (coding)
7535      struct coding_system *coding;
7536 {
7537   Lisp_Object attrs;
7538   Lisp_Object translation_table;
7539   int max_lookup;
7540
7541   attrs = CODING_ID_ATTRS (coding->id);
7542   if (coding->encoder == encode_coding_raw_text)
7543     translation_table = Qnil, max_lookup = 0;
7544   else
7545     translation_table = get_translation_table (attrs, 1, &max_lookup);
7546
7547   if (BUFFERP (coding->dst_object))
7548     {
7549       set_buffer_internal (XBUFFER (coding->dst_object));
7550       coding->dst_multibyte
7551         = ! NILP (current_buffer->enable_multibyte_characters);
7552     }
7553
7554   coding->consumed = coding->consumed_char = 0;
7555   coding->produced = coding->produced_char = 0;
7556   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7557   coding->errors = 0;
7558
7559   ALLOC_CONVERSION_WORK_AREA (coding);
7560
7561   do {
7562     coding_set_source (coding);
7563     consume_chars (coding, translation_table, max_lookup);
7564     coding_set_destination (coding);
7565     (*(coding->encoder)) (coding);
7566   } while (coding->consumed_char < coding->src_chars);
7567
7568   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7569     insert_from_gap (coding->produced_char, coding->produced);
7570
7571   return (coding->result);
7572 }
7573
7574
/* Name (or base name) of work buffer for code conversion.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments it on every call, and
   code_conversion_restore sets it back to 0 when the reused buffer is
   released.  */
static int reused_workbuf_in_use;
7587
7588
7589 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
7590    multibyteness of returning buffer.  */
7591
7592 static Lisp_Object
7593 make_conversion_work_buffer (multibyte)
7594      int multibyte;
7595 {
7596   Lisp_Object name, workbuf;
7597   struct buffer *current;
7598
7599   if (reused_workbuf_in_use++)
7600     {
7601       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7602       workbuf = Fget_buffer_create (name);
7603     }
7604   else
7605     {
7606       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7607         Vcode_conversion_reused_workbuf
7608           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7609       workbuf = Vcode_conversion_reused_workbuf;
7610     }
7611   current = current_buffer;
7612   set_buffer_internal (XBUFFER (workbuf));
7613   /* We can't allow modification hooks to run in the work buffer.  For
7614      instance, directory_files_internal assumes that file decoding
7615      doesn't compile new regexps.  */
7616   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7617   Ferase_buffer ();
7618   current_buffer->undo_list = Qt;
7619   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7620   set_buffer_internal (current);
7621   return workbuf;
7622 }
7623
7624
7625 static Lisp_Object
7626 code_conversion_restore (arg)
7627      Lisp_Object arg;
7628 {
7629   Lisp_Object current, workbuf;
7630   struct gcpro gcpro1;
7631
7632   GCPRO1 (arg);
7633   current = XCAR (arg);
7634   workbuf = XCDR (arg);
7635   if (! NILP (workbuf))
7636     {
7637       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7638         reused_workbuf_in_use = 0;
7639       else if (! NILP (Fbuffer_live_p (workbuf)))
7640         Fkill_buffer (workbuf);
7641     }
7642   set_buffer_internal (XBUFFER (current));
7643   UNGCPRO;
7644   return Qnil;
7645 }
7646
7647 Lisp_Object
7648 code_conversion_save (with_work_buf, multibyte)
7649      int with_work_buf, multibyte;
7650 {
7651   Lisp_Object workbuf = Qnil;
7652
7653   if (with_work_buf)
7654     workbuf = make_conversion_work_buffer (multibyte);
7655   record_unwind_protect (code_conversion_restore,
7656                          Fcons (Fcurrent_buffer (), workbuf));
7657   return workbuf;
7658 }
7659
7660 int
7661 decode_coding_gap (coding, chars, bytes)
7662      struct coding_system *coding;
7663      EMACS_INT chars, bytes;
7664 {
7665   int count = specpdl_ptr - specpdl;
7666   Lisp_Object attrs;
7667
7668   code_conversion_save (0, 0);
7669
7670   coding->src_object = Fcurrent_buffer ();
7671   coding->src_chars = chars;
7672   coding->src_bytes = bytes;
7673   coding->src_pos = -chars;
7674   coding->src_pos_byte = -bytes;
7675   coding->src_multibyte = chars < bytes;
7676   coding->dst_object = coding->src_object;
7677   coding->dst_pos = PT;
7678   coding->dst_pos_byte = PT_BYTE;
7679   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7680
7681   if (CODING_REQUIRE_DETECTION (coding))
7682     detect_coding (coding);
7683
7684   coding->mode |= CODING_MODE_LAST_BLOCK;
7685   current_buffer->text->inhibit_shrinking = 1;
7686   decode_coding (coding);
7687   current_buffer->text->inhibit_shrinking = 0;
7688
7689   attrs = CODING_ID_ATTRS (coding->id);
7690   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7691     {
7692       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7693       Lisp_Object val;
7694
7695       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7696       val = call1 (CODING_ATTR_POST_READ (attrs),
7697                    make_number (coding->produced_char));
7698       CHECK_NATNUM (val);
7699       coding->produced_char += Z - prev_Z;
7700       coding->produced += Z_BYTE - prev_Z_BYTE;
7701     }
7702
7703   unbind_to (count, Qnil);
7704   return coding->result;
7705 }
7706
7707 int
7708 encode_coding_gap (coding, chars, bytes)
7709      struct coding_system *coding;
7710      EMACS_INT chars, bytes;
7711 {
7712   int count = specpdl_ptr - specpdl;
7713
7714   code_conversion_save (0, 0);
7715
7716   coding->src_object = Fcurrent_buffer ();
7717   coding->src_chars = chars;
7718   coding->src_bytes = bytes;
7719   coding->src_pos = -chars;
7720   coding->src_pos_byte = -bytes;
7721   coding->src_multibyte = chars < bytes;
7722   coding->dst_object = coding->src_object;
7723   coding->dst_pos = PT;
7724   coding->dst_pos_byte = PT_BYTE;
7725
7726   encode_coding (coding);
7727
7728   unbind_to (count, Qnil);
7729   return coding->result;
7730 }
7731
7732
7733 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7734    SRC_OBJECT into DST_OBJECT by coding context CODING.
7735
7736    SRC_OBJECT is a buffer, a string, or Qnil.
7737
7738    If it is a buffer, the text is at point of the buffer.  FROM and TO
7739    are positions in the buffer.
7740
7741    If it is a string, the text is at the beginning of the string.
7742    FROM and TO are indices to the string.
7743
7744    If it is nil, the text is at coding->source.  FROM and TO are
7745    indices to coding->source.
7746
7747    DST_OBJECT is a buffer, Qt, or Qnil.
7748
7749    If it is a buffer, the decoded text is inserted at point of the
7750    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7751    is deleted.
7752
7753    If it is Qt, a string is made from the decoded text, and
7754    set in CODING->dst_object.
7755
7756    If it is Qnil, the decoded text is stored at CODING->destination.
7757    The caller must allocate CODING->dst_bytes bytes at
7758    CODING->destination by xmalloc.  If the decoded text is longer than
7759    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7760  */
7761
7762 void
7763 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7764                       dst_object)
7765      struct coding_system *coding;
7766      Lisp_Object src_object;
7767      EMACS_INT from, from_byte, to, to_byte;
7768      Lisp_Object dst_object;
7769 {
7770   int count = specpdl_ptr - specpdl;
7771   unsigned char *destination;
7772   EMACS_INT dst_bytes;
7773   EMACS_INT chars = to - from;
7774   EMACS_INT bytes = to_byte - from_byte;
7775   Lisp_Object attrs;
7776   int saved_pt = -1, saved_pt_byte;
7777   int need_marker_adjustment = 0;
7778   Lisp_Object old_deactivate_mark;
7779
7780   old_deactivate_mark = Vdeactivate_mark;
7781
7782   if (NILP (dst_object))
7783     {
7784       destination = coding->destination;
7785       dst_bytes = coding->dst_bytes;
7786     }
7787
7788   coding->src_object = src_object;
7789   coding->src_chars = chars;
7790   coding->src_bytes = bytes;
7791   coding->src_multibyte = chars < bytes;
7792
7793   if (STRINGP (src_object))
7794     {
7795       coding->src_pos = from;
7796       coding->src_pos_byte = from_byte;
7797     }
7798   else if (BUFFERP (src_object))
7799     {
7800       set_buffer_internal (XBUFFER (src_object));
7801       if (from != GPT)
7802         move_gap_both (from, from_byte);
7803       if (EQ (src_object, dst_object))
7804         {
7805           struct Lisp_Marker *tail;
7806
7807           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7808             {
7809               tail->need_adjustment
7810                 = tail->charpos == (tail->insertion_type ? from : to);
7811               need_marker_adjustment |= tail->need_adjustment;
7812             }
7813           saved_pt = PT, saved_pt_byte = PT_BYTE;
7814           TEMP_SET_PT_BOTH (from, from_byte);
7815           current_buffer->text->inhibit_shrinking = 1;
7816           del_range_both (from, from_byte, to, to_byte, 1);
7817           coding->src_pos = -chars;
7818           coding->src_pos_byte = -bytes;
7819         }
7820       else
7821         {
7822           coding->src_pos = from;
7823           coding->src_pos_byte = from_byte;
7824         }
7825     }
7826
7827   if (CODING_REQUIRE_DETECTION (coding))
7828     detect_coding (coding);
7829   attrs = CODING_ID_ATTRS (coding->id);
7830
7831   if (EQ (dst_object, Qt)
7832       || (! NILP (CODING_ATTR_POST_READ (attrs))
7833           && NILP (dst_object)))
7834     {
7835       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7836       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7837       coding->dst_pos = BEG;
7838       coding->dst_pos_byte = BEG_BYTE;
7839     }
7840   else if (BUFFERP (dst_object))
7841     {
7842       code_conversion_save (0, 0);
7843       coding->dst_object = dst_object;
7844       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7845       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7846       coding->dst_multibyte
7847         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7848     }
7849   else
7850     {
7851       code_conversion_save (0, 0);
7852       coding->dst_object = Qnil;
7853       /* Most callers presume this will return a multibyte result, and they
7854          won't use `binary' or `raw-text' anyway, so let's not worry about
7855          CODING_FOR_UNIBYTE.  */
7856       coding->dst_multibyte = 1;
7857     }
7858
7859   decode_coding (coding);
7860
7861   if (BUFFERP (coding->dst_object))
7862     set_buffer_internal (XBUFFER (coding->dst_object));
7863
7864   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7865     {
7866       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7867       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7868       Lisp_Object val;
7869
7870       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7871       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7872               old_deactivate_mark);
7873       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7874                         make_number (coding->produced_char));
7875       UNGCPRO;
7876       CHECK_NATNUM (val);
7877       coding->produced_char += Z - prev_Z;
7878       coding->produced += Z_BYTE - prev_Z_BYTE;
7879     }
7880
7881   if (EQ (dst_object, Qt))
7882     {
7883       coding->dst_object = Fbuffer_string ();
7884     }
7885   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7886     {
7887       set_buffer_internal (XBUFFER (coding->dst_object));
7888       if (dst_bytes < coding->produced)
7889         {
7890           destination = xrealloc (destination, coding->produced);
7891           if (! destination)
7892             {
7893               record_conversion_result (coding,
7894                                         CODING_RESULT_INSUFFICIENT_MEM);
7895               unbind_to (count, Qnil);
7896               return;
7897             }
7898           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7899             move_gap_both (BEGV, BEGV_BYTE);
7900           bcopy (BEGV_ADDR, destination, coding->produced);
7901           coding->destination = destination;
7902         }
7903     }
7904
7905   if (saved_pt >= 0)
7906     {
7907       /* This is the case of:
7908          (BUFFERP (src_object) && EQ (src_object, dst_object))
7909          As we have moved PT while replacing the original buffer
7910          contents, we must recover it now.  */
7911       set_buffer_internal (XBUFFER (src_object));
7912       current_buffer->text->inhibit_shrinking = 0;
7913       if (saved_pt < from)
7914         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7915       else if (saved_pt < from + chars)
7916         TEMP_SET_PT_BOTH (from, from_byte);
7917       else if (! NILP (current_buffer->enable_multibyte_characters))
7918         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7919                           saved_pt_byte + (coding->produced - bytes));
7920       else
7921         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7922                           saved_pt_byte + (coding->produced - bytes));
7923
7924       if (need_marker_adjustment)
7925         {
7926           struct Lisp_Marker *tail;
7927
7928           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7929             if (tail->need_adjustment)
7930               {
7931                 tail->need_adjustment = 0;
7932                 if (tail->insertion_type)
7933                   {
7934                     tail->bytepos = from_byte;
7935                     tail->charpos = from;
7936                   }
7937                 else
7938                   {
7939                     tail->bytepos = from_byte + coding->produced;
7940                     tail->charpos
7941                       = (NILP (current_buffer->enable_multibyte_characters)
7942                          ? tail->bytepos : from + coding->produced_char);
7943                   }
7944               }
7945         }
7946     }
7947
7948   Vdeactivate_mark = old_deactivate_mark;
7949   unbind_to (count, coding->dst_object);
7950 }
7951
7952
     /* Encode the text between FROM and TO of SRC_OBJECT according to
        CODING.  FROM_BYTE and TO_BYTE are the byte positions that
        correspond to the character positions FROM and TO.

        SRC_OBJECT is a string or a buffer; for any other value the
        pre-write path below reads the text from CODING->source.

        DST_OBJECT selects where the result goes:
          - a buffer: the result is produced into that buffer, at FROM
            when it is also SRC_OBJECT (the source text is deleted
            first), otherwise at that buffer's point;
          - t: the result is left in CODING->dst_object as a string;
          - anything else: CODING->dst_object is set to nil and the
            destination is treated as unibyte.

        If the coding system has a pre-write function, the source text
        is first copied into a working buffer, the function is called
        on that buffer, and whatever buffer is current afterwards is
        encoded in full.

        Vdeactivate_mark is preserved across the call.  */
7953 void
7954 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7955                       dst_object)
7956      struct coding_system *coding;
7957      Lisp_Object src_object;
7958      EMACS_INT from, from_byte, to, to_byte;
7959      Lisp_Object dst_object;
7960 {
7961   int count = SPECPDL_INDEX ();
7962   EMACS_INT chars = to - from;
7963   EMACS_INT bytes = to_byte - from_byte;
7964   Lisp_Object attrs;
       /* Point of the buffer that is rewritten in place, or -1 when
          point needs no restoring.  These hold PT and PT_BYTE, so they
          must be EMACS_INT: an `int' would truncate large positions.  */
7965   EMACS_INT saved_pt = -1, saved_pt_byte;
7966   int need_marker_adjustment = 0;
7967   int kill_src_buffer = 0;
7968   Lisp_Object old_deactivate_mark;
7969
7970   old_deactivate_mark = Vdeactivate_mark;
7971
7972   coding->src_object = src_object;
7973   coding->src_chars = chars;
7974   coding->src_bytes = bytes;
       /* More bytes than characters means the source is multibyte.  */
7975   coding->src_multibyte = chars < bytes;
7976
7977   attrs = CODING_ID_ATTRS (coding->id);
7978
7979   if (EQ (src_object, dst_object))
7980     {
           /* In-place conversion: flag the markers of the current buffer
              that sit exactly on an edge of the region (FROM for
              insertion-type markers, TO for the others) so that they can
              be repositioned once the text has been replaced.  */
7981       struct Lisp_Marker *tail;
7982
7983       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7984         {
7985           tail->need_adjustment
7986             = tail->charpos == (tail->insertion_type ? from : to);
7987           need_marker_adjustment |= tail->need_adjustment;
7988         }
7989     }
7990
7991   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7992     {
           /* Copy the source text into a working buffer and let the
              pre-write function operate on that copy.  */
7993       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7994       set_buffer_internal (XBUFFER (coding->src_object));
7995       if (STRINGP (src_object))
7996         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7997       else if (BUFFERP (src_object))
7998         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7999       else
8000         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
8001
8002       if (EQ (src_object, dst_object))
8003         {
               /* The original text has been copied, so delete it from
                  the buffer that will receive the encoded result.  */
8004           set_buffer_internal (XBUFFER (src_object));
8005           saved_pt = PT, saved_pt_byte = PT_BYTE;
8006           del_range_both (from, from_byte, to, to_byte, 1);
8007           set_buffer_internal (XBUFFER (coding->src_object));
8008         }
8009
8010       {
8011         Lisp_Object args[3];
8012         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8013
8014         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8015                 old_deactivate_mark);
8016         args[0] = CODING_ATTR_PRE_WRITE (attrs);
8017         args[1] = make_number (BEG);
8018         args[2] = make_number (Z);
8019         safe_call (3, args);
8020         UNGCPRO;
8021       }
           /* The pre-write function may have left another buffer
              current.  That buffer becomes the source and is killed
              once encoding is done.  */
8022       if (XBUFFER (coding->src_object) != current_buffer)
8023         kill_src_buffer = 1;
8024       coding->src_object = Fcurrent_buffer ();
8025       if (BEG != GPT)
8026         move_gap_both (BEG, BEG_BYTE);
           /* Encode the whole of the (possibly new) current buffer.  */
8027       coding->src_chars = Z - BEG;
8028       coding->src_bytes = Z_BYTE - BEG_BYTE;
8029       coding->src_pos = BEG;
8030       coding->src_pos_byte = BEG_BYTE;
8031       coding->src_multibyte = Z < Z_BYTE;
8032     }
8033   else if (STRINGP (src_object))
8034     {
8035       code_conversion_save (0, 0);
8036       coding->src_pos = from;
8037       coding->src_pos_byte = from_byte;
8038     }
8039   else if (BUFFERP (src_object))
8040     {
8041       code_conversion_save (0, 0);
8042       set_buffer_internal (XBUFFER (src_object));
8043       if (EQ (src_object, dst_object))
8044         {
               /* In-place conversion without a pre-write function: the
                  value returned by del_range_1 for the deleted text
                  becomes the source, read from position 0.  */
8045           saved_pt = PT, saved_pt_byte = PT_BYTE;
8046           coding->src_object = del_range_1 (from, to, 1, 1);
8047           coding->src_pos = 0;
8048           coding->src_pos_byte = 0;
8049         }
8050       else
8051         {
8052           if (from < GPT && to >= GPT)
8053             move_gap_both (from, from_byte);
8054           coding->src_pos = from;
8055           coding->src_pos_byte = from_byte;
8056         }
8057     }
8058   else
8059     code_conversion_save (0, 0);
8060
       /* Set up the destination.  */
8061   if (BUFFERP (dst_object))
8062     {
8063       coding->dst_object = dst_object;
8064       if (EQ (src_object, dst_object))
8065         {
8066           coding->dst_pos = from;
8067           coding->dst_pos_byte = from_byte;
8068         }
8069       else
8070         {
               /* Produce at point of the destination buffer, with its
                  gap moved there.  */
8071           struct buffer *current = current_buffer;
8072
8073           set_buffer_temp (XBUFFER (dst_object));
8074           coding->dst_pos = PT;
8075           coding->dst_pos_byte = PT_BYTE;
8076           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8077           set_buffer_temp (current);
8078         }
8079       coding->dst_multibyte
8080         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8081     }
8082   else if (EQ (dst_object, Qt))
8083     {
           /* Start with a heap buffer of one byte per source character,
              and never less than one byte.  */
8084       coding->dst_object = Qnil;
8085       coding->dst_bytes = coding->src_chars;
8086       if (coding->dst_bytes == 0)
8087         coding->dst_bytes = 1;
8088       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8089       coding->dst_multibyte = 0;
8090     }
8091   else
8092     {
8093       coding->dst_object = Qnil;
8094       coding->dst_multibyte = 0;
8095     }
8096
8097   encode_coding (coding);
8098
8099   if (EQ (dst_object, Qt))
8100     {
           /* Hand the result back as a string.  The heap buffer is
              released once its contents have been copied.  */
8101       if (BUFFERP (coding->dst_object))
8102         coding->dst_object = Fbuffer_string ();
8103       else
8104         {
8105           coding->dst_object
8106             = make_unibyte_string ((char *) coding->destination,
8107                                    coding->produced);
8108           xfree (coding->destination);
8109         }
8110     }
8111
8112   if (saved_pt >= 0)
8113     {
8114       /* This is the case of:
8115          (BUFFERP (src_object) && EQ (src_object, dst_object))
8116          As we have moved PT while replacing the original buffer
8117          contents, we must recover it now.  */
8118       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region is unchanged; point inside the
              region goes to FROM; point after it is shifted by the
              change in size (counted in bytes for a unibyte buffer).  */
8119       if (saved_pt < from)
8120         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8121       else if (saved_pt < from + chars)
8122         TEMP_SET_PT_BOTH (from, from_byte);
8123       else if (! NILP (current_buffer->enable_multibyte_characters))
8124         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8125                           saved_pt_byte + (coding->produced - bytes));
8126       else
8127         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8128                           saved_pt_byte + (coding->produced - bytes));
8129
8130       if (need_marker_adjustment)
8131         {
               /* Put the markers flagged earlier back on the edges of
                  the produced text: insertion-type markers at FROM, the
                  others at its end.  */
8132           struct Lisp_Marker *tail;
8133
8134           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8135             if (tail->need_adjustment)
8136               {
8137                 tail->need_adjustment = 0;
8138                 if (tail->insertion_type)
8139                   {
8140                     tail->bytepos = from_byte;
8141                     tail->charpos = from;
8142                   }
8143                 else
8144                   {
8145                     tail->bytepos = from_byte + coding->produced;
8146                     tail->charpos
8147                       = (NILP (current_buffer->enable_multibyte_characters)
8148                          ? tail->bytepos : from + coding->produced_char);
8149                   }
8150               }
8151         }
8152     }
8153
8154   if (kill_src_buffer)
8155     Fkill_buffer (coding->src_object);
8156
8157   Vdeactivate_mark = old_deactivate_mark;
8158   unbind_to (count, Qnil);
8159 }
8160
8161
     /* Return the name of the coding system registered for the coding
        category that comes first in coding_priorities.  */
8162 Lisp_Object
8163 preferred_coding_system ()
8164 {
8165   int id = coding_categories[coding_priorities[0]].id;
8166
8167   return CODING_ID_NAME (id);
8168 }
8169
8170 \f
8171 #ifdef emacs
8172 /*** 8. Emacs Lisp library functions ***/
8173
8174 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8175        doc: /* Return t if OBJECT is nil or a coding-system.
8176 See the documentation of `define-coding-system' for information
8177 about coding-system objects.  */)
8178      (object)
8179      Lisp_Object object;
8180 {
       /* nil, and anything that already has a coding-system id, is
          accepted outright.  */
8181   if (NILP (object)
8182       || CODING_SYSTEM_ID (object) >= 0)
8183     return Qt;
       /* Otherwise accept only a symbol that carries a non-nil
          `coding-system-define-form' property (the form that
          Fcheck_coding_system evaluates on demand).  */
8184   if (! SYMBOLP (object)
8185       || NILP (Fget (object, Qcoding_system_define_form)))
8186     return Qnil;
8187   return Qt;
8188 }
8189
8190 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8191        Sread_non_nil_coding_system, 1, 1, 0,
8192        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8193      (prompt)
8194      Lisp_Object prompt;
8195 {
8196   Lisp_Object val;
       /* Keep asking until the user enters a non-empty string.  */
8197   do
8198     {
8199       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8200                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8201     }
8202   while (SCHARS (val) == 0);
       /* Return the symbol named by the input.  */
8203   return (Fintern (val, Qnil));
8204 }
8205
8206 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8207        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8208 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8209 Ignores case when completing coding systems (all Emacs coding systems
8210 are lower-case).  */)
8211      (prompt, default_coding_system)
8212      Lisp_Object prompt, default_coding_system;
8213 {
8214   Lisp_Object val;
8215   int count = SPECPDL_INDEX ();
8216
       /* Fcompleting_read is given the default as a string, so a symbol
          is replaced by its name.  */
8217   if (SYMBOLP (default_coding_system))
8218     default_coding_system = SYMBOL_NAME (default_coding_system);
       /* Bind `completion-ignore-case' to t for the duration of the
          read; the binding is undone by unbind_to below.  */
8219   specbind (Qcompletion_ignore_case, Qt);
8220   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8221                           Qt, Qnil, Qcoding_system_history,
8222                           default_coding_system, Qnil);
8223   unbind_to (count, Qnil);
       /* An empty result yields nil; anything else is interned.  */
8224   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8225 }
8226
8227 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8228        1, 1, 0,
8229        doc: /* Check validity of CODING-SYSTEM.
8230 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8231 It is valid if it is nil or a symbol defined as a coding system by the
8232 function `define-coding-system'.  */)
8233   (coding_system)
8234      Lisp_Object coding_system;
8235 {
8236   Lisp_Object define_form;
8237
       /* If the symbol still carries a pending definition form, evaluate
          it now.  The property is cleared before the evaluation --
          presumably so that the form runs only once even if evaluating
          it leads back here (TODO confirm).  */
8238   define_form = Fget (coding_system, Qcoding_system_define_form);
8239   if (! NILP (define_form))
8240     {
8241       Fput (coding_system, Qcoding_system_define_form, Qnil);
8242       safe_eval (define_form);
8243     }
8244   if (!NILP (Fcoding_system_p (coding_system)))
8245     return coding_system;
       /* Not a coding system: signal `coding-system-error'.  */
8246   xsignal1 (Qcoding_system_error, coding_system);
8247 }
8248
8249 \f
8250 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8251    HIGHEST is nonzero, return the coding system of the highest
8252    priority among the detected coding systems.  Otherwise return a
8253    list of detected coding systems sorted by their priorities.  If
8254    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8255    multibyte form but contain only ASCII and eight-bit chars.
8256    Otherwise, the bytes are raw bytes.
8257
8258    CODING-SYSTEM controls the detection as below:
8259
8260    If it is nil, detect both text-format and eol-format.  If the
8261    text-format part of CODING-SYSTEM is already specified
8262    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8263    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8264    detect only text-format.  */
8265
8266 Lisp_Object
8267 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
8268                       coding_system)
8269      const unsigned char *src;
8270      EMACS_INT src_chars, src_bytes;
8271      int highest;
8272      int multibytep;
8273      Lisp_Object coding_system;
8274 {
8275   const unsigned char *src_end = src + src_bytes;
8276   Lisp_Object attrs, eol_type;
8277   Lisp_Object val = Qnil;
8278   struct coding_system coding;
8279   int id;
8280   struct coding_detection_info detect_info;
8281   enum coding_category base_category;
8282   int null_byte_found = 0, eight_bit_found = 0;
8283
       /* A nil CODING_SYSTEM is treated as `undecided'.  */
8284   if (NILP (coding_system))
8285     coding_system = Qundecided;
8286   setup_coding_system (coding_system, &coding);
8287   attrs = CODING_ID_ATTRS (coding.id);
8288   eol_type = CODING_ID_EOL_TYPE (coding.id);
8289   coding_system = CODING_ATTR_BASE_NAME (attrs);
8290
       /* Describe the input for the detector functions.  */
8291   coding.source = src;
8292   coding.src_chars = src_chars;
8293   coding.src_bytes = src_bytes;
8294   coding.src_multibyte = multibytep;
8295   coding.consumed = 0;
8296   coding.mode |= CODING_MODE_LAST_BLOCK;
8297   coding.head_ascii = 0;
8298
8299   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8300
8301   /* At first, detect text-format if necessary.  */
8302   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8303   if (base_category == coding_category_undecided)
8304     {
8305       enum coding_category category;
8306       struct coding_system *this;
8307       int c, i;
8308
8309       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8310       for (; src < src_end; src++)
8311         {
8312           c = *src;
8313           if (c & 0x80)
8314             {
                   /* An 8-bit byte.  Once a null byte has been seen as
                      well, there is no need to scan any further.  */
8315               eight_bit_found = 1;
8316               if (null_byte_found)
8317                 break;
8318             }
8319           else if (c < 0x20)
8320             {
                   /* The ISO-2022 detector is run only while no category
                      has been checked yet.  */
8321               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8322                   && ! inhibit_iso_escape_detection
8323                   && ! detect_info.checked)
8324                 {
8325                   if (detect_coding_iso_2022 (&coding, &detect_info))
8326                     {
8327                       /* We have scanned the whole data.  */
8328                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8329                         {
8330                           /* We didn't find an 8-bit code.  We may
8331                              have found a null-byte, but it's very
8332                              rare that a binary file conform to
8333                              ISO-2022.  */
8334                           src = src_end;
8335                           coding.head_ascii = src - coding.source;
8336                         }
8337                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8338                       break;
8339                     }
8340                 }
8341               else if (! c && !inhibit_null_byte_detection)
8342                 {
8343                   null_byte_found = 1;
8344                   if (eight_bit_found)
8345                     break;
8346                 }
                   /* head_ascii counts the 7-bit bytes that precede the
                      first 8-bit byte.  */
8347               if (! eight_bit_found)
8348                 coding.head_ascii++;
8349             }
8350           else if (! eight_bit_found)
8351             coding.head_ascii++;
8352         }
8353
           /* Consult the per-category results only when something other
              than plain leading ASCII was seen.  */
8354       if (null_byte_found || eight_bit_found
8355           || coding.head_ascii < coding.src_bytes
8356           || detect_info.found)
8357         {
8358           if (coding.head_ascii == coding.src_bytes)
8359             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8360             for (i = 0; i < coding_category_raw_text; i++)
8361               {
8362                 category = coding_priorities[i];
8363                 this = coding_categories + category;
8364                 if (detect_info.found & (1 << category))
8365                   break;
8366               }
8367           else
8368             {
                   /* With a null byte present, every category except the
                      UTF-16 ones is marked as checked and rejected.  */
8369               if (null_byte_found)
8370                 {
8371                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8372                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8373                 }
                   /* Walk the categories in priority order, running each
                      detector that has not been run yet.  When HIGHEST is
                      set, stop at the first category that is found.  */
8374               for (i = 0; i < coding_category_raw_text; i++)
8375                 {
8376                   category = coding_priorities[i];
8377                   this = coding_categories + category;
8378
8379                   if (this->id < 0)
8380                     {
8381                       /* No coding system of this category is defined.  */
8382                       detect_info.rejected |= (1 << category);
8383                     }
8384                   else if (category >= coding_category_raw_text)
8385                     continue;
8386                   else if (detect_info.checked & (1 << category))
8387                     {
8388                       if (highest
8389                           && (detect_info.found & (1 << category)))
8390                         break;
8391                     }
8392                   else if ((*(this->detector)) (&coding, &detect_info)
8393                            && highest
8394                            && (detect_info.found & (1 << category)))
8395                     {
                           /* Resolve the automatic UTF-16 category to the
                              byte order that was actually detected.  */
8396                       if (category == coding_category_utf_16_auto)
8397                         {
8398                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8399                             category = coding_category_utf_16_le;
8400                           else
8401                             category = coding_category_utf_16_be;
8402                         }
8403                       break;
8404                     }
8405                 }
8406             }
8407         }
8408
           /* Turn the detection result into VAL, a list of coding-system
              ids.  */
8409       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8410           || null_byte_found)
8411         {
               /* Everything was rejected, or a null byte was seen:
                  report `no-conversion'.  */
8412           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8413           id = CODING_SYSTEM_ID (Qno_conversion);
8414           val = Fcons (make_number (id), Qnil);
8415         }
8416       else if (! detect_info.rejected && ! detect_info.found)
8417         {
               /* Nothing rejected and nothing found: report the coding
                  system of the `undecided' category.  */
8418           detect_info.found = CATEGORY_MASK_ANY;
8419           id = coding_categories[coding_category_undecided].id;
8420           val = Fcons (make_number (id), Qnil);
8421         }
8422       else if (highest)
8423         {
8424           if (detect_info.found)
8425             {
                   /* CATEGORY and THIS were left by the loops above at
                      the category to report.  */
8426               detect_info.found = 1 << category;
8427               val = Fcons (make_number (this->id), Qnil);
8428             }
8429           else
                 /* Nothing positively found: take the first category, in
                    priority order, that was not rejected.  */
8430             for (i = 0; i < coding_category_raw_text; i++)
8431               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8432                 {
8433                   detect_info.found = 1 << coding_priorities[i];
8434                   id = coding_categories[coding_priorities[i]].id;
8435                   val = Fcons (make_number (id), Qnil);
8436                   break;
8437                 }
8438         }
8439       else
8440         {
               /* Build the full list.  Both loops run from the lowest
                  priority upwards and cons onto the front, so the result
                  is the found categories in priority order, followed by
                  those neither found nor rejected, also in priority
                  order.  */
8441           int mask = detect_info.rejected | detect_info.found;
8442           int found = 0;
8443
8444           for (i = coding_category_raw_text - 1; i >= 0; i--)
8445             {
8446               category = coding_priorities[i];
8447               if (! (mask & (1 << category)))
8448                 {
8449                   found |= 1 << category;
8450                   id = coding_categories[category].id;
8451                   if (id >= 0)
8452                     val = Fcons (make_number (id), val);
8453                 }
8454             }
8455           for (i = coding_category_raw_text - 1; i >= 0; i--)
8456             {
8457               category = coding_priorities[i];
8458               if (detect_info.found & (1 << category))
8459                 {
8460                   id = coding_categories[category].id;
8461                   val = Fcons (make_number (id), val);
8462                 }
8463             }
8464           detect_info.found |= found;
8465         }
8466     }
8467   else if (base_category == coding_category_utf_8_auto)
8468     {
           /* Only decide between UTF-8 with and without signature.  */
8469       if (detect_coding_utf_8 (&coding, &detect_info))
8470         {
8471           struct coding_system *this;
8472
8473           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8474             this = coding_categories + coding_category_utf_8_sig;
8475           else
8476             this = coding_categories + coding_category_utf_8_nosig;
8477           val = Fcons (make_number (this->id), Qnil);
8478         }
8479     }
8480   else if (base_category == coding_category_utf_16_auto)
8481     {
           /* Only decide among the UTF-16 variants.  */
8482       if (detect_coding_utf_16 (&coding, &detect_info))
8483         {
8484           struct coding_system *this;
8485
8486           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8487             this = coding_categories + coding_category_utf_16_le;
8488           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8489             this = coding_categories + coding_category_utf_16_be;
8490           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8491             this = coding_categories + coding_category_utf_16_be_nosig;
8492           else
8493             this = coding_categories + coding_category_utf_16_le_nosig;
8494           val = Fcons (make_number (this->id), Qnil);
8495         }
8496     }
8497   else
8498     {
           /* The text format is already specified; report it as is.  */
8499       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8500       val = Fcons (make_number (coding.id), Qnil);
8501     }
8502
8503   /* Then, detect eol-format if necessary.  */
8504   {
         /* Each of these stays -1 when the corresponding end-of-line
            format was not examined.  */
8505     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8506     Lisp_Object tail;
8507
8508     if (VECTORP (eol_type))
8509       {
             /* The requested coding system does not fix the eol-format,
                so detect it for each family of categories that was
                found.  */
8510         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8511           {
8512             if (null_byte_found)
8513               normal_eol = EOL_SEEN_LF;
8514             else
8515               normal_eol = detect_eol (coding.source, src_bytes,
8516                                        coding_category_raw_text);
8517           }
8518         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8519                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8520           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8521                                       coding_category_utf_16_be);
8522         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8523                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8524           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8525                                       coding_category_utf_16_le);
8526       }
8527     else
8528       {
             /* The eol-format is given by the requested coding system.  */
8529         if (EQ (eol_type, Qunix))
8530           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8531         else if (EQ (eol_type, Qdos))
8532           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8533         else
8534           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8535       }
8536
         /* Replace each coding-system id in VAL by a coding-system name.
            When the coding system's eol type is a vector and exactly one
            eol-format was seen, use the matching element of that vector
            (0 for LF, 1 for CRLF, 2 for CR).  */
8537     for (tail = val; CONSP (tail); tail = XCDR (tail))
8538       {
8539         enum coding_category category;
8540         int this_eol;
8541
8542         id = XINT (XCAR (tail));
8543         attrs = CODING_ID_ATTRS (id);
8544         category = XINT (CODING_ATTR_CATEGORY (attrs));
8545         eol_type = CODING_ID_EOL_TYPE (id);
8546         if (VECTORP (eol_type))
8547           {
8548             if (category == coding_category_utf_16_be
8549                 || category == coding_category_utf_16_be_nosig)
8550               this_eol = utf_16_be_eol;
8551             else if (category == coding_category_utf_16_le
8552                      || category == coding_category_utf_16_le_nosig)
8553               this_eol = utf_16_le_eol;
8554             else
8555               this_eol = normal_eol;
8556
8557             if (this_eol == EOL_SEEN_LF)
8558               XSETCAR (tail, AREF (eol_type, 0));
8559             else if (this_eol == EOL_SEEN_CRLF)
8560               XSETCAR (tail, AREF (eol_type, 1));
8561             else if (this_eol == EOL_SEEN_CR)
8562               XSETCAR (tail, AREF (eol_type, 2));
8563             else
8564               XSETCAR (tail, CODING_ID_NAME (id));
8565           }
8566         else
8567           XSETCAR (tail, CODING_ID_NAME (id));
8568       }
8569   }
8570
       /* With HIGHEST, return just the first element (or nil when the
          list is empty); otherwise return the whole list.  */
8571   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8572 }
8573
8574
8575 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8576        2, 3, 0,
8577        doc: /* Detect coding system of the text in the region between START and END.
8578 Return a list of possible coding systems ordered by priority.
8579 The coding systems to try and their priorities follows what
8580 the function `coding-system-priority-list' (which see) returns.
8581
8582 If only ASCII characters are found (except for such ISO-2022 control
8583 characters as ESC), it returns a list of single element `undecided'
8584 or its subsidiary coding system according to a detected end-of-line
8585 format.
8586
8587 If optional argument HIGHEST is non-nil, return the coding system of
8588 highest priority.  */)
8589      (start, end, highest)
8590      Lisp_Object start, end, highest;
8591 {
       /* Buffer positions are EMACS_INT, which is what
          detect_coding_system takes for its sizes; an `int' would
          truncate positions that do not fit in one.  */
8592   EMACS_INT from, to;
8593   EMACS_INT from_byte, to_byte;
8594
8595   CHECK_NUMBER_COERCE_MARKER (start);
8596   CHECK_NUMBER_COERCE_MARKER (end);
8597
8598   validate_region (&start, &end);
8599   from = XINT (start), to = XINT (end);
8600   from_byte = CHAR_TO_BYTE (from);
8601   to_byte = CHAR_TO_BYTE (to);
8602
       /* If the region straddles the gap, move the gap to the end of the
          region so that the bytes handed to detect_coding_system are
          contiguous in memory.  */
8603   if (from < GPT && to >= GPT)
8604     move_gap_both (to, to_byte);
8605
       /* Pass nil as the coding system: detect_coding_system then works
          from `undecided'.  */
8606   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8607                                to - from, to_byte - from_byte,
8608                                !NILP (highest),
8609                                !NILP (current_buffer
8610                                       ->enable_multibyte_characters),
8611                                Qnil);
8612 }
8613
8614 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8615        1, 2, 0,
8616        doc: /* Detect coding system of the text in STRING.
8617 Return a list of possible coding systems ordered by priority.
8618 The coding systems to try and their priorities follows what
8619 the function `coding-system-priority-list' (which see) returns.
8620
8621 If only ASCII characters are found (except for such ISO-2022 control
8622 characters as ESC), it returns a list of single element `undecided'
8623 or its subsidiary coding system according to a detected end-of-line
8624 format.
8625
8626 If optional argument HIGHEST is non-nil, return the coding system of
8627 highest priority.  */)
8628      (string, highest)
8629      Lisp_Object string, highest;
8630 {
8631   CHECK_STRING (string);
8632
       /* Run the detection over the whole string data.  Passing nil as
          the coding system makes detect_coding_system work from
          `undecided'.  */
8633   return detect_coding_system (SDATA (string),
8634                                SCHARS (string), SBYTES (string),
8635                                !NILP (highest), STRING_MULTIBYTE (string),
8636                                Qnil);
8637 }
8638
8639
     /* Return nonzero if character C belongs to at least one of the
        charsets in the charset list of ATTRS, a coding-system attribute
        vector.  If ATTRS has a translation table, C is translated
        through it before the charsets are tried.  */
8640 static INLINE int
8641 char_encodable_p (c, attrs)
8642      int c;
8643      Lisp_Object attrs;
8644 {
8645   Lisp_Object tail;
8646   struct charset *charset;
8647   Lisp_Object translation_table;
8648
8649   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8650   if (! NILP (translation_table))
8651     c = translate_char (translation_table, c);
       /* Stop at the first charset that contains C.  TAIL is then still
          a cons; it is nil only when the whole list was exhausted.  */
8652   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8653        CONSP (tail); tail = XCDR (tail))
8654     {
8655       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8656       if (CHAR_CHARSET_P (c, charset))
8657         break;
8658     }
8659   return (! NILP (tail));
8660 }
8661
8662
8663 /* Return a list of coding systems that safely encode the text between
8664    START and END.  If EXCLUDE is non-nil, it is a list of coding
8665    systems not to check.  The returned list doesn't contain any such
8666    coding systems.  In any case, if the text contains only ASCII or is
8667    unibyte, return t.  */
8668
8669 DEFUN ("find-coding-systems-region-internal",
8670        Ffind_coding_systems_region_internal,
8671        Sfind_coding_systems_region_internal, 2, 3, 0,
8672        doc: /* Internal use only.  */)
8673      (start, end, exclude)
8674      Lisp_Object start, end, exclude;
8675 {
8676   Lisp_Object coding_attrs_list, safe_codings;
8677   EMACS_INT start_byte, end_byte;
8678   const unsigned char *p, *pbeg, *pend;
8679   int c;
8680   Lisp_Object tail, elt, work_table;
8681
8682   if (STRINGP (start))
8683     {
           /* START is the string to check.  A unibyte string, or one
              with as many bytes as characters, needs no further
              checking.  */
8684       if (!STRING_MULTIBYTE (start)
8685           || SCHARS (start) == SBYTES (start))
8686         return Qt;
8687       start_byte = 0;
8688       end_byte = SBYTES (start);
8689     }
8690   else
8691     {
8692       CHECK_NUMBER_COERCE_MARKER (start);
8693       CHECK_NUMBER_COERCE_MARKER (end);
8694       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8695         args_out_of_range (start, end);
8696       if (NILP (current_buffer->enable_multibyte_characters))
8697         return Qt;
8698       start_byte = CHAR_TO_BYTE (XINT (start));
8699       end_byte = CHAR_TO_BYTE (XINT (end));
           /* As many bytes as characters: nothing more to check.  */
8700       if (XINT (end) - XINT (start) == end_byte - start_byte)
8701         return Qt;
8702
           /* If the gap lies inside the region, move it to whichever end
              of the region is nearer.  */
8703       if (XINT (start) < GPT && XINT (end) > GPT)
8704         {
8705           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8706             move_gap_both (XINT (start), start_byte);
8707           else
8708             move_gap_both (XINT (end), end_byte);
8709         }
8710     }
8711
       /* Collect the attribute vectors of the candidate coding systems:
          those not listed in EXCLUDE that are base coding systems (their
          own base name) and are not of type `undecided'.  */
8712   coding_attrs_list = Qnil;
8713   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8714     if (NILP (exclude)
8715         || NILP (Fmemq (XCAR (tail), exclude)))
8716       {
8717         Lisp_Object attrs;
8718
8719         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8720         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8721             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8722           {
                 /* Store the translation table in the attribute vector
                    itself -- presumably the slot that char_encodable_p
                    reads through CODING_ATTR_TRANS_TBL (TODO confirm).  */
8723             ASET (attrs, coding_attr_trans_tbl,
8724                   get_translation_table (attrs, 1, NULL));
8725             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8726           }
8727       }
8728
8729   if (STRINGP (start))
8730     p = pbeg = SDATA (start);
8731   else
8732     p = pbeg = BYTE_POS_ADDR (start_byte);
8733   pend = p + (end_byte - start_byte);
8734
       /* Trim the ASCII bytes at both ends of the text.  */
8735   while (p < pend && ASCII_BYTE_P (*p)) p++;
8736   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8737
       /* WORK_TABLE records the characters that were already checked.  */
8738   work_table = Fmake_char_table (Qnil, Qnil);
8739   while (p < pend)
8740     {
8741       if (ASCII_BYTE_P (*p))
8742         p++;
8743       else
8744         {
8745           c = STRING_CHAR_ADVANCE (p);
8746           if (!NILP (char_table_ref (work_table, c)))
8747             /* This character was already checked.  Ignore it.  */
8748             continue;
8749
8750           charset_map_loaded = 0;
               /* Drop from the list, in place, every coding system that
                  cannot encode C.  A cell is removed by copying its
                  successor into it.  The last cell has no successor, so
                  its car is set to nil instead; such cells are skipped
                  here and when the result is built.  */
8751           for (tail = coding_attrs_list; CONSP (tail);)
8752             {
8753               elt = XCAR (tail);
8754               if (NILP (elt))
8755                 tail = XCDR (tail);
8756               else if (char_encodable_p (c, elt))
8757                 tail = XCDR (tail);
8758               else if (CONSP (XCDR (tail)))
8759                 {
8760                   XSETCAR (tail, XCAR (XCDR (tail)));
8761                   XSETCDR (tail, XCDR (XCDR (tail)));
8762                 }
8763               else
8764                 {
8765                   XSETCAR (tail, Qnil);
8766                   tail = XCDR (tail);
8767                 }
8768             }
               /* charset_map_loaded was set during the checks above, so
                  recompute the pointers from their offsets -- presumably
                  because the text may have been relocated (TODO
                  confirm).  */
8769           if (charset_map_loaded)
8770             {
8771               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8772
8773               if (STRINGP (start))
8774                 pbeg = SDATA (start);
8775               else
8776                 pbeg = BYTE_POS_ADDR (start_byte);
8777               p = pbeg + p_offset;
8778               pend = pbeg + pend_offset;
8779             }
8780           char_table_set (work_table, c, Qt);
8781         }
8782     }
8783
       /* `raw-text' and `no-conversion' are always part of the result.
          The base names of the surviving coding systems are put in front
          of them.  */
8784   safe_codings = list2 (Qraw_text, Qno_conversion);
8785   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8786     if (! NILP (XCAR (tail)))
8787       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8788
8789   return safe_codings;
8790 }
8791
8792
8793 DEFUN ("unencodable-char-position", Funencodable_char_position,
8794        Sunencodable_char_position, 3, 5, 0,
8795        doc: /*
8796 Return position of first un-encodable character in a region.
8797 START and END specify the region and CODING-SYSTEM specifies the
8798 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8799
8800 If optional 4th argument COUNT is non-nil, it specifies at most how
8801 many un-encodable characters to search.  In this case, the value is a
8802 list of positions.
8803
8804 If optional 5th argument STRING is non-nil, it is a string to search
8805 for un-encodable characters.  In that case, START and END are indexes
8806 to the string.  */)
8807      (start, end, coding_system, count, string)
8808      Lisp_Object start, end, coding_system, count, string;
8809 {
8810   int n;
8811   struct coding_system coding;
8812   Lisp_Object attrs, charset_list, translation_table;
8813   Lisp_Object positions;
8814   int from, to;
8815   const unsigned char *p, *stop, *pend;
8816   int ascii_compatible;
8817
8818   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8819   attrs = CODING_ID_ATTRS (coding.id);
       /* A coding system of type raw-text is reported as having no
          un-encodable character at all.  */
8820   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8821     return Qnil;
8822   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8823   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8824   translation_table = get_translation_table (attrs, 1, NULL);
8825
8826   if (NILP (string))
8827     {
           /* Search the current buffer between START and END.  */
8828       validate_region (&start, &end);
8829       from = XINT (start);
8830       to = XINT (end);
           /* Nothing to report for a unibyte buffer, nor when the coding
              system is ASCII compatible and the region has exactly as
              many bytes as characters.  */
8831       if (NILP (current_buffer->enable_multibyte_characters)
8832           || (ascii_compatible
8833               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8834         return Qnil;
8835       p = CHAR_POS_ADDR (from);
8836       pend = CHAR_POS_ADDR (to);
           /* If the region straddles the gap, scan only up to the gap
              first; the loop below then resumes at GAP_END_ADDR.  */
8837       if (from < GPT && to >= GPT)
8838         stop = GPT_ADDR;
8839       else
8840         stop = pend;
8841     }
8842   else
8843     {
           /* Search STRING; START and END are character indices into it.  */
8844       CHECK_STRING (string);
8845       CHECK_NATNUM (start);
8846       CHECK_NATNUM (end);
8847       from = XINT (start);
8848       to = XINT (end);
8849       if (from > to
8850           || to > SCHARS (string))
8851         args_out_of_range_3 (string, start, end);
8852       if (! STRING_MULTIBYTE (string))
8853         return Qnil;
8854       p = SDATA (string) + string_char_to_byte (string, from);
8855       stop = pend = SDATA (string) + string_char_to_byte (string, to);
           /* Same shortcut as for buffers: one byte per character.  */
8856       if (ascii_compatible && (to - from) == (pend - p))
8857         return Qnil;
8858     }
8859
       /* N is the number of positions still to collect; without COUNT
          only the first one is wanted.
          NOTE(review): a COUNT of 0 makes N go to -1 on the first hit, so
          the `n == 0' test never fires and the whole range is scanned --
          confirm that this is intended.  */
8860   if (NILP (count))
8861     n = 1;
8862   else
8863     {
8864       CHECK_NATNUM (count);
8865       n = XINT (count);
8866     }
8867
8868   positions = Qnil;
8869   while (1)
8870     {
8871       int c;
8872
           /* Skip a run of ASCII bytes; FROM advances in step with P so
              that it stays the character position of P.  */
8873       if (ascii_compatible)
8874         while (p < stop && ASCII_BYTE_P (*p))
8875           p++, from++;
8876       if (p >= stop)
8877         {
               /* End of the current segment: either everything has been
                  scanned, or continue with the text after the gap.  */
8878           if (p >= pend)
8879             break;
8880           stop = pend;
8881           p = GAP_END_ADDR;
8882         }
8883
8884       c = STRING_CHAR_ADVANCE (p);
           /* Record FROM when char_charset finds no charset of
              CHARSET_LIST for the translated character.  */
8885       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8886           && ! char_charset (translate_char (translation_table, c),
8887                              charset_list, NULL))
8888         {
8889           positions = Fcons (make_number (from), positions);
8890           n--;
8891           if (n == 0)
8892             break;
8893         }
8894
8895       from++;
8896     }
8897
       /* POSITIONS was consed up in reverse order.  */
8898   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8899 }
8900
8901
8902 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8903        Scheck_coding_systems_region, 3, 3, 0,
8904        doc: /* Check if the region is encodable by coding systems.
8905
8906 START and END are buffer positions specifying the region.
8907 CODING-SYSTEM-LIST is a list of coding systems to check.
8908
8909 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8910 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8911 whole region, POS0, POS1, ... are buffer positions where non-encodable
8912 characters are found.
8913
8914 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8915 value is nil.
8916
8917 START may be a string.  In that case, check if the string is
8918 encodable, and the value contains indices to the string instead of
8919 buffer positions.  END is ignored.
8920
8921 If the current buffer (or START if it is a string) is unibyte, the value
8922 is nil.  */)
8923      (start, end, coding_system_list)
8924      Lisp_Object start, end, coding_system_list;
8925 {
8926   Lisp_Object list;
8927   EMACS_INT start_byte, end_byte;
8928   int pos;
8929   const unsigned char *p, *pbeg, *pend;
8930   int c;
8931   Lisp_Object tail, elt, attrs, spec;
8932
8933   if (STRINGP (start))
8934     {
           /* A unibyte string, or a multibyte one with one byte per
              character, needs no check.  */
8935       if (!STRING_MULTIBYTE (start)
8936           || SCHARS (start) == SBYTES (start))
8937         return Qnil;
8938       start_byte = 0;
8939       end_byte = SBYTES (start);
8940       pos = 0;
8941     }
8942   else
8943     {
8944       CHECK_NUMBER_COERCE_MARKER (start);
8945       CHECK_NUMBER_COERCE_MARKER (end);
8946       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8947         args_out_of_range (start, end);
8948       if (NILP (current_buffer->enable_multibyte_characters))
8949         return Qnil;
8950       start_byte = CHAR_TO_BYTE (XINT (start));
8951       end_byte = CHAR_TO_BYTE (XINT (end));
           /* One byte per character: nothing to check.  */
8952       if (XINT (end) - XINT (start) == end_byte - start_byte)
8953         return Qnil;
8954
           /* Make the region contiguous in memory by moving the gap to
              whichever end of the region is nearer.  */
8955       if (XINT (start) < GPT && XINT (end) > GPT)
8956         {
8957           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8958             move_gap_both (XINT (start), start_byte);
8959           else
8960             move_gap_both (XINT (end), end_byte);
8961         }
8962       pos = XINT (start);
8963     }
8964
       /* Build a work list of (CODING-SYSTEM ATTRS) entries; the positions
          of non-encodable characters are pushed after ATTRS below.  */
8965   list = Qnil;
8966   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8967     {
8968       elt = XCAR (tail);
           /* Validate ELT before indexing its spec, as the other users of
              coding-system specs in this file do, instead of applying AREF
              to whatever CODING_SYSTEM_SPEC returns for a non coding
              system.  */
           CHECK_CODING_SYSTEM_GET_SPEC (elt, spec);
8969       attrs = AREF (spec, 0);
8970       ASET (attrs, coding_attr_trans_tbl,
8971             get_translation_table (attrs, 1, NULL));
8972       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8973     }
8974
8975   if (STRINGP (start))
8976     p = pbeg = SDATA (start);
8977   else
8978     p = pbeg = BYTE_POS_ADDR (start_byte);
8979   pend = p + (end_byte - start_byte);
8980
       /* Trim leading and trailing ASCII bytes; POS follows P.  */
8981   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8982   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8983
8984   while (p < pend)
8985     {
8986       if (ASCII_BYTE_P (*p))
8987         p++;
8988       else
8989         {
8990           c = STRING_CHAR_ADVANCE (p);
8991
8992           charset_map_loaded = 0;
8993           for (tail = list; CONSP (tail); tail = XCDR (tail))
8994             {
8995               elt = XCDR (XCAR (tail));
8996               if (! char_encodable_p (c, XCAR (elt)))
8997                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8998             }
               /* charset_map_loaded was set while checking C; the text
                  address is recomputed from the saved offsets in case
                  the text was relocated.  */
8999           if (charset_map_loaded)
9000             {
9001               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
9002
9003               if (STRINGP (start))
9004                 pbeg = SDATA (start);
9005               else
9006                 pbeg = BYTE_POS_ADDR (start_byte);
9007               p = pbeg + p_offset;
9008               pend = pbeg + pend_offset;
9009             }
9010         }
9011       pos++;
9012     }
9013
       /* Keep only the coding systems that collected at least one
          position, and put each position list in increasing order.  */
9014   tail = list;
9015   list = Qnil;
9016   for (; CONSP (tail); tail = XCDR (tail))
9017     {
9018       elt = XCAR (tail);
9019       if (CONSP (XCDR (XCDR (elt))))
9020         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9021                       list);
9022     }
9023
9024   return list;
9025 }
9026
9027
     /* Encode (ENCODEP nonzero) or decode (ENCODEP zero) the text between
        START and END of the current buffer according to CODING_SYSTEM;
        nil CODING_SYSTEM means `no-conversion'.

        DST_OBJECT nil means the current buffer is also the destination;
        otherwise it must be t or a buffer.

        Unless NORECORD is nonzero, store the name of the coding system
        actually used in Vlast_coding_system_used.

        Return the number of produced characters if the destination is a
        buffer, otherwise the destination object of the conversion.  */
9028 Lisp_Object
9029 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
9030      Lisp_Object start, end, coding_system, dst_object;
9031      int encodep, norecord;
9032 {
9033   struct coding_system coding;
9034   EMACS_INT from, from_byte, to, to_byte;
9035   Lisp_Object src_object;
9036
9037   CHECK_NUMBER_COERCE_MARKER (start);
9038   CHECK_NUMBER_COERCE_MARKER (end);
9039   if (NILP (coding_system))
9040     coding_system = Qno_conversion;
9041   else
9042     CHECK_CODING_SYSTEM (coding_system);
       /* The source is always the current buffer.  */
9043   src_object = Fcurrent_buffer ();
9044   if (NILP (dst_object))
9045     dst_object = src_object;
9046   else if (! EQ (dst_object, Qt))
9047     CHECK_BUFFER (dst_object);
9048
9049   validate_region (&start, &end);
9050   from = XFASTINT (start);
9051   from_byte = CHAR_TO_BYTE (from);
9052   to = XFASTINT (end);
9053   to_byte = CHAR_TO_BYTE (to);
9054
9055   setup_coding_system (coding_system, &coding);
       /* The whole region is handed over in one go, so it is also the
          last block of the conversion.  */
9056   coding.mode |= CODING_MODE_LAST_BLOCK;
9057
9058   if (encodep)
9059     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9060                           dst_object);
9061   else
9062     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9063                           dst_object);
9064   if (! norecord)
9065     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9066
9067   return (BUFFERP (dst_object)
9068           ? make_number (coding.produced_char)
9069           : coding.dst_object);
9070 }
9071
9072
9073 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9074        3, 4, "r\nzCoding system: ",
9075        doc: /* Decode the current region from the specified coding system.
9076 When called from a program, takes four arguments:
9077         START, END, CODING-SYSTEM, and DESTINATION.
9078 START and END are buffer positions.
9079
9080 Optional 4th argument DESTINATION specifies where the decoded text goes.
9081 If nil, the region between START and END is replaced by the decoded text.
9082 If buffer, the decoded text is inserted in that buffer after point (point
9083 does not move).
9084 In those cases, the length of the decoded text is returned.
9085 If DESTINATION is t, the decoded text is returned.
9086
9087 This function sets `last-coding-system-used' to the precise coding system
9088 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9089 not fully specified.)  */)
9090      (start, end, coding_system, destination)
9091      Lisp_Object start, end, coding_system, destination;
9092 {
       /* Decode, and let code_convert_region record the coding system
          that was used.  */
       const int encodep = 0, norecord = 0;

       return code_convert_region (start, end, coding_system, destination,
                                   encodep, norecord);
9094 }
9095
9096 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9097        3, 4, "r\nzCoding system: ",
9098        doc: /* Encode the current region by specified coding system.
9099 When called from a program, takes four arguments:
9100         START, END, CODING-SYSTEM and DESTINATION.
9101 START and END are buffer positions.
9102
9103 Optional 4th argument DESTINATION specifies where the encoded text goes.
9104 If nil, the region between START and END is replaced by the encoded text.
9105 If buffer, the encoded text is inserted in that buffer after point (point
9106 does not move).
9107 In those cases, the length of the encoded text is returned.
9108 If DESTINATION is t, the encoded text is returned.
9109
9110 This function sets `last-coding-system-used' to the precise coding system
9111 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9112 not fully specified.)  */)
9113      (start, end, coding_system, destination)
9114      Lisp_Object start, end, coding_system, destination;
9115 {
       /* Encode, and let code_convert_region record the coding system
          that was used.  */
       const int encodep = 1, norecord = 0;

       return code_convert_region (start, end, coding_system, destination,
                                   encodep, norecord);
9117 }
9118
     /* Encode (ENCODEP nonzero) or decode (ENCODEP zero) STRING according
        to CODING_SYSTEM; nil CODING_SYSTEM means `no-conversion'.

        DST_OBJECT must be nil, t or a buffer.  If it is a buffer, the
        number of produced characters is returned; otherwise the
        destination object of the conversion is returned.

        If both CODING_SYSTEM and DST_OBJECT are nil, nothing needs to be
        converted: return STRING itself when NOCOPY is nonzero, and a
        fresh copy of it otherwise.

        Unless NORECORD is nonzero, store the name of the coding system
        actually used in Vlast_coding_system_used.  */
9119 Lisp_Object
9120 code_convert_string (string, coding_system, dst_object,
9121                      encodep, nocopy, norecord)
9122      Lisp_Object string, coding_system, dst_object;
9123      int encodep, nocopy, norecord;
9124 {
9125   struct coding_system coding;
9126   EMACS_INT chars, bytes;
9127
9128   CHECK_STRING (string);
9129   if (NILP (coding_system))
9130     {
9131       if (! norecord)
9132         Vlast_coding_system_used = Qno_conversion;
           /* Trivial conversion.  NOCOPY means the caller accepts STRING
              itself as the result; only otherwise is a copy needed.  */
9133       if (NILP (dst_object))
9134         return (nocopy ? string : Fcopy_sequence (string));
9135     }
9136
9137   if (NILP (coding_system))
9138     coding_system = Qno_conversion;
9139   else
9140     CHECK_CODING_SYSTEM (coding_system);
9141   if (NILP (dst_object))
9142     dst_object = Qt;
9143   else if (! EQ (dst_object, Qt))
9144     CHECK_BUFFER (dst_object);
9145
9146   setup_coding_system (coding_system, &coding);
       /* STRING is converted as a whole, so it is also the last block.  */
9147   coding.mode |= CODING_MODE_LAST_BLOCK;
9148   chars = SCHARS (string);
9149   bytes = SBYTES (string);
9150   if (encodep)
9151     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9152   else
9153     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9154   if (! norecord)
9155     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9156
9157   return (BUFFERP (dst_object)
9158           ? make_number (coding.produced_char)
9159           : coding.dst_object);
9160 }
9161
9162
9163 /* Convert STRING with CODING_SYSTEM: encode it if ENCODEP is
        nonzero, decode it otherwise.  The destination is t, no copy
        shortcut is requested, and Vlast_coding_system_used is left
        alone.

        Only the macros DECODE_FILE and ENCODE_FILE call this function,
        which is why character composition is ignored.  */
9168
9169 Lisp_Object
9170 code_convert_string_norecord (string, coding_system, encodep)
9171      Lisp_Object string, coding_system;
9172      int encodep;
9173 {
       const int nocopy = 0, norecord = 1;

       return code_convert_string (string, coding_system, Qt,
                                   encodep, nocopy, norecord);
9175 }
9176
9177
9178 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9179        2, 4, 0,
9180        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9181
9182 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9183 if the decoding operation is trivial.
9184
9185 Optional fourth arg BUFFER non-nil means that the decoded text is
9186 inserted in that buffer after point (point does not move).  In this
9187 case, the return value is the length of the decoded text.
9188
9189 This function sets `last-coding-system-used' to the precise coding system
9190 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9191 not fully specified.)  */)
9192      (string, coding_system, nocopy, buffer)
9193      Lisp_Object string, coding_system, nocopy, buffer;
9194 {
       /* Decode, and record the coding system that was used.  */
       const int encodep = 0, norecord = 0;
       int share_ok = ! NILP (nocopy);

       return code_convert_string (string, coding_system, buffer,
                                   encodep, share_ok, norecord);
9197 }
9198
9199 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9200        2, 4, 0,
9201        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9202
9203 Optional third arg NOCOPY non-nil means it is OK to return STRING
9204 itself if the encoding operation is trivial.
9205
9206 Optional fourth arg BUFFER non-nil means that the encoded text is
9207 inserted in that buffer after point (point does not move).  In this
9208 case, the return value is the length of the encoded text.
9209
9210 This function sets `last-coding-system-used' to the precise coding system
9211 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9212 not fully specified.)  */)
9213      (string, coding_system, nocopy, buffer)
9214      Lisp_Object string, coding_system, nocopy, buffer;
9215 {
       /* The last argument is NORECORD.  It must be 0 so that
          `last-coding-system-used' is set as documented above, exactly
          as `decode-coding-string' does.  */
9216   return code_convert_string (string, coding_system, buffer,
9217                               1, ! NILP (nocopy), 0);
9218 }
9219
9220 \f
9221 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9222        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9223 Return the corresponding character.  */)
9224      (code)
9225      Lisp_Object code;
9226 {
9227   Lisp_Object spec, attrs, val;
9228   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9229   int c;
9230
9231   CHECK_NATNUM (code);
9232   c = XFASTINT (code);
9233   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9234   attrs = AREF (spec, 0);
9235
       /* An ASCII code decodes to itself in an ASCII compatible coding
          system.  */
9236   if (ASCII_BYTE_P (c)
9237       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9238     return code;
9239
       /* The charset list of the SJIS coding system is taken to hold the
          roman, kana and kanji charsets, in this order.  */
9240   val = CODING_ATTR_CHARSET_LIST (attrs);
9241   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9242   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9243   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9244
9245   if (c <= 0x7F)
9246     charset = charset_roman;
       /* Single-byte kana.  The upper bound is inclusive: 0xDF is the last
          code of the single-byte kana area and must not fall through to
          the two-byte branch, which would reject it.  */
9247   else if (c >= 0xA0 && c <= 0xDF)
9248     {
9249       charset = charset_kana;
9250       c -= 0x80;
9251     }
9252   else
9253     {
           /* Two-byte code: S1 is the lead byte, S2 the trail byte.  */
9254       int s1 = c >> 8, s2 = c & 0xFF;
9255
           /* Report the integer value of CODE; CODE itself is a
              Lisp_Object and must not be handed to `%d'.  */
9256       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9257           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9258         error ("Invalid code: %d", XFASTINT (code));
9259       SJIS_TO_JIS (c);
9260       charset = charset_kanji;
9261     }
9262   c = DECODE_CHAR (charset, c);
       /* C has been overwritten by now, so take the value from CODE.  */
9263   if (c < 0)
9264     error ("Invalid code: %d", XFASTINT (code));
9265   return make_number (c);
9266 }
9267
9268
9269 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9270        doc: /* Encode a Japanese character CH to shift_jis encoding.
9271 Return the corresponding code in SJIS.  */)
9272      (ch)
9273     Lisp_Object ch;
9274 {
9275   Lisp_Object spec, attrs, charset_list;
9276   int c;
9277   struct charset *charset;
9278   unsigned code;
9279
9280   CHECK_CHARACTER (ch);
9281   c = XFASTINT (ch);
9282   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9283   attrs = AREF (spec, 0);
9284
       /* An ASCII character encodes to itself in an ASCII compatible
          coding system.  */
9285   if (ASCII_CHAR_P (c)
9286       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9287     return ch;
9288
9289   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9290   charset = char_charset (c, charset_list, &code);
       /* char_charset can return a null pointer when no charset of
          CHARSET_LIST contains C (`unencodable-char-position' tests for
          exactly that), so CHARSET must be checked before it is handed
          to CHARSET_INVALID_CODE.  */
9291   if (! charset || code == CHARSET_INVALID_CODE (charset))
9292     error ("Can't encode by shift_jis encoding: %d", c);
9293   JIS_TO_SJIS (code);
9294
9295   return make_number (code);
9296 }
9297
9298 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9299        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9300 Return the corresponding character.  */)
9301      (code)
9302      Lisp_Object code;
9303 {
9304   Lisp_Object spec, attrs, val;
9305   struct charset *charset_roman, *charset_big5, *charset;
9306   int c;
9307
9308   CHECK_NATNUM (code);
9309   c = XFASTINT (code);
9310   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9311   attrs = AREF (spec, 0);
9312
       /* An ASCII code decodes to itself in an ASCII compatible coding
          system.  */
9313   if (ASCII_BYTE_P (c)
9314       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9315     return code;
9316
       /* The charset list of the Big5 coding system is taken to hold the
          roman charset followed by the Big5 charset.  */
9317   val = CODING_ATTR_CHARSET_LIST (attrs);
9318   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9319   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9320
9321   if (c <= 0x7F)
9322     charset = charset_roman;
9323   else
9324     {
           /* B1 is the lead byte, B2 the trail byte.  B2 must keep all
              eight bits: the range checks below compare it against
              0xA1..0xFE, which a 7-bit mask could never reach and would
              instead fold onto 0x21..0x7E.  */
9325       int b1 = c >> 8, b2 = c & 0xFF;
           /* Report the integer value of CODE; CODE itself is a
              Lisp_Object and must not be handed to `%d'.  */
9326       if (b1 < 0xA1 || b1 > 0xFE
9327           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9328         error ("Invalid code: %d", XFASTINT (code));
9329       charset = charset_big5;
9330     }
9331   c = DECODE_CHAR (charset, (unsigned )c);
       /* C has been overwritten by now, so take the value from CODE.  */
9332   if (c < 0)
9333     error ("Invalid code: %d", XFASTINT (code));
9334   return make_number (c);
9335 }
9336
9337 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9338        doc: /* Encode the Big5 character CH to BIG5 coding system.
9339 Return the corresponding character code in Big5.  */)
9340      (ch)
9341      Lisp_Object ch;
9342 {
9343   Lisp_Object spec, attrs, charset_list;
9344   struct charset *charset;
9345   int c;
9346   unsigned code;
9347
9348   CHECK_CHARACTER (ch);
9349   c = XFASTINT (ch);
9350   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9351   attrs = AREF (spec, 0);
       /* An ASCII character encodes to itself in an ASCII compatible
          coding system.  */
9352   if (ASCII_CHAR_P (c)
9353       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9354     return ch;
9355
9356   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9357   charset = char_charset (c, charset_list, &code);
       /* char_charset can return a null pointer when no charset of
          CHARSET_LIST contains C (`unencodable-char-position' tests for
          exactly that), so CHARSET must be checked before it is handed
          to CHARSET_INVALID_CODE.  */
9358   if (! charset || code == CHARSET_INVALID_CODE (charset))
9359     error ("Can't encode by Big5 encoding: %d", c);
9360
9361   return make_number (code);
9362 }
9363
9364 \f
9365 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9366        Sset_terminal_coding_system_internal, 1, 2, 0,
9367        doc: /* Internal use only.  */)
9368      (coding_system, terminal)
9369      Lisp_Object coding_system;
9370      Lisp_Object terminal;
9371 {
       struct terminal *term = get_terminal (terminal, 1);
       struct coding_system *coding = TERMINAL_TERMINAL_CODING (term);
       Lisp_Object checked;

9373   CHECK_SYMBOL (coding_system);
       checked = Fcheck_coding_system (coding_system);
       setup_coding_system (checked, coding);

       /* Unsafe characters had better not reach the terminal.  */
       coding->mode = coding->mode | CODING_MODE_SAFE_ENCODING;
       /* Character composition should be disabled.  */
       coding->common_flags
         = coding->common_flags & ~CODING_ANNOTATE_COMPOSITION_MASK;
       coding->src_multibyte = 1;
       coding->dst_multibyte = 0;

9381   return Qnil;
9382 }
9383
9384 DEFUN ("set-safe-terminal-coding-system-internal",
9385        Fset_safe_terminal_coding_system_internal,
9386        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9387        doc: /* Internal use only.  */)
9388      (coding_system)
9389      Lisp_Object coding_system;
9390 {
       struct coding_system *coding = &safe_terminal_coding;
       Lisp_Object checked;

9391   CHECK_SYMBOL (coding_system);
       checked = Fcheck_coding_system (coding_system);
       setup_coding_system (checked, coding);

       /* Character composition should be disabled.  */
       coding->common_flags
         = coding->common_flags & ~CODING_ANNOTATE_COMPOSITION_MASK;
       coding->src_multibyte = 1;
       coding->dst_multibyte = 0;

9398   return Qnil;
9399 }
9400
9401 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9402        Sterminal_coding_system, 0, 1, 0,
9403        doc: /* Return coding system specified for terminal output on the given terminal.
9404 TERMINAL may be a terminal object, a frame, or nil for the selected
9405 frame's terminal device.  */)
9406      (terminal)
9407      Lisp_Object terminal;
9408 {
       struct terminal *term = get_terminal (terminal, 1);
       Lisp_Object name = CODING_ID_NAME (TERMINAL_TERMINAL_CODING (term)->id);

       /* `undecided' is reported as nil for backward compatibility.  */
       if (EQ (name, Qundecided))
         return Qnil;
       return name;
9415 }
9416
9417 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9418        Sset_keyboard_coding_system_internal, 1, 2, 0,
9419        doc: /* Internal use only.  */)
9420      (coding_system, terminal)
9421      Lisp_Object coding_system;
9422      Lisp_Object terminal;
9423 {
       struct terminal *term = get_terminal (terminal, 1);
       struct coding_system *kbd_coding;

9425   CHECK_SYMBOL (coding_system);
       /* nil stands for `no-conversion'; anything else must name a
          coding system.  */
       if (! NILP (coding_system))
         Fcheck_coding_system (coding_system);
       else
         coding_system = Qno_conversion;

       kbd_coding = TERMINAL_KEYBOARD_CODING (term);
       setup_coding_system (coding_system, kbd_coding);

       /* Character composition should be disabled.  */
       kbd_coding = TERMINAL_KEYBOARD_CODING (term);
       kbd_coding->common_flags
         = kbd_coding->common_flags & ~CODING_ANNOTATE_COMPOSITION_MASK;

9434   return Qnil;
9435 }
9436
9437 DEFUN ("keyboard-coding-system",
9438        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9439        doc: /* Return coding system specified for decoding keyboard input.  */)
9440      (terminal)
9441      Lisp_Object terminal;
9442 {
       struct terminal *term = get_terminal (terminal, 1);
       struct coding_system *kbd_coding = TERMINAL_KEYBOARD_CODING (term);

       /* The result is the name registered for the keyboard coding
          system's id.  */
       return CODING_ID_NAME (kbd_coding->id);
9445 }
9446
9447 \f
9448 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9449        Sfind_operation_coding_system,  1, MANY, 0,
9450        doc: /* Choose a coding system for an operation based on the target name.
9451 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9452 DECODING-SYSTEM is the coding system to use for decoding
9453 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9454 for encoding (in case OPERATION does encoding).
9455
9456 The first argument OPERATION specifies an I/O primitive:
9457   For file I/O, `insert-file-contents' or `write-region'.
9458   For process I/O, `call-process', `call-process-region', or `start-process'.
9459   For network I/O, `open-network-stream'.
9460
9461 The remaining arguments should be the same arguments that were passed
9462 to the primitive.  Depending on which primitive, one of those arguments
9463 is selected as the TARGET.  For example, if OPERATION does file I/O,
9464 whichever argument specifies the file name is TARGET.
9465
9466 TARGET has a meaning which depends on OPERATION:
9467   For file I/O, TARGET is a file name (except for the special case below).
9468   For process I/O, TARGET is a process name.
9469   For network I/O, TARGET is a service name or a port number.
9470
9471 This function looks up what is specified for TARGET in
9472 `file-coding-system-alist', `process-coding-system-alist',
9473 or `network-coding-system-alist' depending on OPERATION.
9474 They may specify a coding system, a cons of coding systems,
9475 or a function symbol to call.
9476 In the last case, we call the function with one argument,
9477 which is a list of all the arguments given to this function.
9478 If the function can't decide a coding system, it can return
9479 `undecided' so that the normal code-detection is performed.
9480
9481 If OPERATION is `insert-file-contents', the argument corresponding to
9482 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9483 file name to look up, and BUFFER is a buffer that contains the file's
9484 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9485 function to call for FILENAME, that function should examine the
9486 contents of BUFFER instead of reading the file.
9487
9488 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9489      (nargs, args)
9490      int nargs;
9491      Lisp_Object *args;
9492 {
9493   Lisp_Object operation, target_idx, target, val;
9494   register Lisp_Object chain;
9495
9496   if (nargs < 2)
9497     error ("Too few arguments");
9498   operation = args[0];
       /* The `target-idx' property of OPERATION is used below as an index
          into ARGS, so besides being an integer it must not be
          negative.  */
9499   if (!SYMBOLP (operation)
9500       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx))
           || XINT (target_idx) < 0)
9501     error ("Invalid first argument");
       /* The target is ARGS[TARGET_IDX + 1], so at least TARGET_IDX + 2
          arguments are required; anything less would read past the end
          of ARGS.  */
9502   if (nargs <= 1 + XINT (target_idx))
9503     error ("Too few arguments for operation: %s",
9504            SDATA (SYMBOL_NAME (operation)));
9505   target = args[XINT (target_idx) + 1];
       /* TARGET must be a string, except that `insert-file-contents' also
          accepts (FILENAME . BUFFER) and `open-network-stream' also
          accepts an integer.  */
9506   if (!(STRINGP (target)
9507         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9508             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9509         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9510     error ("Invalid %dth argument", XINT (target_idx) + 1);
       /* For (FILENAME . BUFFER), only FILENAME takes part in the
          lookup.  */
9511   if (CONSP (target))
9512     target = XCAR (target);
9513
       /* Pick the alist that belongs to the kind of OPERATION.  */
9514   chain = ((EQ (operation, Qinsert_file_contents)
9515             || EQ (operation, Qwrite_region))
9516            ? Vfile_coding_system_alist
9517            : (EQ (operation, Qopen_network_stream)
9518               ? Vnetwork_coding_system_alist
9519               : Vprocess_coding_system_alist));
9520   if (NILP (chain))
9521     return Qnil;
9522
       /* The first matching element decides the result: a string key is
          matched against TARGET as a regexp, an integer TARGET is
          compared with EQ.  */
9523   for (; CONSP (chain); chain = XCDR (chain))
9524     {
9525       Lisp_Object elt;
9526
9527       elt = XCAR (chain);
9528       if (CONSP (elt)
9529           && ((STRINGP (target)
9530                && STRINGP (XCAR (elt))
9531                && fast_string_match (XCAR (elt), target) >= 0)
9532               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9533         {
9534           val = XCDR (elt);
9535           /* Here, if VAL is both a valid coding system and a valid
9536              function symbol, we return VAL as a coding system.  */
9537           if (CONSP (val))
9538             return val;
9539           if (! SYMBOLP (val))
9540             return Qnil;
9541           if (! NILP (Fcoding_system_p (val)))
9542             return Fcons (val, val);
9543           if (! NILP (Ffboundp (val)))
9544             {
9545               /* We use call1 rather than safe_call1
9546                  so as to get bug reports about functions called here
9547                  which don't handle the current interface.  */
9548               val = call1 (val, Flist (nargs, args));
9549               if (CONSP (val))
9550                 return val;
9551               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9552                 return Fcons (val, val);
9553             }
9554           return Qnil;
9555         }
9556     }
9557   return Qnil;
9558 }
9559
9560 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9561        Sset_coding_system_priority, 0, MANY, 0,
9562        doc: /* Assign higher priority to the coding systems given as arguments.
9563 If multiple coding systems belong to the same category,
9564 all but the first one are ignored.
9565
9566 usage: (set-coding-system-priority &rest coding-systems)  */)
9567      (nargs, args)
9568      int nargs;
9569      Lisp_Object *args;
9570 {
9571   int i, j;
       /* CHANGED[CATEGORY] is nonzero once CATEGORY has been given a new
          priority by one of the arguments.  */
9572   int changed[coding_category_max];
       /* The new priority order, copied over coding_priorities at the
          end.  */
9573   enum coding_category priorities[coding_category_max];
9574
9575   bzero (changed, sizeof changed);
9576
       /* First pass: the categories of the arguments, in argument order,
          fill the first J slots of PRIORITIES.  */
9577   for (i = j = 0; i < nargs; i++)
9578     {
9579       enum coding_category category;
9580       Lisp_Object spec, attrs;
9581
9582       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9583       attrs = AREF (spec, 0);
9584       category = XINT (CODING_ATTR_CATEGORY (attrs));
9585       if (changed[category])
9586         /* Ignore this coding system because a coding system of the
9587            same category already had a higher priority.  */
9588         continue;
9589       changed[category] = 1;
9590       priorities[j++] = category;
           /* Re-setup the category's coding system only if it is already
              set up (id >= 0) with a different coding system.  */
9591       if (coding_categories[category].id >= 0
9592           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9593         setup_coding_system (args[i], &coding_categories[category]);
9594       Fset (AREF (Vcoding_category_table, category), args[i]);
9595     }
9596
9597   /* Now we have decided top J priorities.  Reflect the order of the
9598      original priorities to the remaining priorities.  */
9599
       /* From here on I indexes the slot of PRIORITIES being filled and J
          walks the old order, skipping the categories placed above.  */
9600   for (i = j, j = 0; i < coding_category_max; i++, j++)
9601     {
9602       while (j < coding_category_max
9603              && changed[coding_priorities[j]])
9604         j++;
           /* Running out of old entries would mean that the two orders
              do not list the same categories.  */
9605       if (j == coding_category_max)
9606         abort ();
9607       priorities[i] = coding_priorities[j];
9608     }
9609
9610   bcopy (priorities, coding_priorities, sizeof priorities);
9611
9612   /* Update `coding-category-list'.  */
       /* Consing from the lowest priority upwards leaves the highest
          priority category at the head of the list.  */
9613   Vcoding_category_list = Qnil;
9614   for (i = coding_category_max - 1; i >= 0; i--)
9615     Vcoding_category_list
9616       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9617                Vcoding_category_list);
9618
9619   return Qnil;
9620 }
9621
9622 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9623        Scoding_system_priority_list, 0, 1, 0,
9624        doc: /* Return a list of coding systems ordered by their priorities.
9625 The list contains a subset of coding systems; i.e. coding systems
9626 assigned to each coding category (see `coding-category-list').
9627
9628 HIGHESTP non-nil means just return the highest priority one.  */)
9629      (highestp)
9630      Lisp_Object highestp;
9631 {
       Lisp_Object names = Qnil;
       int rank;

       /* Walk the categories from the highest priority down.  */
       for (rank = 0; rank < coding_category_max; rank++)
         {
           int coding_id = coding_categories[coding_priorities[rank]].id;

           /* A negative id marks a category to be left out.  */
           if (coding_id >= 0)
             {
               Lisp_Object base_name
                 = CODING_ATTR_BASE_NAME (CODING_ID_ATTRS (coding_id));

               /* With HIGHESTP, the first usable category is the
                  answer.  */
               if (! NILP (highestp))
                 return base_name;
               names = Fcons (base_name, names);
             }
         }

       /* NAMES was collected in reverse priority order.  */
       return Fnreverse (names);
9649 }
9650
     /* Suffixes that make_subsidiaries appends to a base name.  */
9651 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9652
     /* Return a vector of three symbols whose names are the name of the
        symbol BASE followed by each element of `suffixes', in that
        order.  */
9653 static Lisp_Object
9654 make_subsidiaries (base)
9655      Lisp_Object base;
9656 {
       int stem_len = SBYTES (SYMBOL_NAME (base));
       /* Room for the stem, the longest suffix (five bytes) and the
          terminating null byte.  */
       char *name_buf = (char *) alloca (stem_len + 6);
       Lisp_Object vec;
       int k;

       bcopy (SDATA (SYMBOL_NAME (base)), name_buf, stem_len);
       vec = Fmake_vector (make_number (3), Qnil);
       k = 0;
       while (k < 3)
         {
           const char *suffix = suffixes[k];

           /* Copy the suffix together with its terminating null byte
              right behind the stem, then intern the resulting name.  */
           bcopy (suffix, name_buf + stem_len, strlen (suffix) + 1);
           ASET (vec, k, intern (name_buf));
           k++;
         }
       return vec;
9670 }
9671
9672
9673 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9674        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9675        doc: /* For internal use only.
9676 usage: (define-coding-system-internal ...)  */)
9677      (nargs, args)
9678      int nargs;
9679      Lisp_Object *args;
9680 {
9681   Lisp_Object name;
9682   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9683   Lisp_Object attrs;            /* Vector of attributes.  */
9684   Lisp_Object eol_type;
9685   Lisp_Object aliases;
9686   Lisp_Object coding_type, charset_list, safe_charsets;
9687   enum coding_category category;
9688   Lisp_Object tail, val;
9689   int max_charset_id = 0;
9690   int i;
9691
9692   if (nargs < coding_arg_max)
9693     goto short_args;
9694
9695   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9696
9697   name = args[coding_arg_name];
9698   CHECK_SYMBOL (name);
9699   CODING_ATTR_BASE_NAME (attrs) = name;
9700
9701   val = args[coding_arg_mnemonic];
9702   if (! STRINGP (val))
9703     CHECK_CHARACTER (val);
9704   CODING_ATTR_MNEMONIC (attrs) = val;
9705
9706   coding_type = args[coding_arg_coding_type];
9707   CHECK_SYMBOL (coding_type);
9708   CODING_ATTR_TYPE (attrs) = coding_type;
9709
9710   charset_list = args[coding_arg_charset_list];
9711   if (SYMBOLP (charset_list))
9712     {
9713       if (EQ (charset_list, Qiso_2022))
9714         {
9715           if (! EQ (coding_type, Qiso_2022))
9716             error ("Invalid charset-list");
9717           charset_list = Viso_2022_charset_list;
9718         }
9719       else if (EQ (charset_list, Qemacs_mule))
9720         {
9721           if (! EQ (coding_type, Qemacs_mule))
9722             error ("Invalid charset-list");
9723           charset_list = Vemacs_mule_charset_list;
9724         }
9725       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9726         if (max_charset_id < XFASTINT (XCAR (tail)))
9727           max_charset_id = XFASTINT (XCAR (tail));
9728     }
9729   else
9730     {
9731       charset_list = Fcopy_sequence (charset_list);
9732       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9733         {
9734           struct charset *charset;
9735
9736           val = XCAR (tail);
9737           CHECK_CHARSET_GET_CHARSET (val, charset);
9738           if (EQ (coding_type, Qiso_2022)
9739               ? CHARSET_ISO_FINAL (charset) < 0
9740               : EQ (coding_type, Qemacs_mule)
9741               ? CHARSET_EMACS_MULE_ID (charset) < 0
9742               : 0)
9743             error ("Can't handle charset `%s'",
9744                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9745
9746           XSETCAR (tail, make_number (charset->id));
9747           if (max_charset_id < charset->id)
9748             max_charset_id = charset->id;
9749         }
9750     }
9751   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9752
9753   safe_charsets = make_uninit_string (max_charset_id + 1);
9754   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9755   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9756     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9757   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9758
9759   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9760
9761   val = args[coding_arg_decode_translation_table];
9762   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9763     CHECK_SYMBOL (val);
9764   CODING_ATTR_DECODE_TBL (attrs) = val;
9765
9766   val = args[coding_arg_encode_translation_table];
9767   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9768     CHECK_SYMBOL (val);
9769   CODING_ATTR_ENCODE_TBL (attrs) = val;
9770
9771   val = args[coding_arg_post_read_conversion];
9772   CHECK_SYMBOL (val);
9773   CODING_ATTR_POST_READ (attrs) = val;
9774
9775   val = args[coding_arg_pre_write_conversion];
9776   CHECK_SYMBOL (val);
9777   CODING_ATTR_PRE_WRITE (attrs) = val;
9778
9779   val = args[coding_arg_default_char];
9780   if (NILP (val))
9781     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9782   else
9783     {
9784       CHECK_CHARACTER (val);
9785       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9786     }
9787
9788   val = args[coding_arg_for_unibyte];
9789   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9790
9791   val = args[coding_arg_plist];
9792   CHECK_LIST (val);
9793   CODING_ATTR_PLIST (attrs) = val;
9794
9795   if (EQ (coding_type, Qcharset))
9796     {
9797       /* Generate a lisp vector of 256 elements.  Each element is nil,
9798          integer, or a list of charset IDs.
9799
9800          If Nth element is nil, the byte code N is invalid in this
9801          coding system.
9802
9803          If Nth element is a number NUM, N is the first byte of a
9804          charset whose ID is NUM.
9805
9806          If Nth element is a list of charset IDs, N is the first byte
9807          of one of them.  The list is sorted by dimensions of the
9808          charsets.  A charset of smaller dimension comes firtst. */
9809       val = Fmake_vector (make_number (256), Qnil);
9810
9811       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9812         {
9813           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9814           int dim = CHARSET_DIMENSION (charset);
9815           int idx = (dim - 1) * 4;
9816
9817           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9818             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9819
9820           for (i = charset->code_space[idx];
9821                i <= charset->code_space[idx + 1]; i++)
9822             {
9823               Lisp_Object tmp, tmp2;
9824               int dim2;
9825
9826               tmp = AREF (val, i);
9827               if (NILP (tmp))
9828                 tmp = XCAR (tail);
9829               else if (NUMBERP (tmp))
9830                 {
9831                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9832                   if (dim < dim2)
9833                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9834                   else
9835                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9836                 }
9837               else
9838                 {
9839                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9840                     {
9841                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9842                       if (dim < dim2)
9843                         break;
9844                     }
9845                   if (NILP (tmp2))
9846                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9847                   else
9848                     {
9849                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9850                       XSETCAR (tmp2, XCAR (tail));
9851                     }
9852                 }
9853               ASET (val, i, tmp);
9854             }
9855         }
9856       ASET (attrs, coding_attr_charset_valids, val);
9857       category = coding_category_charset;
9858     }
9859   else if (EQ (coding_type, Qccl))
9860     {
9861       Lisp_Object valids;
9862
9863       if (nargs < coding_arg_ccl_max)
9864         goto short_args;
9865
9866       val = args[coding_arg_ccl_decoder];
9867       CHECK_CCL_PROGRAM (val);
9868       if (VECTORP (val))
9869         val = Fcopy_sequence (val);
9870       ASET (attrs, coding_attr_ccl_decoder, val);
9871
9872       val = args[coding_arg_ccl_encoder];
9873       CHECK_CCL_PROGRAM (val);
9874       if (VECTORP (val))
9875         val = Fcopy_sequence (val);
9876       ASET (attrs, coding_attr_ccl_encoder, val);
9877
9878       val = args[coding_arg_ccl_valids];
9879       valids = Fmake_string (make_number (256), make_number (0));
9880       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9881         {
9882           int from, to;
9883
9884           val = Fcar (tail);
9885           if (INTEGERP (val))
9886             {
9887               from = to = XINT (val);
9888               if (from < 0 || from > 255)
9889                 args_out_of_range_3 (val, make_number (0), make_number (255));
9890             }
9891           else
9892             {
9893               CHECK_CONS (val);
9894               CHECK_NATNUM_CAR (val);
9895               CHECK_NATNUM_CDR (val);
9896               from = XINT (XCAR (val));
9897               if (from > 255)
9898                 args_out_of_range_3 (XCAR (val),
9899                                      make_number (0), make_number (255));
9900               to = XINT (XCDR (val));
9901               if (to < from || to > 255)
9902                 args_out_of_range_3 (XCDR (val),
9903                                      XCAR (val), make_number (255));
9904             }
9905           for (i = from; i <= to; i++)
9906             SSET (valids, i, 1);
9907         }
9908       ASET (attrs, coding_attr_ccl_valids, valids);
9909
9910       category = coding_category_ccl;
9911     }
9912   else if (EQ (coding_type, Qutf_16))
9913     {
9914       Lisp_Object bom, endian;
9915
9916       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9917
9918       if (nargs < coding_arg_utf16_max)
9919         goto short_args;
9920
9921       bom = args[coding_arg_utf16_bom];
9922       if (! NILP (bom) && ! EQ (bom, Qt))
9923         {
9924           CHECK_CONS (bom);
9925           val = XCAR (bom);
9926           CHECK_CODING_SYSTEM (val);
9927           val = XCDR (bom);
9928           CHECK_CODING_SYSTEM (val);
9929         }
9930       ASET (attrs, coding_attr_utf_bom, bom);
9931
9932       endian = args[coding_arg_utf16_endian];
9933       CHECK_SYMBOL (endian);
9934       if (NILP (endian))
9935         endian = Qbig;
9936       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9937         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9938       ASET (attrs, coding_attr_utf_16_endian, endian);
9939
9940       category = (CONSP (bom)
9941                   ? coding_category_utf_16_auto
9942                   : NILP (bom)
9943                   ? (EQ (endian, Qbig)
9944                      ? coding_category_utf_16_be_nosig
9945                      : coding_category_utf_16_le_nosig)
9946                   : (EQ (endian, Qbig)
9947                      ? coding_category_utf_16_be
9948                      : coding_category_utf_16_le));
9949     }
9950   else if (EQ (coding_type, Qiso_2022))
9951     {
9952       Lisp_Object initial, reg_usage, request, flags;
9953       int i;
9954
9955       if (nargs < coding_arg_iso2022_max)
9956         goto short_args;
9957
9958       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9959       CHECK_VECTOR (initial);
9960       for (i = 0; i < 4; i++)
9961         {
9962           val = Faref (initial, make_number (i));
9963           if (! NILP (val))
9964             {
9965               struct charset *charset;
9966
9967               CHECK_CHARSET_GET_CHARSET (val, charset);
9968               ASET (initial, i, make_number (CHARSET_ID (charset)));
9969               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9970                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9971             }
9972           else
9973             ASET (initial, i, make_number (-1));
9974         }
9975
9976       reg_usage = args[coding_arg_iso2022_reg_usage];
9977       CHECK_CONS (reg_usage);
9978       CHECK_NUMBER_CAR (reg_usage);
9979       CHECK_NUMBER_CDR (reg_usage);
9980
9981       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9982       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9983         {
9984           int id;
9985           Lisp_Object tmp;
9986
9987           val = Fcar (tail);
9988           CHECK_CONS (val);
9989           tmp = XCAR (val);
9990           CHECK_CHARSET_GET_ID (tmp, id);
9991           CHECK_NATNUM_CDR (val);
9992           if (XINT (XCDR (val)) >= 4)
9993             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9994           XSETCAR (val, make_number (id));
9995         }
9996
9997       flags = args[coding_arg_iso2022_flags];
9998       CHECK_NATNUM (flags);
9999       i = XINT (flags);
10000       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10001         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
10002
10003       ASET (attrs, coding_attr_iso_initial, initial);
10004       ASET (attrs, coding_attr_iso_usage, reg_usage);
10005       ASET (attrs, coding_attr_iso_request, request);
10006       ASET (attrs, coding_attr_iso_flags, flags);
10007       setup_iso_safe_charsets (attrs);
10008
10009       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10010         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10011                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10012                     ? coding_category_iso_7_else
10013                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10014                     ? coding_category_iso_7
10015                     : coding_category_iso_7_tight);
10016       else
10017         {
10018           int id = XINT (AREF (initial, 1));
10019
10020           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10021                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10022                        || id < 0)
10023                       ? coding_category_iso_8_else
10024                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10025                       ? coding_category_iso_8_1
10026                       : coding_category_iso_8_2);
10027         }
10028       if (category != coding_category_iso_8_1
10029           && category != coding_category_iso_8_2)
10030         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
10031     }
10032   else if (EQ (coding_type, Qemacs_mule))
10033     {
10034       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10035         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10036       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10037       category = coding_category_emacs_mule;
10038     }
10039   else if (EQ (coding_type, Qshift_jis))
10040     {
10041
10042       struct charset *charset;
10043
10044       if (XINT (Flength (charset_list)) != 3
10045           && XINT (Flength (charset_list)) != 4)
10046         error ("There should be three or four charsets");
10047
10048       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10049       if (CHARSET_DIMENSION (charset) != 1)
10050         error ("Dimension of charset %s is not one",
10051                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10052       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10053         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10054
10055       charset_list = XCDR (charset_list);
10056       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10057       if (CHARSET_DIMENSION (charset) != 1)
10058         error ("Dimension of charset %s is not one",
10059                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10060
10061       charset_list = XCDR (charset_list);
10062       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10063       if (CHARSET_DIMENSION (charset) != 2)
10064         error ("Dimension of charset %s is not two",
10065                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10066
10067       charset_list = XCDR (charset_list);
10068       if (! NILP (charset_list))
10069         {
10070           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10071           if (CHARSET_DIMENSION (charset) != 2)
10072             error ("Dimension of charset %s is not two",
10073                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10074         }
10075
10076       category = coding_category_sjis;
10077       Vsjis_coding_system = name;
10078     }
10079   else if (EQ (coding_type, Qbig5))
10080     {
10081       struct charset *charset;
10082
10083       if (XINT (Flength (charset_list)) != 2)
10084         error ("There should be just two charsets");
10085
10086       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10087       if (CHARSET_DIMENSION (charset) != 1)
10088         error ("Dimension of charset %s is not one",
10089                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10090       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10091         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10092
10093       charset_list = XCDR (charset_list);
10094       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10095       if (CHARSET_DIMENSION (charset) != 2)
10096         error ("Dimension of charset %s is not two",
10097                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10098
10099       category = coding_category_big5;
10100       Vbig5_coding_system = name;
10101     }
10102   else if (EQ (coding_type, Qraw_text))
10103     {
10104       category = coding_category_raw_text;
10105       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10106     }
10107   else if (EQ (coding_type, Qutf_8))
10108     {
10109       Lisp_Object bom;
10110
10111       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10112
10113       if (nargs < coding_arg_utf8_max)
10114         goto short_args;
10115
10116       bom = args[coding_arg_utf8_bom];
10117       if (! NILP (bom) && ! EQ (bom, Qt))
10118         {
10119           CHECK_CONS (bom);
10120           val = XCAR (bom);
10121           CHECK_CODING_SYSTEM (val);
10122           val = XCDR (bom);
10123           CHECK_CODING_SYSTEM (val);
10124         }
10125       ASET (attrs, coding_attr_utf_bom, bom);
10126
10127       category = (CONSP (bom) ? coding_category_utf_8_auto
10128                   : NILP (bom) ? coding_category_utf_8_nosig
10129                   : coding_category_utf_8_sig);
10130     }
10131   else if (EQ (coding_type, Qundecided))
10132     category = coding_category_undecided;
10133   else
10134     error ("Invalid coding system type: %s",
10135            SDATA (SYMBOL_NAME (coding_type)));
10136
10137   CODING_ATTR_CATEGORY (attrs) = make_number (category);
10138   CODING_ATTR_PLIST (attrs)
10139     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10140                                 CODING_ATTR_PLIST (attrs)));
10141   CODING_ATTR_PLIST (attrs)
10142     = Fcons (QCascii_compatible_p,
10143              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10144                     CODING_ATTR_PLIST (attrs)));
10145
10146   eol_type = args[coding_arg_eol_type];
10147   if (! NILP (eol_type)
10148       && ! EQ (eol_type, Qunix)
10149       && ! EQ (eol_type, Qdos)
10150       && ! EQ (eol_type, Qmac))
10151     error ("Invalid eol-type");
10152
10153   aliases = Fcons (name, Qnil);
10154
10155   if (NILP (eol_type))
10156     {
10157       eol_type = make_subsidiaries (name);
10158       for (i = 0; i < 3; i++)
10159         {
10160           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10161
10162           this_name = AREF (eol_type, i);
10163           this_aliases = Fcons (this_name, Qnil);
10164           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10165           this_spec = Fmake_vector (make_number (3), attrs);
10166           ASET (this_spec, 1, this_aliases);
10167           ASET (this_spec, 2, this_eol_type);
10168           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10169           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10170           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10171           if (NILP (val))
10172             Vcoding_system_alist
10173               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10174                        Vcoding_system_alist);
10175         }
10176     }
10177
10178   spec_vec = Fmake_vector (make_number (3), attrs);
10179   ASET (spec_vec, 1, aliases);
10180   ASET (spec_vec, 2, eol_type);
10181
10182   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10183   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10184   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10185   if (NILP (val))
10186     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10187                                   Vcoding_system_alist);
10188
10189   {
10190     int id = coding_categories[category].id;
10191
10192     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10193       setup_coding_system (name, &coding_categories[category]);
10194   }
10195
10196   return Qnil;
10197
10198  short_args:
10199   return Fsignal (Qwrong_number_of_arguments,
10200                   Fcons (intern ("define-coding-system-internal"),
10201                          make_number (nargs)));
10202 }
10203
10204
10205 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10206        3, 3, 0,
10207        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10208   (coding_system, prop, val)
10209      Lisp_Object coding_system, prop, val;
10210 {
10211   Lisp_Object spec, attrs;
10212
10213   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10214   attrs = AREF (spec, 0);
10215   if (EQ (prop, QCmnemonic))
10216     {
10217       if (! STRINGP (val))
10218         CHECK_CHARACTER (val);
10219       CODING_ATTR_MNEMONIC (attrs) = val;
10220     }
10221   else if (EQ (prop, QCdefault_char))
10222     {
10223       if (NILP (val))
10224         val = make_number (' ');
10225       else
10226         CHECK_CHARACTER (val);
10227       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10228     }
10229   else if (EQ (prop, QCdecode_translation_table))
10230     {
10231       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10232         CHECK_SYMBOL (val);
10233       CODING_ATTR_DECODE_TBL (attrs) = val;
10234     }
10235   else if (EQ (prop, QCencode_translation_table))
10236     {
10237       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10238         CHECK_SYMBOL (val);
10239       CODING_ATTR_ENCODE_TBL (attrs) = val;
10240     }
10241   else if (EQ (prop, QCpost_read_conversion))
10242     {
10243       CHECK_SYMBOL (val);
10244       CODING_ATTR_POST_READ (attrs) = val;
10245     }
10246   else if (EQ (prop, QCpre_write_conversion))
10247     {
10248       CHECK_SYMBOL (val);
10249       CODING_ATTR_PRE_WRITE (attrs) = val;
10250     }
10251   else if (EQ (prop, QCascii_compatible_p))
10252     {
10253       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10254     }
10255
10256   CODING_ATTR_PLIST (attrs)
10257     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10258   return val;
10259 }
10260
10261
10262 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10263        Sdefine_coding_system_alias, 2, 2, 0,
10264        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10265      (alias, coding_system)
10266      Lisp_Object alias, coding_system;
10267 {
10268   Lisp_Object spec, aliases, eol_type, val;
10269
10270   CHECK_SYMBOL (alias);
10271   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10272   aliases = AREF (spec, 1);
10273   /* ALIASES should be a list of length more than zero, and the first
10274      element is a base coding system.  Append ALIAS at the tail of the
10275      list.  */
10276   while (!NILP (XCDR (aliases)))
10277     aliases = XCDR (aliases);
10278   XSETCDR (aliases, Fcons (alias, Qnil));
10279
10280   eol_type = AREF (spec, 2);
10281   if (VECTORP (eol_type))
10282     {
10283       Lisp_Object subsidiaries;
10284       int i;
10285
10286       subsidiaries = make_subsidiaries (alias);
10287       for (i = 0; i < 3; i++)
10288         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10289                                      AREF (eol_type, i));
10290     }
10291
10292   Fputhash (alias, spec, Vcoding_system_hash_table);
10293   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10294   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10295   if (NILP (val))
10296     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10297                                   Vcoding_system_alist);
10298
10299   return Qnil;
10300 }
10301
10302 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10303        1, 1, 0,
10304        doc: /* Return the base of CODING-SYSTEM.
10305 Any alias or subsidiary coding system is not a base coding system.  */)
10306   (coding_system)
10307      Lisp_Object coding_system;
10308 {
10309   Lisp_Object spec, attrs;
10310
10311   if (NILP (coding_system))
10312     return (Qno_conversion);
10313   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10314   attrs = AREF (spec, 0);
10315   return CODING_ATTR_BASE_NAME (attrs);
10316 }
10317
10318 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10319        1, 1, 0,
10320        doc: "Return the property list of CODING-SYSTEM.")
10321      (coding_system)
10322      Lisp_Object coding_system;
10323 {
10324   Lisp_Object spec, attrs;
10325
10326   if (NILP (coding_system))
10327     coding_system = Qno_conversion;
10328   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10329   attrs = AREF (spec, 0);
10330   return CODING_ATTR_PLIST (attrs);
10331 }
10332
10333
10334 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10335        1, 1, 0,
10336        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10337      (coding_system)
10338      Lisp_Object coding_system;
10339 {
10340   Lisp_Object spec;
10341
10342   if (NILP (coding_system))
10343     coding_system = Qno_conversion;
10344   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10345   return AREF (spec, 1);
10346 }
10347
10348 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10349        Scoding_system_eol_type, 1, 1, 0,
10350        doc: /* Return eol-type of CODING-SYSTEM.
10351 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10352
10353 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10354 and CR respectively.
10355
10356 A vector value indicates that a format of end-of-line should be
10357 detected automatically.  Nth element of the vector is the subsidiary
10358 coding system whose eol-type is N.  */)
10359      (coding_system)
10360      Lisp_Object coding_system;
10361 {
10362   Lisp_Object spec, eol_type;
10363   int n;
10364
10365   if (NILP (coding_system))
10366     coding_system = Qno_conversion;
10367   if (! CODING_SYSTEM_P (coding_system))
10368     return Qnil;
10369   spec = CODING_SYSTEM_SPEC (coding_system);
10370   eol_type = AREF (spec, 2);
10371   if (VECTORP (eol_type))
10372     return Fcopy_sequence (eol_type);
10373   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10374   return make_number (n);
10375 }
10376
10377 #endif /* emacs */
10378
10379 \f
/*** 12. Postamble ***/
10381
10382 void
10383 init_coding_once ()
10384 {
10385   int i;
10386
10387   for (i = 0; i < coding_category_max; i++)
10388     {
10389       coding_categories[i].id = -1;
10390       coding_priorities[i] = i;
10391     }
10392
10393   /* ISO2022 specific initialize routine.  */
10394   for (i = 0; i < 0x20; i++)
10395     iso_code_class[i] = ISO_control_0;
10396   for (i = 0x21; i < 0x7F; i++)
10397     iso_code_class[i] = ISO_graphic_plane_0;
10398   for (i = 0x80; i < 0xA0; i++)
10399     iso_code_class[i] = ISO_control_1;
10400   for (i = 0xA1; i < 0xFF; i++)
10401     iso_code_class[i] = ISO_graphic_plane_1;
10402   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10403   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10404   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10405   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10406   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10407   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10408   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10409   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10410   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10411
10412   for (i = 0; i < 256; i++)
10413     {
10414       emacs_mule_bytes[i] = 1;
10415     }
10416   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10417   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10418   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10419   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10420 }
10421
10422 #ifdef emacs
10423
10424 void
10425 syms_of_coding ()
10426 {
10427   staticpro (&Vcoding_system_hash_table);
10428   {
10429     Lisp_Object args[2];
10430     args[0] = QCtest;
10431     args[1] = Qeq;
10432     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10433   }
10434
10435   staticpro (&Vsjis_coding_system);
10436   Vsjis_coding_system = Qnil;
10437
10438   staticpro (&Vbig5_coding_system);
10439   Vbig5_coding_system = Qnil;
10440
10441   staticpro (&Vcode_conversion_reused_workbuf);
10442   Vcode_conversion_reused_workbuf = Qnil;
10443
10444   staticpro (&Vcode_conversion_workbuf_name);
10445   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10446
10447   reused_workbuf_in_use = 0;
10448
10449   DEFSYM (Qcharset, "charset");
10450   DEFSYM (Qtarget_idx, "target-idx");
10451   DEFSYM (Qcoding_system_history, "coding-system-history");
10452   Fset (Qcoding_system_history, Qnil);
10453
10454   /* Target FILENAME is the first argument.  */
10455   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10456   /* Target FILENAME is the third argument.  */
10457   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10458
10459   DEFSYM (Qcall_process, "call-process");
10460   /* Target PROGRAM is the first argument.  */
10461   Fput (Qcall_process, Qtarget_idx, make_number (0));
10462
10463   DEFSYM (Qcall_process_region, "call-process-region");
10464   /* Target PROGRAM is the third argument.  */
10465   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10466
10467   DEFSYM (Qstart_process, "start-process");
10468   /* Target PROGRAM is the third argument.  */
10469   Fput (Qstart_process, Qtarget_idx, make_number (2));
10470
10471   DEFSYM (Qopen_network_stream, "open-network-stream");
10472   /* Target SERVICE is the fourth argument.  */
10473   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10474
10475   DEFSYM (Qcoding_system, "coding-system");
10476   DEFSYM (Qcoding_aliases, "coding-aliases");
10477
10478   DEFSYM (Qeol_type, "eol-type");
10479   DEFSYM (Qunix, "unix");
10480   DEFSYM (Qdos, "dos");
10481
10482   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10483   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10484   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10485   DEFSYM (Qdefault_char, "default-char");
10486   DEFSYM (Qundecided, "undecided");
10487   DEFSYM (Qno_conversion, "no-conversion");
10488   DEFSYM (Qraw_text, "raw-text");
10489
10490   DEFSYM (Qiso_2022, "iso-2022");
10491
10492   DEFSYM (Qutf_8, "utf-8");
10493   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10494
10495   DEFSYM (Qutf_16, "utf-16");
10496   DEFSYM (Qbig, "big");
10497   DEFSYM (Qlittle, "little");
10498
10499   DEFSYM (Qshift_jis, "shift-jis");
10500   DEFSYM (Qbig5, "big5");
10501
10502   DEFSYM (Qcoding_system_p, "coding-system-p");
10503
10504   DEFSYM (Qcoding_system_error, "coding-system-error");
10505   Fput (Qcoding_system_error, Qerror_conditions,
10506         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10507   Fput (Qcoding_system_error, Qerror_message,
10508         make_pure_c_string ("Invalid coding system"));
10509
10510   /* Intern this now in case it isn't already done.
10511      Setting this variable twice is harmless.
10512      But don't staticpro it here--that is done in alloc.c.  */
10513   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10514
10515   DEFSYM (Qtranslation_table, "translation-table");
10516   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10517   DEFSYM (Qtranslation_table_id, "translation-table-id");
10518   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10519   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10520
10521   DEFSYM (Qvalid_codes, "valid-codes");
10522
10523   DEFSYM (Qemacs_mule, "emacs-mule");
10524
10525   DEFSYM (QCcategory, ":category");
10526   DEFSYM (QCmnemonic, ":mnemonic");
10527   DEFSYM (QCdefault_char, ":default-char");
10528   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10529   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10530   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10531   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10532   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10533
10534   Vcoding_category_table
10535     = Fmake_vector (make_number (coding_category_max), Qnil);
10536   staticpro (&Vcoding_category_table);
10537   /* The following categories are targets of code detection.  */
10538   ASET (Vcoding_category_table, coding_category_iso_7,
10539         intern_c_string ("coding-category-iso-7"));
10540   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10541         intern_c_string ("coding-category-iso-7-tight"));
10542   ASET (Vcoding_category_table, coding_category_iso_8_1,
10543         intern_c_string ("coding-category-iso-8-1"));
10544   ASET (Vcoding_category_table, coding_category_iso_8_2,
10545         intern_c_string ("coding-category-iso-8-2"));
10546   ASET (Vcoding_category_table, coding_category_iso_7_else,
10547         intern_c_string ("coding-category-iso-7-else"));
10548   ASET (Vcoding_category_table, coding_category_iso_8_else,
10549         intern_c_string ("coding-category-iso-8-else"));
10550   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10551         intern_c_string ("coding-category-utf-8-auto"));
10552   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10553         intern_c_string ("coding-category-utf-8"));
10554   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10555         intern_c_string ("coding-category-utf-8-sig"));
10556   ASET (Vcoding_category_table, coding_category_utf_16_be,
10557         intern_c_string ("coding-category-utf-16-be"));
10558   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10559         intern_c_string ("coding-category-utf-16-auto"));
10560   ASET (Vcoding_category_table, coding_category_utf_16_le,
10561         intern_c_string ("coding-category-utf-16-le"));
10562   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10563         intern_c_string ("coding-category-utf-16-be-nosig"));
10564   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10565         intern_c_string ("coding-category-utf-16-le-nosig"));
10566   ASET (Vcoding_category_table, coding_category_charset,
10567         intern_c_string ("coding-category-charset"));
10568   ASET (Vcoding_category_table, coding_category_sjis,
10569         intern_c_string ("coding-category-sjis"));
10570   ASET (Vcoding_category_table, coding_category_big5,
10571         intern_c_string ("coding-category-big5"));
10572   ASET (Vcoding_category_table, coding_category_ccl,
10573         intern_c_string ("coding-category-ccl"));
10574   ASET (Vcoding_category_table, coding_category_emacs_mule,
10575         intern_c_string ("coding-category-emacs-mule"));
10576   /* The following categories are NOT targets of code detection.  */
10577   ASET (Vcoding_category_table, coding_category_raw_text,
10578         intern_c_string ("coding-category-raw-text"));
10579   ASET (Vcoding_category_table, coding_category_undecided,
10580         intern_c_string ("coding-category-undecided"));
10581
10582   DEFSYM (Qinsufficient_source, "insufficient-source");
10583   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10584   DEFSYM (Qinvalid_source, "invalid-source");
10585   DEFSYM (Qinterrupted, "interrupted");
10586   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10587   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10588
10589   defsubr (&Scoding_system_p);
10590   defsubr (&Sread_coding_system);
10591   defsubr (&Sread_non_nil_coding_system);
10592   defsubr (&Scheck_coding_system);
10593   defsubr (&Sdetect_coding_region);
10594   defsubr (&Sdetect_coding_string);
10595   defsubr (&Sfind_coding_systems_region_internal);
10596   defsubr (&Sunencodable_char_position);
10597   defsubr (&Scheck_coding_systems_region);
10598   defsubr (&Sdecode_coding_region);
10599   defsubr (&Sencode_coding_region);
10600   defsubr (&Sdecode_coding_string);
10601   defsubr (&Sencode_coding_string);
10602   defsubr (&Sdecode_sjis_char);
10603   defsubr (&Sencode_sjis_char);
10604   defsubr (&Sdecode_big5_char);
10605   defsubr (&Sencode_big5_char);
10606   defsubr (&Sset_terminal_coding_system_internal);
10607   defsubr (&Sset_safe_terminal_coding_system_internal);
10608   defsubr (&Sterminal_coding_system);
10609   defsubr (&Sset_keyboard_coding_system_internal);
10610   defsubr (&Skeyboard_coding_system);
10611   defsubr (&Sfind_operation_coding_system);
10612   defsubr (&Sset_coding_system_priority);
10613   defsubr (&Sdefine_coding_system_internal);
10614   defsubr (&Sdefine_coding_system_alias);
10615   defsubr (&Scoding_system_put);
10616   defsubr (&Scoding_system_base);
10617   defsubr (&Scoding_system_plist);
10618   defsubr (&Scoding_system_aliases);
10619   defsubr (&Scoding_system_eol_type);
10620   defsubr (&Scoding_system_priority_list);
10621
10622   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10623                doc: /* List of coding systems.
10624
10625 Do not alter the value of this variable manually.  This variable should be
10626 updated by the functions `define-coding-system' and
10627 `define-coding-system-alias'.  */);
10628   Vcoding_system_list = Qnil;
10629
10630   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10631                doc: /* Alist of coding system names.
10632 Each element is one element list of coding system name.
10633 This variable is given to `completing-read' as COLLECTION argument.
10634
10635 Do not alter the value of this variable manually.  This variable should be
10636 updated by the functions `make-coding-system' and
10637 `define-coding-system-alias'.  */);
10638   Vcoding_system_alist = Qnil;
10639
10640   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10641                doc: /* List of coding-categories (symbols) ordered by priority.
10642
10643 On detecting a coding system, Emacs tries code detection algorithms
10644 associated with each coding-category one by one in this order.  When
10645 one algorithm agrees with a byte sequence of source text, the coding
10646 system bound to the corresponding coding-category is selected.
10647
10648 Don't modify this variable directly, but use `set-coding-priority'.  */);
10649   {
10650     int i;
10651
10652     Vcoding_category_list = Qnil;
10653     for (i = coding_category_max - 1; i >= 0; i--)
10654       Vcoding_category_list
10655         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10656                  Vcoding_category_list);
10657   }
10658
10659   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10660                doc: /* Specify the coding system for read operations.
10661 It is useful to bind this variable with `let', but do not set it globally.
10662 If the value is a coding system, it is used for decoding on read operation.
10663 If not, an appropriate element is used from one of the coding system alists.
10664 There are three such tables: `file-coding-system-alist',
10665 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10666   Vcoding_system_for_read = Qnil;
10667
10668   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10669                doc: /* Specify the coding system for write operations.
10670 Programs bind this variable with `let', but you should not set it globally.
10671 If the value is a coding system, it is used for encoding of output,
10672 when writing it to a file and when sending it to a file or subprocess.
10673
10674 If this does not specify a coding system, an appropriate element
10675 is used from one of the coding system alists.
10676 There are three such tables: `file-coding-system-alist',
10677 `process-coding-system-alist', and `network-coding-system-alist'.
10678 For output to files, if the above procedure does not specify a coding system,
10679 the value of `buffer-file-coding-system' is used.  */);
10680   Vcoding_system_for_write = Qnil;
10681
10682   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10683                doc: /*
10684 Coding system used in the latest file or process I/O.  */);
10685   Vlast_coding_system_used = Qnil;
10686
10687   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10688                doc: /*
10689 Error status of the last code conversion.
10690
10691 When an error was detected in the last code conversion, this variable
10692 is set to one of the following symbols.
10693   `insufficient-source'
10694   `inconsistent-eol'
10695   `invalid-source'
10696   `interrupted'
10697   `insufficient-memory'
10698 When no error was detected, the value doesn't change.  So, to check
10699 the error status of a code conversion by this variable, you must
10700 explicitly set this variable to nil before performing code
10701 conversion.  */);
10702   Vlast_code_conversion_error = Qnil;
10703
10704   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10705                doc: /*
10706 *Non-nil means always inhibit code conversion of end-of-line format.
10707 See info node `Coding Systems' and info node `Text and Binary' concerning
10708 such conversion.  */);
10709   inhibit_eol_conversion = 0;
10710
10711   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10712                doc: /*
10713 Non-nil means process buffer inherits coding system of process output.
10714 Bind it to t if the process output is to be treated as if it were a file
10715 read from some filesystem.  */);
10716   inherit_process_coding_system = 0;
10717
10718   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10719                doc: /*
10720 Alist to decide a coding system to use for a file I/O operation.
10721 The format is ((PATTERN . VAL) ...),
10722 where PATTERN is a regular expression matching a file name,
10723 VAL is a coding system, a cons of coding systems, or a function symbol.
10724 If VAL is a coding system, it is used for both decoding and encoding
10725 the file contents.
10726 If VAL is a cons of coding systems, the car part is used for decoding,
10727 and the cdr part is used for encoding.
10728 If VAL is a function symbol, the function must return a coding system
10729 or a cons of coding systems which are used as above.  The function is
10730 called with an argument that is a list of the arguments with which
10731 `find-operation-coding-system' was called.  If the function can't decide
10732 a coding system, it can return `undecided' so that the normal
10733 code-detection is performed.
10734
10735 See also the function `find-operation-coding-system'
10736 and the variable `auto-coding-alist'.  */);
10737   Vfile_coding_system_alist = Qnil;
10738
10739   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10740                doc: /*
10741 Alist to decide a coding system to use for a process I/O operation.
10742 The format is ((PATTERN . VAL) ...),
10743 where PATTERN is a regular expression matching a program name,
10744 VAL is a coding system, a cons of coding systems, or a function symbol.
10745 If VAL is a coding system, it is used for both decoding what received
10746 from the program and encoding what sent to the program.
10747 If VAL is a cons of coding systems, the car part is used for decoding,
10748 and the cdr part is used for encoding.
10749 If VAL is a function symbol, the function must return a coding system
10750 or a cons of coding systems which are used as above.
10751
10752 See also the function `find-operation-coding-system'.  */);
10753   Vprocess_coding_system_alist = Qnil;
10754
10755   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10756                doc: /*
10757 Alist to decide a coding system to use for a network I/O operation.
10758 The format is ((PATTERN . VAL) ...),
10759 where PATTERN is a regular expression matching a network service name
10760 or is a port number to connect to,
10761 VAL is a coding system, a cons of coding systems, or a function symbol.
10762 If VAL is a coding system, it is used for both decoding what received
10763 from the network stream and encoding what sent to the network stream.
10764 If VAL is a cons of coding systems, the car part is used for decoding,
10765 and the cdr part is used for encoding.
10766 If VAL is a function symbol, the function must return a coding system
10767 or a cons of coding systems which are used as above.
10768
10769 See also the function `find-operation-coding-system'.  */);
10770   Vnetwork_coding_system_alist = Qnil;
10771
10772   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10773                doc: /* Coding system to use with system messages.
10774 Also used for decoding keyboard input on X Window system.  */);
10775   Vlocale_coding_system = Qnil;
10776
10777   /* The eol mnemonics are reset in startup.el system-dependently.  */
10778   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10779                doc: /*
10780 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10781   eol_mnemonic_unix = make_pure_c_string (":");
10782
10783   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10784                doc: /*
10785 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10786   eol_mnemonic_dos = make_pure_c_string ("\\");
10787
10788   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10789                doc: /*
10790 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10791   eol_mnemonic_mac = make_pure_c_string ("/");
10792
10793   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10794                doc: /*
10795 *String displayed in mode line when end-of-line format is not yet determined.  */);
10796   eol_mnemonic_undecided = make_pure_c_string (":");
10797
10798   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10799                doc: /*
10800 *Non-nil enables character translation while encoding and decoding.  */);
10801   Venable_character_translation = Qt;
10802
10803   DEFVAR_LISP ("standard-translation-table-for-decode",
10804                &Vstandard_translation_table_for_decode,
10805                doc: /* Table for translating characters while decoding.  */);
10806   Vstandard_translation_table_for_decode = Qnil;
10807
10808   DEFVAR_LISP ("standard-translation-table-for-encode",
10809                &Vstandard_translation_table_for_encode,
10810                doc: /* Table for translating characters while encoding.  */);
10811   Vstandard_translation_table_for_encode = Qnil;
10812
10813   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10814                doc: /* Alist of charsets vs revision numbers.
10815 While encoding, if a charset (car part of an element) is found,
10816 designate it with the escape sequence identifying revision (cdr part
10817 of the element).  */);
10818   Vcharset_revision_table = Qnil;
10819
10820   DEFVAR_LISP ("default-process-coding-system",
10821                &Vdefault_process_coding_system,
10822                doc: /* Cons of coding systems used for process I/O by default.
10823 The car part is used for decoding a process output,
10824 the cdr part is used for encoding a text to be sent to a process.  */);
10825   Vdefault_process_coding_system = Qnil;
10826
10827   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10828                doc: /*
10829 Table of extra Latin codes in the range 128..159 (inclusive).
10830 This is a vector of length 256.
10831 If Nth element is non-nil, the existence of code N in a file
10832 \(or output of subprocess) doesn't prevent it to be detected as
10833 a coding system of ISO 2022 variant which has a flag
10834 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10835 or reading output of a subprocess.
10836 Only 128th through 159th elements have a meaning.  */);
10837   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10838
10839   DEFVAR_LISP ("select-safe-coding-system-function",
10840                &Vselect_safe_coding_system_function,
10841                doc: /*
10842 Function to call to select safe coding system for encoding a text.
10843
10844 If set, this function is called to force a user to select a proper
10845 coding system which can encode the text in the case that a default
10846 coding system used in each operation can't encode the text.  The
10847 function should take care that the buffer is not modified while
10848 the coding system is being selected.
10849
10850 The default value is `select-safe-coding-system' (which see).  */);
10851   Vselect_safe_coding_system_function = Qnil;
10852
10853   DEFVAR_BOOL ("coding-system-require-warning",
10854                &coding_system_require_warning,
10855                doc: /* Internal use only.
10856 If non-nil, on writing a file, `select-safe-coding-system-function' is
10857 called even if `coding-system-for-write' is non-nil.  The command
10858 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10859   coding_system_require_warning = 0;
10860
10861
10862   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10863                &inhibit_iso_escape_detection,
10864                doc: /*
10865 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10866
10867 When Emacs reads text, it tries to detect how the text is encoded.
10868 This code detection is sensitive to escape sequences.  If Emacs sees
10869 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10870 of the ISO2022 encodings, and decodes text by the corresponding coding
10871 system (e.g. `iso-2022-7bit').
10872
10873 However, there may be a case that you want to read escape sequences in
10874 a file as is.  In such a case, you can set this variable to non-nil.
10875 Then the code detection will ignore any escape sequences, and no text is
10876 detected as encoded in some ISO-2022 encoding.  The result is that all
10877 escape sequences become visible in a buffer.
10878
10879 The default value is nil, and it is strongly recommended not to change
10880 it.  That is because many Emacs Lisp source files that contain
10881 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10882 in Emacs's distribution, and they won't be decoded correctly on
10883 reading if you suppress escape sequence detection.
10884
10885 The other way to read escape sequences in a file without decoding is
10886 to explicitly specify some coding system that doesn't use ISO-2022
10887 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10888   inhibit_iso_escape_detection = 0;
10889
10890   DEFVAR_BOOL ("inhibit-null-byte-detection",
10891                &inhibit_null_byte_detection,
10892                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10893 By default, Emacs treats it as binary data, and does not attempt to
10894 decode it.  The effect is as if you specified `no-conversion' for
10895 reading that text.
10896
10897 Set this to non-nil when a regular text happens to include null bytes.
10898 Examples are Index nodes of Info files and null-byte delimited output
10899 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10900 decode text as usual.  */);
10901   inhibit_null_byte_detection = 0;
10902
10903   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10904                doc: /* Char table for translating self-inserting characters.
10905 This is applied to the result of input methods, not their input.
10906 See also `keyboard-translate-table'.
10907
10908 Use of this variable for character code unification was rendered
10909 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10910 internal character representation.  */);
10911     Vtranslation_table_for_input = Qnil;
10912
10913   {
10914     Lisp_Object args[coding_arg_max];
10915     Lisp_Object plist[16];
10916     int i;
10917
10918     for (i = 0; i < coding_arg_max; i++)
10919       args[i] = Qnil;
10920
10921     plist[0] = intern_c_string (":name");
10922     plist[1] = args[coding_arg_name] = Qno_conversion;
10923     plist[2] = intern_c_string (":mnemonic");
10924     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10925     plist[4] = intern_c_string (":coding-type");
10926     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10927     plist[6] = intern_c_string (":ascii-compatible-p");
10928     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10929     plist[8] = intern_c_string (":default-char");
10930     plist[9] = args[coding_arg_default_char] = make_number (0);
10931     plist[10] = intern_c_string (":for-unibyte");
10932     plist[11] = args[coding_arg_for_unibyte] = Qt;
10933     plist[12] = intern_c_string (":docstring");
10934     plist[13] = make_pure_c_string ("Do no conversion.\n\
10935 \n\
10936 When you visit a file with this coding, the file is read into a\n\
10937 unibyte buffer as is, thus each byte of a file is treated as a\n\
10938 character.");
10939     plist[14] = intern_c_string (":eol-type");
10940     plist[15] = args[coding_arg_eol_type] = Qunix;
10941     args[coding_arg_plist] = Flist (16, plist);
10942     Fdefine_coding_system_internal (coding_arg_max, args);
10943
10944     plist[1] = args[coding_arg_name] = Qundecided;
10945     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10946     plist[5] = args[coding_arg_coding_type] = Qundecided;
10947     /* This is already set.
10948        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10949     plist[8] = intern_c_string (":charset-list");
10950     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10951     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10952     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10953     plist[15] = args[coding_arg_eol_type] = Qnil;
10954     args[coding_arg_plist] = Flist (16, plist);
10955     Fdefine_coding_system_internal (coding_arg_max, args);
10956   }
10957
10958   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10959
10960   {
10961     int i;
10962
10963     for (i = 0; i < coding_category_max; i++)
10964       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10965   }
10966 #if defined (MSDOS) || defined (WINDOWSNT)
10967   system_eol_type = Qdos;
10968 #else
10969   system_eol_type = Qunix;
10970 #endif
10971   staticpro (&system_eol_type);
10972 }
10973
10974 char *
10975 emacs_strerror (error_number)
10976      int error_number;
10977 {
10978   char *str;
10979
10980   synchronize_system_messages_locale ();
10981   str = strerror (error_number);
10982
10983   if (! NILP (Vlocale_coding_system))
10984     {
10985       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10986                                                       Vlocale_coding_system,
10987                                                       0);
10988       str = (char *) SDATA (dec);
10989     }
10990
10991   return str;
10992 }
10993
10994 #endif /* emacs */
10995
10996 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10997    (do not change this comment) */