src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009, 2010
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  60   the C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking up
  63   Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we setup a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
 161   const unsigned char *src = coding->source;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;
 167   /* `...' stands for further locals; ONE_MORE_BYTE also needs C and SRC_BASE.  */
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the source is exhausted, jump
 171          to no_more_source:.  */
 172       ONE_MORE_BYTE (c);
 173       /* A nonconforming byte rejects XXX; a strongly suggestive one is evidence for it.  */
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
 176       if (__C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source was exhausted without finding an invalid byte.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
 195   CODING->charbuf_size.
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers the start position in source in each loop.
 212      The loop will be exited when there's not enough source code, or
 213      when there's no room in CHARBUF for a decoded character.  */
 214   const unsigned char *src_base;
 215   /* A buffer to produce decoded characters.  */
 216   int *charbuf = coding->charbuf + coding->charbuf_used;
 217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 218   int multibytep = coding->src_multibyte;
 219   /* Decode one character per iteration until the source or CHARBUF runs out.  */
 220   while (1)
 221     {
 222       src_base = src;
 223       if (charbuf >= charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it. */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && coding->mode & CODING_MODE_LAST_BLOCK)
 233     /* If the source ends by partial bytes to construct a character,
 234        treat them as eight-bit raw data.  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  If the
 238      source is multibyte, the bytes and chars are not identical.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
 257   DST_BYTES zero means that source area and destination area are
 258   overlapped, which means that we can produce an encoded text until it
 259   reaches the head of the not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
 267   int multibytep = coding->dst_multibyte;
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274   /* Stop early enough that one iteration cannot write past DST_END.  */
 275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 276     {
 277       int c = *charbuf;
 278       /* Encode C into DST, and increment DST.  */
 279     }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292 #include <setjmp.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302 #include "frame.h"
 303 #include "termhooks.h"
 304
 305 Lisp_Object Vcoding_system_hash_table;  /* NOTE(review): presumably maps a coding system symbol to its attribute vector -- confirm.  */
 306
 307 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 308 Lisp_Object Qunix, Qdos;  /* Possible values of system_eol_type.  */
 309 extern Lisp_Object Qmac;        /* frame.c */
 310 Lisp_Object Qbuffer_file_coding_system;
 311 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 315 Lisp_Object Qbig, Qlittle;
 316 Lisp_Object Qcoding_system_history;
 317 Lisp_Object Qvalid_codes;
 318 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 319 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 320 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 321 Lisp_Object QCascii_compatible_p;
 322
 323 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 324 Lisp_Object Qcall_process, Qcall_process_region;
 325 Lisp_Object Qstart_process, Qopen_network_stream;
 326 Lisp_Object Qtarget_idx;
 327
 328 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 329 Lisp_Object Qinterrupted, Qinsufficient_memory;
 330
 331 extern Lisp_Object Qcompletion_ignore_case;
 332
 333 /* If a symbol has this property, evaluate the value to define the
 334    symbol as a coding system.  */
 335 static Lisp_Object Qcoding_system_define_form;
 336
 337 int coding_system_require_warning;
 338
 339 Lisp_Object Vselect_safe_coding_system_function;
 340
 341 /* Mnemonic string for each format of end-of-line.  */
 342 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 343 /* Mnemonic string to indicate that the format of end-of-line is not
 344    yet decided.  */
 345 Lisp_Object eol_mnemonic_undecided;
 346
 347 /* Format of end-of-line decided by the system.  This is Qunix on
 348    Unix and Mac, Qdos on DOS/Windows.
 349    This has an effect only for external encoding (i.e. for output to
 350    file and process), not for in-buffer or Lisp string encoding.  */
 351 static Lisp_Object system_eol_type;
 352
 353 #ifdef emacs
 354
 355 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 356
 357 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 358
 359 /* Coding systems emacs-mule and raw-text are for converting only
 360    the end-of-line format.  */
 361 Lisp_Object Qemacs_mule, Qraw_text;
 362 Lisp_Object Qutf_8_emacs;
 363
 364 /* Coding systems are passed between Emacs Lisp programs and C internal
 365    routines by the following three variables.  */
 366 /* Coding system for reading files and receiving data from a process.  */
 367 Lisp_Object Vcoding_system_for_read;
 368 /* Coding system for writing files and sending data to a process.  */
 369 Lisp_Object Vcoding_system_for_write;
 370 /* Coding system actually used in the latest I/O.  */
 371 Lisp_Object Vlast_coding_system_used;
 372 /* Set to non-nil when an error is detected during code conversion.  */
 373 Lisp_Object Vlast_code_conversion_error;
 374 /* A vector of length 256 which contains information about special
 375    Latin codes (especially for dealing with Microsoft codes).  */
 376 Lisp_Object Vlatin_extra_code_table;
 377
 378 /* Flag to inhibit code conversion of end-of-line format.  */
 379 int inhibit_eol_conversion;
 380
 381 /* Flag to inhibit ISO2022 escape sequence detection.  */
 382 int inhibit_iso_escape_detection;
 383
 384 /* Flag to inhibit detection of binary files through null bytes.  */
 385 int inhibit_null_byte_detection;
 386
 387 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 388 int inherit_process_coding_system;
 389
 390 /* Coding system to be used to encode text for terminal display when
 391    the terminal coding system is nil.  */
 392 struct coding_system safe_terminal_coding;
 393
 394 Lisp_Object Vfile_coding_system_alist;
 395 Lisp_Object Vprocess_coding_system_alist;
 396 Lisp_Object Vnetwork_coding_system_alist;
 397
 398 Lisp_Object Vlocale_coding_system;
 399
 400 #endif /* emacs */
 401
 402 /* Flag to tell whether translation tables are looked up during
 403    character code conversion.  */
 404 Lisp_Object Venable_character_translation;
 405 /* Standard translation table to look up on decoding (reading).  */
 406 Lisp_Object Vstandard_translation_table_for_decode;
 407 /* Standard translation table to look up on encoding (writing).  */
 408 Lisp_Object Vstandard_translation_table_for_encode;
 409
 410 Lisp_Object Qtranslation_table;
 411 Lisp_Object Qtranslation_table_id;
 412 Lisp_Object Qtranslation_table_for_decode;
 413 Lisp_Object Qtranslation_table_for_encode;
 414
 415 /* Alist mapping charsets to revision numbers.  */
 416 static Lisp_Object Vcharset_revision_table;
 417
 418 /* Default coding systems used for process I/O.  */
 419 Lisp_Object Vdefault_process_coding_system;
 420
 421 /* Char table for translating Quail and self-inserting input.  */
 422 Lisp_Object Vtranslation_table_for_input;
 423
 424 /* Two special coding systems (Shift-JIS and BIG5, going by the names).  */
 425 Lisp_Object Vsjis_coding_system;
 426 Lisp_Object Vbig5_coding_system;
 427
 428 /* ISO2022 section */
 429 /* Integer at index REG of the coding_attr_iso_initial attribute of CODING.  */
 430 #define CODING_ISO_INITIAL(coding, reg)                 \
 431   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 432                      coding_attr_iso_initial),          \
 433                reg)))
 434
 435 /* safe_charsets[CHARSET_ID] of CODING, or -1 if CHARSET_ID exceeds max_charset_id or that entry is 255.  */
 436 #define CODING_ISO_REQUEST(coding, charset_id)          \
 437   (((charset_id) <= (coding)->max_charset_id            \
 438     ? ((coding)->safe_charsets[charset_id] != 255       \
 439        ? (coding)->safe_charsets[charset_id]            \
 440        : -1)                                            \
 441     : -1))
 442
 443 /* Accessors for the members of CODING->spec.iso_2022.  */
 444 #define CODING_ISO_FLAGS(coding)        \
 445   ((coding)->spec.iso_2022.flags)
 446 #define CODING_ISO_DESIGNATION(coding, reg)     \
 447   ((coding)->spec.iso_2022.current_designation[reg])
 448 #define CODING_ISO_INVOCATION(coding, plane)    \
 449   ((coding)->spec.iso_2022.current_invocation[plane])
 450 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 451   ((coding)->spec.iso_2022.single_shifting)
 452 #define CODING_ISO_BOL(coding)  \
 453   ((coding)->spec.iso_2022.bol)
 454 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 455   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 456 #define CODING_ISO_CMP_STATUS(coding)   \
 457   (&(coding)->spec.iso_2022.cmp_status)
 458 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 459   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 460 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 461   ((coding)->spec.iso_2022.embedded_utf_8)
 462
 463 /* Control characters of ISO2022.  */
 464                         /* code */      /* function */
 465 #define ISO_CODE_LF     0x0A            /* line-feed */
 466 #define ISO_CODE_CR     0x0D            /* carriage-return */
 467 #define ISO_CODE_SO     0x0E            /* shift-out */
 468 #define ISO_CODE_SI     0x0F            /* shift-in */
 469 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 470 #define ISO_CODE_ESC    0x1B            /* escape */
 471 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 472 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 473 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 474
 475 /* Every (1-byte) code of ISO2022 is classified into one of the
 476    following classes.  */
 477 enum iso_code_class_type
 478   {
 479     ISO_control_0,              /* Control codes in the range
 480                                    0x00..0x1F and 0x7F, except for the
 481                                    following 4 codes.  */
 482     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 483     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 484     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 485     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 486     ISO_control_1,              /* Control codes in the range
 487                                    0x80..0x9F, except for the
 488                                    following 3 codes.  */
 489     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 490     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 491     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 492     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 493     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 494     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 495     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 496   };
 497
 498 /** Each CODING_ISO_FLAG_XXX macro defines a flag bit of the
 499     `iso-flags' attribute of an iso2022 coding system.  */
 500
 501 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 502    instead of the correct short-form sequence (e.g. ESC $ A).  */
 503 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 504
 505 /* If set, reset graphic planes and registers at end-of-line to the
 506    initial state.  */
 507 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 508
 509 /* If set, reset graphic planes and registers before any control
 510    characters to the initial state.  */
 511 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 512
 513 /* If set, encode by 7-bit environment.  */
 514 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 515
 516 /* If set, use locking-shift function.  */
 517 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 518
 519 /* If set, use single-shift function.  Overrides
 520    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 521 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 522
 523 /* If set, use designation escape sequence.  */
 524 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 525
 526 /* If set, produce revision number sequence.  */
 527 #define CODING_ISO_FLAG_REVISION        0x0080
 528
 529 /* If set, produce ISO6429's direction specifying sequence.  */
 530 #define CODING_ISO_FLAG_DIRECTION       0x0100
 531
 532 /* If set, assume designation states are reset at beginning of line on
 533    output.  */
 534 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 535
 536 /* If set, designation sequence should be placed at beginning of line
 537    on output.  */
 538 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 539
 540 /* If set, do not encode unsafe characters on output.  */
 541 #define CODING_ISO_FLAG_SAFE            0x0800
 542
 543 /* If set, extra latin codes (128..159) are accepted as a valid code
 544    on input.  */
 545 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 546 /* The remaining flags are not described here.  */
 547 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 548
 549 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 550
 551 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 552
 553 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 554 /* NOTE(review): bits 0x20000..0x80000 are left unassigned before this flag.  */
 555 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 556
 557 /* A character to be produced on output if encoding of the original
 558    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 559 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 560
 561 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 562 #define CODING_UTF_8_BOM(coding)        \
 563   ((coding)->spec.utf_8_bom)
 564
 565 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 566 #define CODING_UTF_16_BOM(coding)       \
 567   ((coding)->spec.utf_16.bom)
 568
 569 #define CODING_UTF_16_ENDIAN(coding)    \
 570   ((coding)->spec.utf_16.endian)
 571
 572 #define CODING_UTF_16_SURROGATE(coding) \
 573   ((coding)->spec.utf_16.surrogate)
 574
 575
 576 /* CCL section: accessors for the CCL entries of the attribute vector of CODING.  */
 577 #define CODING_CCL_DECODER(coding)      \
 578   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 579 #define CODING_CCL_ENCODER(coding)      \
 580   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 581 #define CODING_CCL_VALIDS(coding)                                          \
 582   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 583
 584 /* Index for each coding category in `coding_categories'.  */
 585 /* Each value is also the bit position of the matching CATEGORY_MASK_XXX.  */
 586 enum coding_category
 587   {
 588     coding_category_iso_7,
 589     coding_category_iso_7_tight,
 590     coding_category_iso_8_1,
 591     coding_category_iso_8_2,
 592     coding_category_iso_7_else,
 593     coding_category_iso_8_else,
 594     coding_category_utf_8_auto,
 595     coding_category_utf_8_nosig,
 596     coding_category_utf_8_sig,
 597     coding_category_utf_16_auto,
 598     coding_category_utf_16_be,
 599     coding_category_utf_16_le,
 600     coding_category_utf_16_be_nosig,
 601     coding_category_utf_16_le_nosig,
 602     coding_category_charset,
 603     coding_category_sjis,
 604     coding_category_big5,
 605     coding_category_ccl,
 606     coding_category_emacs_mule,
 607     /* All above are targets of code detection.  */
 608     coding_category_raw_text,
 609     coding_category_undecided,
 610     coding_category_max
 611   };
 612
 613 /* Definitions of flag bits used in detect_coding_XXXX.  Bit N stands for value N of enum coding_category.  */
 614 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 615 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 616 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 617 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 618 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 619 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 620 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 621 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 622 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 623 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 624 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 625 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 626 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 627 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 628 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 629 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 630 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 631 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 632 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 633 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 634
 635 /* This value is returned if detect_coding_mask () finds nothing other
 636    than ASCII characters.  It does not include CATEGORY_MASK_RAW_TEXT.  */
 637 #define CATEGORY_MASK_ANY               \
 638   (CATEGORY_MASK_ISO_7                  \
 639    | CATEGORY_MASK_ISO_7_TIGHT          \
 640    | CATEGORY_MASK_ISO_8_1              \
 641    | CATEGORY_MASK_ISO_8_2              \
 642    | CATEGORY_MASK_ISO_7_ELSE           \
 643    | CATEGORY_MASK_ISO_8_ELSE           \
 644    | CATEGORY_MASK_UTF_8_AUTO           \
 645    | CATEGORY_MASK_UTF_8_NOSIG          \
 646    | CATEGORY_MASK_UTF_8_SIG            \
 647    | CATEGORY_MASK_UTF_16_AUTO          \
 648    | CATEGORY_MASK_UTF_16_BE            \
 649    | CATEGORY_MASK_UTF_16_LE            \
 650    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 651    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 652    | CATEGORY_MASK_CHARSET              \
 653    | CATEGORY_MASK_SJIS                 \
 654    | CATEGORY_MASK_BIG5                 \
 655    | CATEGORY_MASK_CCL                  \
 656    | CATEGORY_MASK_EMACS_MULE)
 657
 658 /* Unions of related category masks.  */
 659 #define CATEGORY_MASK_ISO_7BIT \
 660   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 661
 662 #define CATEGORY_MASK_ISO_8BIT \
 663   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 664
 665 #define CATEGORY_MASK_ISO_ELSE \
 666   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 667 /* All ISO categories except ISO_8_1 and ISO_8_2.  */
 668 #define CATEGORY_MASK_ISO_ESCAPE        \
 669   (CATEGORY_MASK_ISO_7                  \
 670    | CATEGORY_MASK_ISO_7_TIGHT          \
 671    | CATEGORY_MASK_ISO_7_ELSE           \
 672    | CATEGORY_MASK_ISO_8_ELSE)
 673
 674 #define CATEGORY_MASK_ISO       \
 675   (  CATEGORY_MASK_ISO_7BIT     \
 676      | CATEGORY_MASK_ISO_8BIT   \
 677      | CATEGORY_MASK_ISO_ELSE)
 678
 679 #define CATEGORY_MASK_UTF_16            \
 680   (CATEGORY_MASK_UTF_16_AUTO            \
 681    | CATEGORY_MASK_UTF_16_BE            \
 682    | CATEGORY_MASK_UTF_16_LE            \
 683    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 684    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 685
 686 #define CATEGORY_MASK_UTF_8     \
 687   (CATEGORY_MASK_UTF_8_AUTO     \
 688    | CATEGORY_MASK_UTF_8_NOSIG  \
 689    | CATEGORY_MASK_UTF_8_SIG)
 690
 691 /* List of symbols `coding-category-xxx' ordered by priority.  This
 692    variable is exposed to Emacs Lisp.  */
 693 static Lisp_Object Vcoding_category_list;
 694
 695 /* Table of coding categories (Lisp symbols).  This variable is for
 696    internal use only.  */
 697 static Lisp_Object Vcoding_category_table;
 698
 699 /* Table of coding categories ordered by priority.  */
 700 static enum coding_category coding_priorities[coding_category_max];
 701
 702 /* Nth element is a coding context for the coding system bound to the
 703    Nth coding category.  */
 704 static struct coding_system coding_categories[coding_category_max];
 705
 706 /*** Commonly used macros and functions ***/
 707 /* min and max are defined only if not already defined; each evaluates one argument twice.  */
 708 #ifndef min
 709 #define min(a, b) ((a) < (b) ? (a) : (b))
 710 #endif
 711 #ifndef max
 712 #define max(a, b) ((a) > (b) ? (a) : (b))
 713 #endif
 714 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, and CHARSET_LIST to the charset list of ATTRS.  */
 715 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 716   do {                                                  \
 717     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 718     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 719   } while (0)
 720
 721
 722 /* Safely get one byte from the source text pointed by SRC which ends
 723    at SRC_END, and set C to that byte.  If the source is exhausted, jump
 724    to `no_more_source', recording CODING_RESULT_INSUFFICIENT_SRC first if
 725    SRC_BASE < SRC.  If multibytep is nonzero and the byte has bit 0x80 set:
 726    a lead byte 0xC0 or 0xC1 is merged with the next byte into one value
 727    (NOTE(review): that next byte is read without checking SRC_END);
 728    otherwise C is set to minus the character code from string_char and
 729    CODING_RESULT_INVALID_SRC is recorded.  CONSUMED_CHARS is incremented.  Caller must have in scope: src, src_end, src_base, multibytep, coding, consumed_chars.  */
 730 #define ONE_MORE_BYTE(c)                                \
 731   do {                                                  \
 732     if (src == src_end)                                 \
 733       {                                                 \
 734         if (src_base < src)                             \
 735           record_conversion_result                      \
 736             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 737         goto no_more_source;                            \
 738       }                                                 \
 739     c = *src++;                                         \
 740     if (multibytep && (c & 0x80))                       \
 741       {                                                 \
 742         if ((c & 0xFE) == 0xC0)                         \
 743           c = ((c & 1) << 6) | *src++;                  \
 744         else                                            \
 745           {                                             \
 746             src--;                                      \
 747             c = - string_char (src, &src, NULL);        \
 748             record_conversion_result                    \
 749               (coding, CODING_RESULT_INVALID_SRC);      \
 750           }                                             \
 751       }                                                 \
 752     consumed_chars++;                                   \
 753   } while (0)
 754
 755 /* Safely get two bytes from the source text pointed by SRC which ends
 756    at SRC_END, and set C1 and C2 to those bytes while skipping the
 757    heading multibyte characters.  If there are not enough bytes in the
 758    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 759    a multibyte character is found for C2, set C2 to -1 (only its lead
 760    byte is consumed).  A lead byte 0xC0 or 0xC1 is instead merged with
 761    the byte after it into a single value.  The caller should declare
 762    and set these variables appropriately in advance:
 763         src, src_end, multibytep
 764    It is intended that this macro is used in detect_coding_utf_16.  */
 765 #define TWO_MORE_BYTES(c1, c2)                          \
 766   do {                                                  \
 767     do {                                                \
 768       if (src == src_end)                               \
 769         goto no_more_source;                            \
 770       c1 = *src++;                                      \
 771       if (multibytep && (c1 & 0x80))                    \
 772         {                                               \
 773           if ((c1 & 0xFE) == 0xC0)                      \
 774             c1 = ((c1 & 1) << 6) | *src++;              \
 775           else                                          \
 776             {                                           \
 777               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 778               c1 = -1;                                  \
 779             }                                           \
 780         }                                               \
 781     } while (c1 < 0);                                   \
 782     if (src == src_end)                                 \
 783       goto no_more_source;                              \
 784     c2 = *src++;                                        \
 785     if (multibytep && (c2 & 0x80))                      \
 786       {                                                 \
 787         if ((c2 & 0xFE) == 0xC0)                        \
 788           c2 = ((c2 & 1) << 6) | *src++;                \
 789         else                                            \
 790           c2 = -1;                                      \
 791       }                                                 \
 792   } while (0)
 793
 794
     /* Fetch one byte from SRC into C without first comparing SRC against
        SRC_END, and increment consumed_chars.  The byte is otherwise
        handled like the tail of ONE_MORE_BYTE: if multibytep is nonzero
        and the byte has its high bit set, a leading byte of 0xC0 or 0xC1
        is merged with the next byte as ((c & 1) << 6) | next; for any
        other leading byte SRC is moved back by one, the character is read
        with string_char (which is handed &src, presumably to advance it),
        C is set to the negated result, and CODING_RESULT_INVALID_SRC is
        recorded.  The caller must have `src', `multibytep', `coding' and
        `consumed_chars' in scope.  */
 795 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 796   do {                                                  \
 797     c = *src++;                                         \
 798     if (multibytep && (c & 0x80))                       \
 799       {                                                 \
 800         if ((c & 0xFE) == 0xC0)                         \
 801           c = ((c & 1) << 6) | *src++;                  \
 802         else                                            \
 803           {                                             \
 804             src--;                                      \
 805             c = - string_char (src, &src, NULL);        \
 806             record_conversion_result                    \
 807               (coding, CODING_RESULT_INVALID_SRC);      \
 808           }                                             \
 809       }                                                 \
 810     consumed_chars++;                                   \
 811   } while (0)
 812
 813
 814 /* Store a byte C in the place pointed by DST and increment DST to the
 815    next free point, and increment PRODUCED_CHARS.  The caller should
 816    assure that C is 0..127, and declare and set the variable `dst'
 817    appropriately in advance.
 818 */
 819
 820
 821 #define EMIT_ONE_ASCII_BYTE(c)  \
 822   do {                          \
 823     produced_chars++;           \
 824     *dst++ = (c);               \
 825   } while (0)
 826
 827
 828 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 829
 830 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 831   do {                                  \
 832     produced_chars += 2;                \
 833     *dst++ = (c1), *dst++ = (c2);       \
 834   } while (0)
 835
 836
 837 /* Store a byte C in the place pointed by DST and increment DST to the
 838    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 839    nonzero, store in an appropriate multibyte form.  The caller should
 840    declare and set the variables `dst' and `multibytep' appropriately
 841    in advance.
     
        In the multibyte case a value of 0x80 or more is first mapped with
        BYTE8_TO_CHAR and the result is written with CHAR_STRING_ADVANCE
        (which presumably advances DST by the length of the multibyte
        form), while PRODUCED_CHARS still grows by exactly one.  C is
        evaluated once in whichever branch is taken.  */
 842
 843 #define EMIT_ONE_BYTE(c)                \
 844   do {                                  \
 845     produced_chars++;                   \
 846     if (multibytep)                     \
 847       {                                 \
 848         int ch = (c);                   \
 849         if (ch >= 0x80)                 \
 850           ch = BYTE8_TO_CHAR (ch);      \
 851         CHAR_STRING_ADVANCE (ch, dst);  \
 852       }                                 \
 853     else                                \
 854       *dst++ = (c);                     \
 855   } while (0)
 856
 857
 858 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  C1 is stored
        before C2 and PRODUCED_CHARS grows by two.  In the multibyte case
        each value of 0x80 or more goes through BYTE8_TO_CHAR before being
        written with CHAR_STRING_ADVANCE.  */
 859
 860 #define EMIT_TWO_BYTES(c1, c2)          \
 861   do {                                  \
 862     produced_chars += 2;                \
 863     if (multibytep)                     \
 864       {                                 \
 865         int ch;                         \
 866                                         \
 867         ch = (c1);                      \
 868         if (ch >= 0x80)                 \
 869           ch = BYTE8_TO_CHAR (ch);      \
 870         CHAR_STRING_ADVANCE (ch, dst);  \
 871         ch = (c2);                      \
 872         if (ch >= 0x80)                 \
 873           ch = BYTE8_TO_CHAR (ch);      \
 874         CHAR_STRING_ADVANCE (ch, dst);  \
 875       }                                 \
 876     else                                \
 877       {                                 \
 878         *dst++ = (c1);                  \
 879         *dst++ = (c2);                  \
 880       }                                 \
 881   } while (0)
 882
 883
 884 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 885   do {                                  \
 886     EMIT_ONE_BYTE (c1);                 \
 887     EMIT_TWO_BYTES (c2, c3);            \
 888   } while (0)
 889
 890
 891 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 892   do {                                          \
 893     EMIT_TWO_BYTES (c1, c2);                    \
 894     EMIT_TWO_BYTES (c3, c4);                    \
 895   } while (0)
 896
 897
 898 /* Prototypes for static functions.  The detect_coding_XXX,
        decode_coding_XXX and encode_coding_XXX declarations come first,
        grouped per coding family (raw-text has no detector); they are
        followed by the source/destination bookkeeping routines and the
        remaining conversion helpers.  */
 899 static void record_conversion_result P_ ((struct coding_system *coding,
 900                                           enum coding_result_code result));
 901 static int detect_coding_utf_8 P_ ((struct coding_system *,
 902                                     struct coding_detection_info *info));
 903 static void decode_coding_utf_8 P_ ((struct coding_system *));
 904 static int encode_coding_utf_8 P_ ((struct coding_system *));
 905
 906 static int detect_coding_utf_16 P_ ((struct coding_system *,
 907                                      struct coding_detection_info *info));
 908 static void decode_coding_utf_16 P_ ((struct coding_system *));
 909 static int encode_coding_utf_16 P_ ((struct coding_system *));
 910
 911 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 912                                        struct coding_detection_info *info));
 913 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 914 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 915
 916 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 917                                          struct coding_detection_info *info));
 918 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 919 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 920
 921 static int detect_coding_sjis P_ ((struct coding_system *,
 922                                    struct coding_detection_info *info));
 923 static void decode_coding_sjis P_ ((struct coding_system *));
 924 static int encode_coding_sjis P_ ((struct coding_system *));
 925
 926 static int detect_coding_big5 P_ ((struct coding_system *,
 927                                    struct coding_detection_info *info));
 928 static void decode_coding_big5 P_ ((struct coding_system *));
 929 static int encode_coding_big5 P_ ((struct coding_system *));
 930
 931 static int detect_coding_ccl P_ ((struct coding_system *,
 932                                   struct coding_detection_info *info));
 933 static void decode_coding_ccl P_ ((struct coding_system *));
 934 static int encode_coding_ccl P_ ((struct coding_system *));
 935
 936 static void decode_coding_raw_text P_ ((struct coding_system *));
 937 static int encode_coding_raw_text P_ ((struct coding_system *));
 938
 939 static void coding_set_source P_ ((struct coding_system *));
 940 static void coding_set_destination P_ ((struct coding_system *));
 941 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 942 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 943                                             EMACS_INT, EMACS_INT));
 944 static unsigned char *alloc_destination P_ ((struct coding_system *,
 945                                              EMACS_INT, unsigned char *));
 946 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 947 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 948                                                      int *, int *,
 949                                                      unsigned char *));
 950 static int detect_eol P_ ((const unsigned char *,
 951                            EMACS_INT, enum coding_category));
 952 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 953 static void decode_eol P_ ((struct coding_system *));
 954 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 955 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
 956 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 957 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 958                                         EMACS_INT));
 959 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 960 static int decode_coding P_ ((struct coding_system *));
 961 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 962                                                       struct coding_system *,
 963                                                       int *, EMACS_INT *));
 964 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 965                                                   struct coding_system *,
 966                                                   int *, EMACS_INT *));
 967 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 968 static int encode_coding P_ ((struct coding_system *));
 969 static Lisp_Object make_conversion_work_buffer P_ ((int));
 970 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 971 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 972 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 973
 974 static void
 975 record_conversion_result (struct coding_system *coding,
 976                           enum coding_result_code result)
 977 {
 978   coding->result = result;
 979   switch (result)
 980     {
 981     case CODING_RESULT_INSUFFICIENT_SRC:
 982       Vlast_code_conversion_error = Qinsufficient_source;
 983       break;
 984     case CODING_RESULT_INCONSISTENT_EOL:
 985       Vlast_code_conversion_error = Qinconsistent_eol;
 986       break;
 987     case CODING_RESULT_INVALID_SRC:
 988       Vlast_code_conversion_error = Qinvalid_source;
 989       break;
 990     case CODING_RESULT_INTERRUPT:
 991       Vlast_code_conversion_error = Qinterrupted;
 992       break;
 993     case CODING_RESULT_INSUFFICIENT_MEM:
 994       Vlast_code_conversion_error = Qinsufficient_memory;
 995       break;
 996     case CODING_RESULT_INSUFFICIENT_DST:
 997       /* Don't record this error in Vlast_code_conversion_error
 998          because it happens just temporarily and is resolved when the
 999          whole conversion is finished.  */
1000       break;
1001     case CODING_RESULT_SUCCESS:
1002       break;
1003     default:
1004       Vlast_code_conversion_error = intern ("Unknown error");
1005     }
1006 }
1007
1008 /* This wrapper macro is used to preserve validity of pointers into
1009    buffer text across calls to decode_char, which could cause
1010    relocation of buffers if it loads a charset map, because loading a
1011    charset map allocates large structures.
     
        C receives DECODE_CHAR (CHARSET, CODE).  charset_map_loaded is
        cleared before that call; if it is nonzero afterwards,
        coding_set_source recomputes CODING->source, and SRC, SRC_BASE
        and SRC_END are shifted by the distance CODING->source moved.  */
1012 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
1013   do {                                                                       \
1014     charset_map_loaded = 0;                                                  \
1015     c = DECODE_CHAR (charset, code);                                         \
1016     if (charset_map_loaded)                                                  \
1017       {                                                                      \
1018         const unsigned char *orig = coding->source;                          \
1019         EMACS_INT offset;                                                    \
1020                                                                              \
1021         coding_set_source (coding);                                          \
1022         offset = coding->source - orig;                                      \
1023         src += offset;                                                       \
1024         src_base += offset;                                                  \
1025         src_end += offset;                                                   \
1026       }                                                                      \
1027   } while (0)
1028
1029
1030 /* If there are not at least BYTES length of room at dst, allocate
1031    memory for coding->destination and update dst and dst_end.  We don't
1032    have to take care of coding->source which will be relocated.  It is
1033    handled by calling coding_set_source in encode_coding.
     
        The test is `dst + BYTES >= dst_end', and the amount requested from
        alloc_destination is BYTES plus the number of elements remaining
        between charbuf and charbuf_end.  The caller must have `coding',
        `dst', `dst_end', `charbuf' and `charbuf_end' in scope.  */
1034
1035 #define ASSURE_DESTINATION(bytes)                               \
1036   do {                                                          \
1037     if (dst + (bytes) >= dst_end)                               \
1038       {                                                         \
1039         int more_bytes = charbuf_end - charbuf + (bytes);       \
1040                                                                 \
1041         dst = alloc_destination (coding, more_bytes, dst);      \
1042         dst_end = coding->destination + coding->dst_bytes;      \
1043       }                                                         \
1044   } while (0)
1045
1046
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   C is parenthesized at every use, so an argument built from
   low-precedence operators (e.g. `hi | lo') is encoded as a whole.
   Both C and P are still evaluated more than once, so neither may
   have side effects.  A code above MAX_5_BYTE_CHAR is handed to
   BYTE8_STRING after subtracting 0x3FFF80, and P advances by the
   value BYTE8_STRING returns.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, (p));        \
  } while (0)
1076
1077
1078 /* Return the character code of character whose multibyte form is at
1079    P, and advance P to the end of the multibyte form.  This is like
1080    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.
     
        The length (1 to 5 bytes) is chosen from the high bits of the
        leading byte, and P is advanced before the bytes are combined, which
        is why the bytes are addressed with negative indices.  A two-byte
        form whose leading byte is below 0xC2 additionally gets 0x3FFF80
        OR'ed into the result.  The trailing bytes are not validated.  P is
        evaluated several times, so it must be an lvalue without side
        effects.  */
1081
1082 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
1083   (!((p)[0] & 0x80)                                             \
1084    ? *(p)++                                                     \
1085    : ! ((p)[0] & 0x20)                                          \
1086    ? ((p) += 2,                                                 \
1087       ((((p)[-2] & 0x1F) << 6)                                  \
1088        | ((p)[-1] & 0x3F)                                       \
1089        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
1090    : ! ((p)[0] & 0x10)                                          \
1091    ? ((p) += 3,                                                 \
1092       ((((p)[-3] & 0x0F) << 12)                                 \
1093        | (((p)[-2] & 0x3F) << 6)                                \
1094        | ((p)[-1] & 0x3F)))                                     \
1095    : ! ((p)[0] & 0x08)                                          \
1096    ? ((p) += 4,                                                 \
1097       ((((p)[-4] & 0xF) << 18)                                  \
1098        | (((p)[-3] & 0x3F) << 12)                               \
1099        | (((p)[-2] & 0x3F) << 6)                                \
1100        | ((p)[-1] & 0x3F)))                                     \
1101    : ((p) += 5,                                                 \
1102       ((((p)[-4] & 0x3F) << 18)                                 \
1103        | (((p)[-3] & 0x3F) << 12)                               \
1104        | (((p)[-2] & 0x3F) << 6)                                \
1105        | ((p)[-1] & 0x3F))))
1106
1107
1108 static void
1109 coding_set_source (coding)
1110      struct coding_system *coding;
1111 {
1112   if (BUFFERP (coding->src_object))
1113     {
1114       struct buffer *buf = XBUFFER (coding->src_object);
1115
1116       if (coding->src_pos < 0)
1117         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1118       else
1119         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1120     }
1121   else if (STRINGP (coding->src_object))
1122     {
1123       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1124     }
1125   else
1126     /* Otherwise, the source is C string and is never relocated
1127        automatically.  Thus we don't have to update anything.  */
1128     ;
1129 }
1130
1131 static void
1132 coding_set_destination (coding)
1133      struct coding_system *coding;
1134 {
1135   if (BUFFERP (coding->dst_object))
1136     {
1137       if (coding->src_pos < 0)
1138         {
1139           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1140           coding->dst_bytes = (GAP_END_ADDR
1141                                - (coding->src_bytes - coding->consumed)
1142                                - coding->destination);
1143         }
1144       else
1145         {
1146           /* We are sure that coding->dst_pos_byte is before the gap
1147              of the buffer. */
1148           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1149                                  + coding->dst_pos_byte - BEG_BYTE);
1150           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1151                                - coding->destination);
1152         }
1153     }
1154   else
1155     /* Otherwise, the destination is C string and is never relocated
1156        automatically.  Thus we don't have to update anything.  */
1157     ;
1158 }
1159
1160
1161 static void
1162 coding_alloc_by_realloc (coding, bytes)
1163      struct coding_system *coding;
1164      EMACS_INT bytes;
1165 {
1166   coding->destination = (unsigned char *) xrealloc (coding->destination,
1167                                                     coding->dst_bytes + bytes);
1168   coding->dst_bytes += bytes;
1169 }
1170
     /* Call make_gap (BYTES) for the destination buffer of CODING.
        GAP_HEAD_USED is the number of bytes at the head of the gap that
        already hold produced data (alloc_destination passes
        dst - BUF_GPT_ADDR).  When source and destination are the same
        object the current buffer's gap bookkeeping is adjusted around the
        call; otherwise the destination buffer is made current just for
        the call and the previous current buffer is restored.  */
1171 static void
1172 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1173      struct coding_system *coding;
1174      EMACS_INT gap_head_used, bytes;
1175 {
1176   if (EQ (coding->src_object, coding->dst_object))
1177     {
1178       /* The gap may contain the produced data at the head and not-yet
1179          consumed data at the tail.  To preserve those data, we at
1180          first make the gap size to zero, then increase the gap
1181          size.  */
1182       EMACS_INT add = GAP_SIZE;
1183
           /* Pretend the whole gap is text: move the gap start past the
              produced data and fold the old gap size into Z/ZV.  The two
              lines after make_gap undo exactly these adjustments.  */
1184       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1185       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1186       make_gap (bytes);
1187       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1188       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1189     }
1190   else
1191     {
1192       Lisp_Object this_buffer;
1193
1194       this_buffer = Fcurrent_buffer ();
1195       set_buffer_internal (XBUFFER (coding->dst_object));
1196       make_gap (bytes);
1197       set_buffer_internal (XBUFFER (this_buffer));
1198     }
1199 }
1200
1201
1202 static unsigned char *
1203 alloc_destination (coding, nbytes, dst)
1204      struct coding_system *coding;
1205      EMACS_INT nbytes;
1206      unsigned char *dst;
1207 {
1208   EMACS_INT offset = dst - coding->destination;
1209
1210   if (BUFFERP (coding->dst_object))
1211     {
1212       struct buffer *buf = XBUFFER (coding->dst_object);
1213
1214       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1215     }
1216   else
1217     coding_alloc_by_realloc (coding, nbytes);
1218   coding_set_destination (coding);
1219   dst = coding->destination + offset;
1220   return dst;
1221 }
1222
1223 /** Macros for annotations.  */
1224
1225 /* An annotation data is stored in the array coding->charbuf in this
1226    format:
1227      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1228    LENGTH is the number of elements in the annotation.
1229    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1230    NCHARS is the number of characters in the text annotated.
1231
1232    The format of the following elements depend on ANNOTATION_MASK.
1233
1234    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1235    follow:
1236      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1237
1238    NBYTES is the number of bytes specified in the header part of
1239    old-style emacs-mule encoding, or 0 for the other kind of
1240    composition.
1241
1242    METHOD is one of enum composition_method.
1243
1244    Optional COMPOSITION-COMPONENTS are characters and composition
1245    rules.
1246
1247    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1248    follows.
1249
1250    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1251    recover from an invalid annotation, and should be skipped by
1252    produce_annotation.  */
1253
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Write the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and set coding->annotated.  BUF must be
   a modifiable pointer lvalue and `coding' must be in scope.  The
   expansion deliberately does not end in a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an unbraced if/else.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1264
/* Write a composition annotation at BUF and advance BUF past it: the
   annotation header with length 5, CODING_ANNOTATE_COMPOSITION_MASK
   and NCHARS, followed by NBYTES and METHOD.  BUF is parenthesized
   where it is incremented, as in ADD_ANNOTATION_DATA, so an lvalue
   expression such as `*pp' advances the intended pointer.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1271
1272
/* Write a charset annotation at BUF and advance BUF past it: the
   annotation header with length 4, CODING_ANNOTATE_CHARSET_MASK and
   NCHARS, followed by the charset ID.  BUF is parenthesized where it
   is incremented, as in ADD_ANNOTATION_DATA, so an lvalue expression
   such as `*pp' advances the intended pointer.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1278
1279 \f
1280 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1281
1282
1283
1284 \f
1285 /*** 3. UTF-8 ***/
1286
1287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1288    Check if a text is encoded in UTF-8.  If it is, return 1, else
1289    return 0.  */
1290
     /* Classify one octet C by its high bits: a single-octet character
        (below 0x80), a trailing octet (10xxxxxx), or the leading octet of
        a 2-, 3-, 4- or 5-octet sequence (110xxxxx, 1110xxxx, 11110xxx,
        111110xx).  */
1291 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1292 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1293 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1294 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1295 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1296 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1297
     /* The byte order mark as a character code, and the three octets that
        make up its UTF-8 encoding.  */
1298 #define UTF_BOM 0xFEFF
1299 #define UTF_8_BOM_1 0xEF
1300 #define UTF_8_BOM_2 0xBB
1301 #define UTF_8_BOM_3 0xBF
1302
     /* Detector for the UTF-8 category.  Marks CATEGORY_MASK_UTF_8 as
        checked in DETECT_INFO, skips the first CODING->head_ascii bytes,
        and then checks the rest of the source as sequences of a leading
        octet followed by one to four trailing octets.

        Returns 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
        when a malformed sequence is met, or when the source ends in the
        middle of a sequence and CODING_MODE_LAST_BLOCK is set.  Otherwise
        returns 1 once the source is exhausted.  In that case, if the
        three BOM octets were seen at CODING->source, both the SIG and
        NOSIG masks are added to DETECT_INFO->found; if not, the SIG mask
        is rejected and the NOSIG mask is added to found only when at
        least one multi-octet sequence was seen.  */
1303 static int
1304 detect_coding_utf_8 (coding, detect_info)
1305      struct coding_system *coding;
1306      struct coding_detection_info *detect_info;
1307 {
1308   const unsigned char *src = coding->source, *src_base;
1309   const unsigned char *src_end = coding->source + coding->src_bytes;
1310   int multibytep = coding->src_multibyte;
1311   int consumed_chars = 0;
1312   int bom_found = 0;
1313   int found = 0;
1314
1315   detect_info->checked |= CATEGORY_MASK_UTF_8;
1316   /* A coding system of this category is always ASCII compatible.  */
1317   src += coding->head_ascii;
1318
       /* Each iteration checks one sequence.  `continue' accepts it,
          `break' rejects the whole text, and ONE_MORE_BYTE jumps to
          no_more_source when the input runs out.  */
1319   while (1)
1320     {
1321       int c, c1, c2, c3, c4;
1322
1323       src_base = src;
1324       ONE_MORE_BYTE (c);
1325       if (c < 0 || UTF_8_1_OCTET_P (c))
1326         continue;
1327       ONE_MORE_BYTE (c1);
1328       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1329         break;
1330       if (UTF_8_2_OCTET_LEADING_P (c))
1331         {
1332           found = 1;
1333           continue;
1334         }
1335       ONE_MORE_BYTE (c2);
1336       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1337         break;
1338       if (UTF_8_3_OCTET_LEADING_P (c))
1339         {
1340           found = 1;
               /* Only a sequence at the very start of the source can be
                  a BOM.  */
1341           if (src_base == coding->source
1342               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1343             bom_found = 1;
1344           continue;
1345         }
1346       ONE_MORE_BYTE (c3);
1347       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1348         break;
1349       if (UTF_8_4_OCTET_LEADING_P (c))
1350         {
1351           found = 1;
1352           continue;
1353         }
1354       ONE_MORE_BYTE (c4);
1355       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1356         break;
1357       if (UTF_8_5_OCTET_LEADING_P (c))
1358         {
1359           found = 1;
1360           continue;
1361         }
1362       break;
1363     }
1364   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1365   return 0;
1366
1367  no_more_source:
       /* src_base < src means the input ended inside a sequence; that is
          an error only if no further block will follow.  */
1368   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1369     {
1370       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1371       return 0;
1372     }
1373   if (bom_found)
1374     {
1375       /* The first character 0xFEFF doesn't necessarily mean a BOM, so
              both the SIG and the NOSIG category are reported.  */
1376       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1377     }
1378   else
1379     {
1380       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1381       if (found)
1382         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1383     }
1384   return 1;
1385 }
1386
1387
     /* Decode the UTF-8 bytes of CODING->source, starting at offset
        CODING->consumed, into character codes appended to CODING->charbuf.
        Decoding stops when the source runs out or charbuf is full; the
        progress is then stored back into CODING (consumed, consumed_char,
        charbuf_used).  A sequence that is malformed, overlong, a
        surrogate, or above MAX_CHAR is not dropped: its first byte is
        emitted on its own (through BYTE8_TO_CHAR unless it is ASCII) and
        CODING->errors is incremented.  */
1388 static void
1389 decode_coding_utf_8 (coding)
1390      struct coding_system *coding;
1391 {
1392   const unsigned char *src = coding->source + coding->consumed;
1393   const unsigned char *src_end = coding->source + coding->src_bytes;
1394   const unsigned char *src_base;
1395   int *charbuf = coding->charbuf + coding->charbuf_used;
1396   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1397   int consumed_chars = 0, consumed_chars_base = 0;
1398   int multibytep = coding->src_multibyte;
1399   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1400   Lisp_Object attr, charset_list;
1401   int eol_crlf =
1402     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1403   int byte_after_cr = -1;
1404
1405   CODING_GET_INFO (coding, attr, charset_list);
1406
       /* Unless the coding system is known to have no BOM, peek at the
          first three bytes.  SRC is put back to SRC_BASE whenever they
          are not exactly EF BB BF, so only a real BOM is skipped.
          NOTE(review): consumed_chars is not reset when SRC is put back,
          so the peeked bytes still appear in consumed_chars_base below --
          confirm whether that is intended.  */
1407   if (bom != utf_without_bom)
1408     {
1409       int c1, c2, c3;
1410
1411       src_base = src;
1412       ONE_MORE_BYTE (c1);
1413       if (! UTF_8_3_OCTET_LEADING_P (c1))
1414         src = src_base;
1415       else
1416         {
1417           ONE_MORE_BYTE (c2);
1418           if (! UTF_8_EXTRA_OCTET_P (c2))
1419             src = src_base;
1420           else
1421             {
1422               ONE_MORE_BYTE (c3);
1423               if (! UTF_8_EXTRA_OCTET_P (c3))
1424                 src = src_base;
1425               else
1426                 {
1427                   if ((c1 != UTF_8_BOM_1)
1428                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1429                     src = src_base;
1430                   else
1431                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1432                 }
1433             }
1434         }
1435     }
       /* Whatever was found, the BOM check is not repeated on later calls
          for this CODING.  */
1436   CODING_UTF_8_BOM (coding) = utf_without_bom;
1437
1438
1439
1440   while (1)
1441     {
1442       int c, c1, c2, c3, c4, c5;
1443
           /* Remember where this character starts; only progress up to
              this point is reported if we stop in the middle.  */
1444       src_base = src;
1445       consumed_chars_base = consumed_chars;
1446
1447       if (charbuf >= charbuf_end)
1448         {
               /* A byte was already read ahead after a CR; back up
                  src_base so that byte is not reported as consumed.  */
1449           if (byte_after_cr >= 0)
1450             src_base--;
1451           break;
1452         }
1453
1454       if (byte_after_cr >= 0)
1455         c1 = byte_after_cr, byte_after_cr = -1;
1456       else
1457         ONE_MORE_BYTE (c1);
1458       if (c1 < 0)
1459         {
1460           c = - c1;
1461         }
1462       else if (UTF_8_1_OCTET_P(c1))
1463         {
               /* With DOS end-of-line conversion, read the byte after a CR
                  ahead; it becomes C1 of the next iteration.  */
1464           if (eol_crlf && c1 == '\r')
1465             ONE_MORE_BYTE (byte_after_cr);
1466           c = c1;
1467         }
1468       else
1469         {
1470           ONE_MORE_BYTE (c2);
1471           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1472             goto invalid_code;
1473           if (UTF_8_2_OCTET_LEADING_P (c1))
1474             {
1475               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1476               /* Reject overlong sequences here and below.  Encoders
1477                  producing them are incorrect, they can be misleading,
1478                  and they mess up read/write invariance.  */
1479               if (c < 128)
1480                 goto invalid_code;
1481             }
1482           else
1483             {
1484               ONE_MORE_BYTE (c3);
1485               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1486                 goto invalid_code;
1487               if (UTF_8_3_OCTET_LEADING_P (c1))
1488                 {
1489                   c = (((c1 & 0xF) << 12)
1490                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1491                   if (c < 0x800
1492                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1493                     goto invalid_code;
1494                 }
1495               else
1496                 {
1497                   ONE_MORE_BYTE (c4);
1498                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1499                     goto invalid_code;
1500                   if (UTF_8_4_OCTET_LEADING_P (c1))
1501                     {
1502                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1503                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1504                     if (c < 0x10000)
1505                       goto invalid_code;
1506                     }
1507                   else
1508                     {
1509                       ONE_MORE_BYTE (c5);
1510                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1511                         goto invalid_code;
1512                       if (UTF_8_5_OCTET_LEADING_P (c1))
1513                         {
1514                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1515                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1516                                | (c5 & 0x3F));
1517                           if ((c > MAX_CHAR) || (c < 0x200000))
1518                             goto invalid_code;
1519                         }
1520                       else
1521                         goto invalid_code;
1522                     }
1523                 }
1524             }
1525         }
1526
1527       *charbuf++ = c;
1528       continue;
1529
           /* Rewind to the start of the bad sequence and emit just its
              first byte, unchanged if ASCII and via BYTE8_TO_CHAR
              otherwise; the following bytes are decoded afresh.  */
1530     invalid_code:
1531       src = src_base;
1532       consumed_chars = consumed_chars_base;
1533       ONE_MORE_BYTE (c);
1534       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1535       coding->errors++;
1536     }
1537
       /* Reached from ONE_MORE_BYTE at the end of the source, or by
          leaving the loop when charbuf is full.  Only the progress up to
          SRC_BASE is recorded.  */
1538  no_more_source:
1539   coding->consumed_char += consumed_chars_base;
1540   coding->consumed = src_base - coding->source;
1541   coding->charbuf_used = charbuf - coding->charbuf;
1542 }
1543
1544
1545 static int
1546 encode_coding_utf_8 (coding)
1547      struct coding_system *coding;
1548 {
1549   int multibytep = coding->dst_multibyte;
1550   int *charbuf = coding->charbuf;
1551   int *charbuf_end = charbuf + coding->charbuf_used;
1552   unsigned char *dst = coding->destination + coding->produced;
1553   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1554   int produced_chars = 0;
1555   int c;
1556
1557   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1558     {
1559       ASSURE_DESTINATION (3);
1560       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1561       CODING_UTF_8_BOM (coding) = utf_without_bom;
1562     }
1563
1564   if (multibytep)
1565     {
1566       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1567
1568       while (charbuf < charbuf_end)
1569         {
1570           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1571
1572           ASSURE_DESTINATION (safe_room);
1573           c = *charbuf++;
1574           if (CHAR_BYTE8_P (c))
1575             {
1576               c = CHAR_TO_BYTE8 (c);
1577               EMIT_ONE_BYTE (c);
1578             }
1579           else
1580             {
1581               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1582               for (p = str; p < pend; p++)
1583                 EMIT_ONE_BYTE (*p);
1584             }
1585         }
1586     }
1587   else
1588     {
1589       int safe_room = MAX_MULTIBYTE_LENGTH;
1590
1591       while (charbuf < charbuf_end)
1592         {
1593           ASSURE_DESTINATION (safe_room);
1594           c = *charbuf++;
1595           if (CHAR_BYTE8_P (c))
1596             *dst++ = CHAR_TO_BYTE8 (c);
1597           else
1598             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1599           produced_chars++;
1600         }
1601     }
1602   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1603   coding->produced_char += produced_chars;
1604   coding->produced = dst - coding->destination;
1605   return 0;
1606 }
1607
1608
1609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1610    Check if a text is encoded in one of UTF-16 based coding systems.
1611    If it is, return 1, else return 0.  */
1612
/* Nonzero if VAL is the first (high) half of a UTF-16 surrogate pair.
   Only bits 10..15 of VAL are examined; for a 16-bit code unit this
   means the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) (0xD800 == ((val) & 0xFC00))

/* Likewise for the second (low) half; for a 16-bit code unit this
   means the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) (0xDC00 == ((val) & 0xFC00))

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  VAL may be
   evaluated more than once.  */
#define UTF_16_INVALID_P(val)                   \
  ((val) == 0xFFFE || (val) == 0xFFFF           \
   || UTF_16_LOW_SURROGATE_P (val))
1623
1624
1625 static int
1626 detect_coding_utf_16 (coding, detect_info)
1627      struct coding_system *coding;
1628      struct coding_detection_info *detect_info;
1629 {
1630   const unsigned char *src = coding->source, *src_base = src;
1631   const unsigned char *src_end = coding->source + coding->src_bytes;
1632   int multibytep = coding->src_multibyte;
1633   int consumed_chars = 0;
1634   int c1, c2;
1635
1636   detect_info->checked |= CATEGORY_MASK_UTF_16;
1637   if (coding->mode & CODING_MODE_LAST_BLOCK
1638       && (coding->src_chars & 1))
1639     {
1640       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1641       return 0;
1642     }
1643
1644   TWO_MORE_BYTES (c1, c2);
1645   if ((c1 == 0xFF) && (c2 == 0xFE))
1646     {
1647       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1648                              | CATEGORY_MASK_UTF_16_AUTO);
1649       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1650                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1651                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1652     }
1653   else if ((c1 == 0xFE) && (c2 == 0xFF))
1654     {
1655       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1656                              | CATEGORY_MASK_UTF_16_AUTO);
1657       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1658                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1659                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1660     }
1661   else if (c2 < 0)
1662     {
1663       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1664       return 0;
1665     }
1666   else
1667     {
1668       /* We check the dispersion of Eth and Oth bytes where E is even and
1669          O is odd.  If both are high, we assume binary data.*/
1670       unsigned char e[256], o[256];
1671       unsigned e_num = 1, o_num = 1;
1672
1673       memset (e, 0, 256);
1674       memset (o, 0, 256);
1675       e[c1] = 1;
1676       o[c2] = 1;
1677
1678       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1679                                 |CATEGORY_MASK_UTF_16_BE
1680                                 | CATEGORY_MASK_UTF_16_LE);
1681
1682       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1683              != CATEGORY_MASK_UTF_16)
1684         {
1685           TWO_MORE_BYTES (c1, c2);
1686           if (c2 < 0)
1687             break;
1688           if (! e[c1])
1689             {
1690               e[c1] = 1;
1691               e_num++;
1692               if (e_num >= 128)
1693                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1694             }
1695           if (! o[c2])
1696             {
1697               o[c2] = 1;
1698               o_num++;
1699               if (o_num >= 128)
1700                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1701             }
1702         }
1703       return 0;
1704     }
1705
1706  no_more_source:
1707   return 1;
1708 }
1709
1710 static void
1711 decode_coding_utf_16 (coding)
1712      struct coding_system *coding;
1713 {
1714   const unsigned char *src = coding->source + coding->consumed;
1715   const unsigned char *src_end = coding->source + coding->src_bytes;
1716   const unsigned char *src_base;
1717   int *charbuf = coding->charbuf + coding->charbuf_used;
1718   /* We may produces at most 3 chars in one loop.  */
1719   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1720   int consumed_chars = 0, consumed_chars_base = 0;
1721   int multibytep = coding->src_multibyte;
1722   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1723   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1724   int surrogate = CODING_UTF_16_SURROGATE (coding);
1725   Lisp_Object attr, charset_list;
1726   int eol_crlf =
1727     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1728   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1729
1730   CODING_GET_INFO (coding, attr, charset_list);
1731
1732   if (bom == utf_with_bom)
1733     {
1734       int c, c1, c2;
1735
1736       src_base = src;
1737       ONE_MORE_BYTE (c1);
1738       ONE_MORE_BYTE (c2);
1739       c = (c1 << 8) | c2;
1740
1741       if (endian == utf_16_big_endian
1742           ? c != 0xFEFF : c != 0xFFFE)
1743         {
1744           /* The first two bytes are not BOM.  Treat them as bytes
1745              for a normal character.  */
1746           src = src_base;
1747           coding->errors++;
1748         }
1749       CODING_UTF_16_BOM (coding) = utf_without_bom;
1750     }
1751   else if (bom == utf_detect_bom)
1752     {
1753       /* We have already tried to detect BOM and failed in
1754          detect_coding.  */
1755       CODING_UTF_16_BOM (coding) = utf_without_bom;
1756     }
1757
1758   while (1)
1759     {
1760       int c, c1, c2;
1761
1762       src_base = src;
1763       consumed_chars_base = consumed_chars;
1764
1765       if (charbuf >= charbuf_end)
1766         {
1767           if (byte_after_cr1 >= 0)
1768             src_base -= 2;
1769           break;
1770         }
1771
1772       if (byte_after_cr1 >= 0)
1773         c1 = byte_after_cr1, byte_after_cr1 = -1;
1774       else
1775         ONE_MORE_BYTE (c1);
1776       if (c1 < 0)
1777         {
1778           *charbuf++ = -c1;
1779           continue;
1780         }
1781       if (byte_after_cr2 >= 0)
1782         c2 = byte_after_cr2, byte_after_cr2 = -1;
1783       else
1784         ONE_MORE_BYTE (c2);
1785       if (c2 < 0)
1786         {
1787           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1788           *charbuf++ = -c2;
1789           continue;
1790         }
1791       c = (endian == utf_16_big_endian
1792            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1793
1794       if (surrogate)
1795         {
1796           if (! UTF_16_LOW_SURROGATE_P (c))
1797             {
1798               if (endian == utf_16_big_endian)
1799                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1800               else
1801                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1802               *charbuf++ = c1;
1803               *charbuf++ = c2;
1804               coding->errors++;
1805               if (UTF_16_HIGH_SURROGATE_P (c))
1806                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1807               else
1808                 *charbuf++ = c;
1809             }
1810           else
1811             {
1812               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1813               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1814               *charbuf++ = 0x10000 + c;
1815             }
1816         }
1817       else
1818         {
1819           if (UTF_16_HIGH_SURROGATE_P (c))
1820             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1821           else
1822             {
1823               if (eol_crlf && c == '\r')
1824                 {
1825                   ONE_MORE_BYTE (byte_after_cr1);
1826                   ONE_MORE_BYTE (byte_after_cr2);
1827                 }
1828               *charbuf++ = c;
1829             }
1830         }
1831     }
1832
1833  no_more_source:
1834   coding->consumed_char += consumed_chars_base;
1835   coding->consumed = src_base - coding->source;
1836   coding->charbuf_used = charbuf - coding->charbuf;
1837 }
1838
1839 static int
1840 encode_coding_utf_16 (coding)
1841      struct coding_system *coding;
1842 {
1843   int multibytep = coding->dst_multibyte;
1844   int *charbuf = coding->charbuf;
1845   int *charbuf_end = charbuf + coding->charbuf_used;
1846   unsigned char *dst = coding->destination + coding->produced;
1847   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1848   int safe_room = 8;
1849   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1850   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1851   int produced_chars = 0;
1852   Lisp_Object attrs, charset_list;
1853   int c;
1854
1855   CODING_GET_INFO (coding, attrs, charset_list);
1856
1857   if (bom != utf_without_bom)
1858     {
1859       ASSURE_DESTINATION (safe_room);
1860       if (big_endian)
1861         EMIT_TWO_BYTES (0xFE, 0xFF);
1862       else
1863         EMIT_TWO_BYTES (0xFF, 0xFE);
1864       CODING_UTF_16_BOM (coding) = utf_without_bom;
1865     }
1866
1867   while (charbuf < charbuf_end)
1868     {
1869       ASSURE_DESTINATION (safe_room);
1870       c = *charbuf++;
1871       if (c > MAX_UNICODE_CHAR)
1872         c = coding->default_char;
1873
1874       if (c < 0x10000)
1875         {
1876           if (big_endian)
1877             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1878           else
1879             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1880         }
1881       else
1882         {
1883           int c1, c2;
1884
1885           c -= 0x10000;
1886           c1 = (c >> 10) + 0xD800;
1887           c2 = (c & 0x3FF) + 0xDC00;
1888           if (big_endian)
1889             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1890           else
1891             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1892         }
1893     }
1894   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1895   coding->produced = dst - coding->destination;
1896   coding->produced_char += produced_chars;
1897   return 0;
1898 }
1899
1900 \f
1901 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1902
1903 /* Emacs' internal format for representation of multiple character
1904    sets is a kind of multi-byte encoding, i.e. characters are
1905    represented by variable-length sequences of one-byte codes.
1906
1907    ASCII characters and control characters (e.g. `tab', `newline') are
1908    represented by one-byte sequences which are their ASCII codes, in
1909    the range 0x00 through 0x7F.
1910
1911    8-bit characters of the range 0x80..0x9F are represented by
1912    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1913    code + 0x20).
1914
1915    8-bit characters of the range 0xA0..0xFF are represented by
1916    one-byte sequences which are their 8-bit code.
1917
1918    The other characters are represented by a sequence of `base
1919    leading-code', optional `extended leading-code', and one or two
1920    `position-code's.  The length of the sequence is determined by the
1921    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1922    whereas extended leading-code and position-code take the range 0xA0
1923    through 0xFF.  See `charset.h' for more details about leading-code
1924    and position-code.
1925
1926    --- CODE RANGE of Emacs' internal format ---
1927    character set        range
1928    -------------        -----
1929    ascii                0x00..0x7F
1930    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1931    eight-bit-graphic    0xA0..0xFF
1932    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1933    ---------------------------------------------
1934
1935    As this is the internal character representation, the format is
1936    usually not used externally (i.e. in a file or in a data sent to a
1937    process).  But, it is possible to have a text externally in this
1938    format (i.e. by encoding by the coding system `emacs-mule').
1939
1940    In that case, a sequence of one-byte codes has a slightly different
1941    form.
1942
1943    At first, all characters in eight-bit-control are represented by
1944    one-byte sequences which are their 8-bit code.
1945
1946    Next, character composition data are represented by the byte
1947    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1948    where,
1949         METHOD is 0xF2 plus one of composition method (enum
1950         composition_method),
1951
1952         BYTES is 0xA0 plus a byte length of this composition data,
1953
1954         CHARS is 0xA0 plus a number of characters composed by this
1955         data,
1956
1957         COMPONENTs are characters in multibyte form or composition
1958         rules encoded as two bytes of ASCII codes.
1959
1960    In addition, for backward compatibility, the following formats are
1961    also recognized as composition data on decoding.
1962
1963    0x80 MSEQ ...
1964    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1965
1966    Here,
1967         MSEQ is a multibyte form but in these special format:
1968           ASCII: 0xA0 ASCII_CODE+0x80,
1969           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1970         RULE is a one byte code of the range 0xA0..0xF0 that
1971         represents a composition rule.
1972   */
1973
/* Byte-length table for emacs-mule sequences, indexed by the first
   byte of a sequence.  The functions below read entry C as the total
   number of bytes (1 to 4) in the sequence that the leading byte C
   starts.  NOTE(review): the table is filled in elsewhere, outside
   the part of the file visible here.  */
1974 char emacs_mule_bytes[256];
1975
1976
1977 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1978    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1979    else return 0.  */
1980
1981 static int
1982 detect_coding_emacs_mule (coding, detect_info)
1983      struct coding_system *coding;
1984      struct coding_detection_info *detect_info;
1985 {
1986   const unsigned char *src = coding->source, *src_base;
1987   const unsigned char *src_end = coding->source + coding->src_bytes;
1988   int multibytep = coding->src_multibyte;
1989   int consumed_chars = 0;
1990   int c;
1991   int found = 0;
1992
1993   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1994   /* A coding system of this category is always ASCII compatible.  */
1995   src += coding->head_ascii;
1996
1997   while (1)
1998     {
1999       src_base = src;
2000       ONE_MORE_BYTE (c);
2001       if (c < 0)
2002         continue;
2003       if (c == 0x80)
2004         {
2005           /* Perhaps the start of composite character.  We simply skip
2006              it because analyzing it is too heavy for detecting.  But,
2007              at least, we check that the composite character
2008              constitutes of more than 4 bytes.  */
2009           const unsigned char *src_base;
2010
2011         repeat:
2012           src_base = src;
2013           do
2014             {
2015               ONE_MORE_BYTE (c);
2016             }
2017           while (c >= 0xA0);
2018
2019           if (src - src_base <= 4)
2020             break;
2021           found = CATEGORY_MASK_EMACS_MULE;
2022           if (c == 0x80)
2023             goto repeat;
2024         }
2025
2026       if (c < 0x80)
2027         {
2028           if (c < 0x20
2029               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2030             break;
2031         }
2032       else
2033         {
2034           int more_bytes = emacs_mule_bytes[c] - 1;
2035
2036           while (more_bytes > 0)
2037             {
2038               ONE_MORE_BYTE (c);
2039               if (c < 0xA0)
2040                 {
2041                   src--;        /* Unread the last byte.  */
2042                   break;
2043                 }
2044               more_bytes--;
2045             }
2046           if (more_bytes != 0)
2047             break;
2048           found = CATEGORY_MASK_EMACS_MULE;
2049         }
2050     }
2051   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2052   return 0;
2053
2054  no_more_source:
2055   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2056     {
2057       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2058       return 0;
2059     }
2060   detect_info->found |= found;
2061   return 1;
2062 }
2063
2064
2065 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2066    character.  If CMP_STATUS indicates that we must expect MSEQ or
2067    RULE described above, decode it and return the negative value of
2068    the decoded character or rule.  If an invalid byte is found, return
2069    -1.  If SRC is too short, return -2.  */
2070
2071 int
2072 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
2073      struct coding_system *coding;
2074      const unsigned char *src;
2075      int *nbytes, *nchars, *id;
2076      struct composition_status *cmp_status;
2077 {
2078   const unsigned char *src_end = coding->source + coding->src_bytes;
2079   const unsigned char *src_base = src;
2080   int multibytep = coding->src_multibyte;
2081   struct charset *charset;
2082   unsigned code;
2083   int c;
2084   int consumed_chars = 0;
2085   int mseq_found = 0;
2086
2087   ONE_MORE_BYTE (c);
2088   if (c < 0)
2089     {
2090       c = -c;
2091       charset = emacs_mule_charset[0];
2092     }
2093   else
2094     {
2095       if (c >= 0xA0)
2096         {
2097           if (cmp_status->state != COMPOSING_NO
2098               && cmp_status->old_form)
2099             {
2100               if (cmp_status->state == COMPOSING_CHAR)
2101                 {
2102                   if (c == 0xA0)
2103                     {
2104                       ONE_MORE_BYTE (c);
2105                       c -= 0x80;
2106                       if (c < 0)
2107                         goto invalid_code;
2108                     }
2109                   else
2110                     c -= 0x20;
2111                   mseq_found = 1;
2112                 }
2113               else
2114                 {
2115                   *nbytes = src - src_base;
2116                   *nchars = consumed_chars;
2117                   return -c;
2118                 }
2119             }
2120           else
2121             goto invalid_code;
2122         }
2123
2124       switch (emacs_mule_bytes[c])
2125         {
2126         case 2:
2127           if (! (charset = emacs_mule_charset[c]))
2128             goto invalid_code;
2129           ONE_MORE_BYTE (c);
2130           if (c < 0xA0)
2131             goto invalid_code;
2132           code = c & 0x7F;
2133           break;
2134
2135         case 3:
2136           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2137               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2138             {
2139               ONE_MORE_BYTE (c);
2140               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
2141                 goto invalid_code;
2142               ONE_MORE_BYTE (c);
2143               if (c < 0xA0)
2144                 goto invalid_code;
2145               code = c & 0x7F;
2146             }
2147           else
2148             {
2149               if (! (charset = emacs_mule_charset[c]))
2150                 goto invalid_code;
2151               ONE_MORE_BYTE (c);
2152               if (c < 0xA0)
2153                 goto invalid_code;
2154               code = (c & 0x7F) << 8;
2155               ONE_MORE_BYTE (c);
2156               if (c < 0xA0)
2157                 goto invalid_code;
2158               code |= c & 0x7F;
2159             }
2160           break;
2161
2162         case 4:
2163           ONE_MORE_BYTE (c);
2164           if (c < 0 || ! (charset = emacs_mule_charset[c]))
2165             goto invalid_code;
2166           ONE_MORE_BYTE (c);
2167           if (c < 0xA0)
2168             goto invalid_code;
2169           code = (c & 0x7F) << 8;
2170           ONE_MORE_BYTE (c);
2171           if (c < 0xA0)
2172             goto invalid_code;
2173           code |= c & 0x7F;
2174           break;
2175
2176         case 1:
2177           code = c;
2178           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2179                                      ? charset_ascii : charset_eight_bit);
2180           break;
2181
2182         default:
2183           abort ();
2184         }
2185       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, code, c);
2186       if (c < 0)
2187         goto invalid_code;
2188     }
2189   *nbytes = src - src_base;
2190   *nchars = consumed_chars;
2191   if (id)
2192     *id = charset->id;
2193   return (mseq_found ? -c : c);
2194
2195  no_more_source:
2196   return -2;
2197
2198  invalid_code:
2199   return -1;
2200 }
2201
2202
2203 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2204
2205 /* Handle these composition sequences ('|': the end of header elements,
2206    BYTES and CHARS >= 0xA0):
2207
2208    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2209    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2210    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2211
2212    and these old forms:
2213
2214    (4) relative composition: 0x80 | MSEQ ... MSEQ
2215    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2216
2217    When the starter 0x80 and the following header elements are found,
2218    this annotation header is produced.
2219
2220         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2221
2222    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2223    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2224
2225    Then, upon reading the following elements, these codes are produced
2226    until the composition end is found:
2227
2228    (1) CHAR ... CHAR
2229    (2) ALT ... ALT CHAR ... CHAR
2230    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2231    (4) CHAR ... CHAR
2232    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2233
2234    When the composition end is found, LENGTH and NCHARS in the
2235    annotation header are updated as below:
2236
2237    (1) LENGTH: unchanged, NCHARS: unchanged
2238    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2239    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2240    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2241    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2242
2243    If an error is found while composing, the annotation header is
2244    changed to the original composition header (plus filler -1s) as
2245    below:
2246
2247    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2248    (5)          [ 0x80 0xFF -1 -1 -1 ]
2249
2250    and the sequence [ -2 DECODED-RULE ] is changed to the original
2251    byte sequence as below:
2252         o the original byte sequence is B: [ B -1 ]
2253         o the original byte sequence is B1 B2: [ B1 B2 ]
2254
2255    Most of the routines are implemented by macros because many
2256    variables and labels in the caller decode_coding_emacs_mule must be
2257    accessible, and they are usually called just once (thus doesn't
2258    increase the size of compiled object).  */
2259
/* Decode C, a one-byte composition rule of an Emacs 20 style
   composition sequence, and store the decoded rule in RULE.

   C is first reduced by 0xA0 (the caller's variable is modified) and
   must then lie in 0..80; otherwise control jumps to the caller's
   `invalid_code' label.  The value is split into a quotient and a
   remainder by 9, a component equal to 4 is replaced by 10, and the
   two are combined with COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)          \
  do {                                                          \
    int gref, nref;                                             \
                                                                \
    c -= 0xA0;                                                  \
    if (! (c >= 0 && c < 81))                                   \
      goto invalid_code;                                        \
    gref = c / 9;                                               \
    nref = c % 9;                                               \
    gref = (gref == 4) ? 10 : gref;                             \
    nref = (nref == 4) ? 10 : nref;                             \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);                \
  } while (0)
2276
2277
/* Decode a two-byte composition rule of an Emacs 21 style composition
   sequence and store the decoded rule in RULE.

   On entry C holds the first byte; the second one is fetched with
   ONE_MORE_BYTE, which overwrites C.  Each byte, reduced by 0x20,
   must lie in 0..80; otherwise control jumps to the caller's
   `invalid_code' label.  The two reduced values are combined with
   COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)          \
  do {                                                          \
    int gref = c - 0x20;                                        \
    int nref;                                                   \
                                                                \
    if (! (gref >= 0 && gref < 81))                             \
      goto invalid_code;                                        \
    ONE_MORE_BYTE (c);                                          \
    nref = c - 0x20;                                            \
    if (! (nref >= 0 && nref < 81))                             \
      goto invalid_code;                                        \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);                \
  } while (0)
2295
2296
/* Start of Emacs 21 style format.  On entry the caller's C holds the
   METHOD byte (0xF2 plus the composition method).  The next two
   source bytes are BYTES and CHARS, each offset by 0xA0, where BYTES
   is the byte length of this composition information and CHARS is
   the number of characters composed by it.

   Control jumps to the caller's `invalid_code' label if BYTES is
   negative as read, if BYTES - 0xA0 is below 3 (or differs from 4
   for a relative composition), or if CHARS - 0xA0 is not in
   1..MAX_COMPOSITION_COMPONENTS - 1.  Otherwise CMP_STATUS is set up
   for the new composition and the annotation header is added at the
   caller's CHARBUF with ADD_COMPOSITION_DATA.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int *charbuf_base = charbuf;                                        \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (nchars > 0 && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2329
2330
/* Start of Emacs 20 style format for relative composition.  Mark
   CMP_STATUS as an old-form relative composition that expects a
   character next, with no characters or components counted yet, and
   add an annotation header whose NCHARS and NBYTES are both 0 at the
   caller's CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2342
2343
/* Start of Emacs 20 style format for rule-base composition.  Mark
   CMP_STATUS as an old-form composition with rules that expects a
   character next, with no characters or components counted yet, and
   add an annotation header whose NCHARS and NBYTES are both 0 at the
   caller's CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2355
2356
/* Read the byte that follows a composition starter and begin the
   kind of composition it announces:
     - 0xF2 plus a composition method: Emacs 21 style format;
     - 0xA0..0xBF: Emacs 20 style relative composition; the byte is
       itself the first component, so SRC is moved back to re-read
       it;
     - 0xFF: Emacs 20 style rule-base composition.
   For any other byte, or when ONE_MORE_BYTE delivers a negative
   value, control jumps to the caller's `invalid_code' label.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c >= 0xA0 && c < 0xC0)                     \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2380
/* Close the composition whose elements occupy the last
   cmp_status->length slots before CHARBUF.  For an old-form
   composition the number of collected characters is stored in the
   third slot of that region; otherwise, for any method above
   COMPOSITION_RELATIVE, the first slot is set to the third slot
   minus the region length.  The state then becomes COMPOSING_NO.
   `charbuf' (an `int *') and `cmp_status' must be in scope.  */
#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    int *annotation = charbuf - cmp_status->length;                     \
                                                                        \
    if (cmp_status->old_form)                                           \
      annotation[2] = cmp_status->nchars;                               \
    else if (cmp_status->method > COMPOSITION_RELATIVE)                 \
      annotation[0] = annotation[2] - cmp_status->length;               \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
2391
2392
2393 static int
2394 emacs_mule_finish_composition (charbuf, cmp_status)
2395      int *charbuf;
2396      struct composition_status *cmp_status;
2397 {
2398   int idx = - cmp_status->length;
2399   int new_chars;
2400
2401   if (cmp_status->old_form && cmp_status->nchars > 0)
2402     {
2403       charbuf[idx + 2] = cmp_status->nchars;
2404       new_chars = 0;
2405       if (cmp_status->method == COMPOSITION_WITH_RULE
2406           && cmp_status->state == COMPOSING_CHAR)
2407         {
2408           /* The last rule was invalid.  */
2409           int rule = charbuf[-1] + 0xA0;
2410
2411           charbuf[-2] = BYTE8_TO_CHAR (rule);
2412           charbuf[-1] = -1;
2413           new_chars = 1;
2414         }
2415     }
2416   else
2417     {
2418       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2419
2420       if (cmp_status->method == COMPOSITION_WITH_RULE)
2421         {
2422           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2423           charbuf[idx++] = -3;
2424           charbuf[idx++] = 0;
2425           new_chars = 1;
2426         }
2427       else
2428         {
2429           int nchars = charbuf[idx + 1] + 0xA0;
2430           int nbytes = charbuf[idx + 2] + 0xA0;
2431
2432           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2433           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2434           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2435           charbuf[idx++] = -1;
2436           new_chars = 4;
2437         }
2438     }
2439   cmp_status->state = COMPOSING_NO;
2440   return new_chars;
2441 }
2442
/* If a composition is in progress, finish it with
   emacs_mule_finish_composition and add the count it returns to
   `char_offset'.  Does nothing when the state is COMPOSING_NO.
   `cmp_status', `charbuf' and `char_offset' must be in scope.  */
#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state != COMPOSING_NO)                                \
      {                                                                   \
        int added_chars                                                   \
          = emacs_mule_finish_composition (charbuf, cmp_status);          \
                                                                          \
        char_offset += added_chars;                                       \
      }                                                                   \
  } while (0)
2448
2449
/* Decode emacs-mule (old Emacs internal format) bytes of CODING into
   CODING's charbuf.

   Input is read from coding->source, starting at offset
   coding->consumed and ending at offset coding->src_bytes.  Output
   elements are appended at coding->charbuf + coding->charbuf_used.
   Besides character codes, charset annotations (ADD_CHARSET_DATA)
   and composition annotations are written into the buffer.

   An invalid sequence is not fatal: its first byte is emitted
   literally and coding->errors is incremented.  A composition that
   is still open when decoding stops is finished if CODING's mode has
   CODING_MODE_LAST_BLOCK; otherwise its elements are saved in
   cmp_status->carryover and put back by the next call.

   On return coding->consumed, coding->consumed_char and
   coding->charbuf_used have been updated.  */
2450 static void
2451 decode_coding_emacs_mule (coding)
2452      struct coding_system *coding;
2453 {
2454   const unsigned char *src = coding->source + coding->consumed;
2455   const unsigned char *src_end = coding->source + coding->src_bytes;
2456   const unsigned char *src_base;
2457   int *charbuf = coding->charbuf + coding->charbuf_used;
2458   /* We may produce two annotations (charset and composition) in one
2459      loop and one more charset annotation at the end.  */
2460   int *charbuf_end
2461     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2462   int consumed_chars = 0, consumed_chars_base;
2463   int multibytep = coding->src_multibyte;
2464   Lisp_Object attrs, charset_list;
2465   int char_offset = coding->produced_char;
2466   int last_offset = char_offset;
2467   int last_id = charset_ascii;
2468   int eol_crlf =
2469     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2470   int byte_after_cr = -1;
2471   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2472
2473   CODING_GET_INFO (coding, attrs, charset_list);
2474
       /* A composition was still open when the previous call stopped:
          put its saved elements back at the head of the output so that
          decoding resumes inside that composition.  */
2475   if (cmp_status->state != COMPOSING_NO)
2476     {
2477       int i;
2478
2479       for (i = 0; i < cmp_status->length; i++)
2480         *charbuf++ = cmp_status->carryover[i];
2481       coding->annotated = 1;
2482     }
2483
2484   while (1)
2485     {
2486       int c, id;
2487
           /* Remember where this iteration started, so that it can be
              undone (see `retry' and `invalid_code') and so that only
              completely handled input is reported as consumed.  */
2488       src_base = src;
2489       consumed_chars_base = consumed_chars;
2490
           /* Stop when the output buffer, less the room reserved for
              annotations, is full.  A byte already fetched after a CR
              is given back by moving SRC_BASE one position back.  */
2491       if (charbuf >= charbuf_end)
2492         {
2493           if (byte_after_cr >= 0)
2494             src_base--;
2495           break;
2496         }
2497
2498       if (byte_after_cr >= 0)
2499         c = byte_after_cr, byte_after_cr = -1;
2500       else
2501         ONE_MORE_BYTE (c);
2502
           /* 0x80 introduces a composition sequence.  A negative C is
              stored negated as a single character (presumably this is
              how ONE_MORE_BYTE reports a raw byte -- confirm against
              that macro).  Either one first ends any composition in
              progress.  */
2503       if (c < 0 || c == 0x80)
2504         {
2505           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2506           if (c < 0)
2507             {
2508               *charbuf++ = -c;
2509               char_offset++;
2510             }
2511           else
2512             DECODE_EMACS_MULE_COMPOSITION_START ();
2513           continue;
2514         }
2515
2516       if (c < 0x80)
2517         {
               /* ASCII.  With DOS-style EOL conversion, fetch the byte
                  following a CR now; it is handled by the next
                  iteration.  */
2518           if (eol_crlf && c == '\r')
2519             ONE_MORE_BYTE (byte_after_cr);
2520           id = charset_ascii;
2521           if (cmp_status->state != COMPOSING_NO)
2522             {
2523               if (cmp_status->old_form)
2524                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2525               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2526                 cmp_status->ncomps--;
2527             }
2528         }
2529       else
2530         {
2531           int nchars, nbytes;
2532           /* emacs_mule_char can load a charset map from a file, which
2533              allocates a large structure and might cause buffer text
2534              to be relocated as result.  Thus, we need to remember the
2535              original pointer to buffer text, and fixup all related
2536              pointers after the call.  */
2537           const unsigned char *orig = coding->source;
2538           EMACS_INT offset;
2539
2540           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2541                                cmp_status);
2542           offset = coding->source - orig;
2543           if (offset)
2544             {
2545               src += offset;
2546               src_base += offset;
2547               src_end += offset;
2548             }
               /* -1 means the sequence is invalid; -2 makes us stop
                  decoding with this sequence left unconsumed.  Other
                  negative values are old-style composition elements
                  (see the comment below).  */
2549           if (c < 0)
2550             {
2551               if (c == -1)
2552                 goto invalid_code;
2553               if (c == -2)
2554                 break;
2555             }
2556           src = src_base + nbytes;
2557           consumed_chars = consumed_chars_base + nchars;
2558           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2559             cmp_status->ncomps -= nchars;
2560         }
2561
2562       /* Now if C >= 0, we found a normally encoded character, if C <
2563          0, we found an old-style composition component character or
2564          rule.  */
2565
2566       if (cmp_status->state == COMPOSING_NO)
2567         {
               /* On a charset change, close the run of the previous
                  charset with a charset annotation (a run of ASCII
                  gets none) and start a new run here.  */
2568           if (last_id != id)
2569             {
2570               if (last_id != charset_ascii)
2571                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2572                                   last_id);
2573               last_id = id;
2574               last_offset = char_offset;
2575             }
2576           *charbuf++ = c;
2577           char_offset++;
2578         }
2579       else if (cmp_status->state == COMPOSING_CHAR)
2580         {
2581           if (cmp_status->old_form)
2582             {
                   /* In the old form a normally encoded character
                      ends the composition; a negative C is one more
                      composed character.  */
2583               if (c >= 0)
2584                 {
2585                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2586                   *charbuf++ = c;
2587                   char_offset++;
2588                 }
2589               else
2590                 {
2591                   *charbuf++ = -c;
2592                   cmp_status->nchars++;
2593                   cmp_status->length++;
2594                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2595                     EMACS_MULE_COMPOSITION_END ();
2596                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2597                     cmp_status->state = COMPOSING_RULE;
2598                 }
2599             }
2600           else
2601             {
                   /* In the new form NCHARS counts down the characters
                      still expected; the composition ends at zero.  */
2602               *charbuf++ = c;
2603               cmp_status->length++;
2604               cmp_status->nchars--;
2605               if (cmp_status->nchars == 0)
2606                 EMACS_MULE_COMPOSITION_END ();
2607             }
2608         }
2609       else if (cmp_status->state == COMPOSING_RULE)
2610         {
2611           int rule;
2612
2613           if (c >= 0)
2614             {
2615               EMACS_MULE_COMPOSITION_END ();
2616               *charbuf++ = c;
2617               char_offset++;
2618             }
2619           else
2620             {
                   /* A rule is stored as the pair -2, RULE.  */
2621               c = -c;
2622               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2623               if (rule < 0)
2624                 goto invalid_code;
2625               *charbuf++ = -2;
2626               *charbuf++ = rule;
2627               cmp_status->length += 2;
2628               cmp_status->state = COMPOSING_CHAR;
2629             }
2630         }
2631       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2632         {
2633           *charbuf++ = c;
2634           cmp_status->length++;
2635           if (cmp_status->ncomps == 0)
2636             cmp_status->state = COMPOSING_CHAR;
2637           else if (cmp_status->ncomps > 0)
2638             {
2639               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2640                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2641             }
2642           else
2643             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2644         }
2645       else                      /* COMPOSING_COMPONENT_RULE */
2646         {
2647           int rule;
2648
2649           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2650           if (rule < 0)
2651             goto invalid_code;
2652           *charbuf++ = -2;
2653           *charbuf++ = rule;
2654           cmp_status->length += 2;
2655           cmp_status->ncomps--;
2656           if (cmp_status->ncomps > 0)
2657             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2658           else
2659             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2660         }
2661       continue;
2662
         /* Restart the iteration from its saved starting point.
            NOTE(review): no `goto retry' is visible in this function;
            presumably a macro expanded above may jump here -- confirm.  */
2663     retry:
2664       src = src_base;
2665       consumed_chars = consumed_chars_base;
2666       continue;
2667
         /* Close any composition, rewind to the start of the offending
            sequence and emit its first byte literally (an ASCII byte
            as is, any other byte through BYTE8_TO_CHAR), counting one
            error.  Decoding continues with the following byte.  */
2668     invalid_code:
2669       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2670       src = src_base;
2671       consumed_chars = consumed_chars_base;
2672       ONE_MORE_BYTE (c);
2673       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2674       char_offset++;
2675       coding->errors++;
2676     }
2677
       /* Reached by leaving the loop above; presumably ONE_MORE_BYTE
          also jumps here when the source is exhausted (the macro is
          defined elsewhere -- confirm).  An open composition is
          finished on the last block; otherwise its elements are taken
          back out of the output and saved for the next call.  */
2678  no_more_source:
2679   if (cmp_status->state != COMPOSING_NO)
2680     {
2681       if (coding->mode & CODING_MODE_LAST_BLOCK)
2682         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2683       else
2684         {
2685           int i;
2686
2687           charbuf -= cmp_status->length;
2688           for (i = 0; i < cmp_status->length; i++)
2689             cmp_status->carryover[i] = charbuf[i];
2690         }
2691     }
2692   if (last_id != charset_ascii)
2693     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Only input up to the start of the last unfinished iteration
          is reported as consumed.  */
2694   coding->consumed_char += consumed_chars_base;
2695   coding->consumed = src_base - coding->source;
2696   coding->charbuf_used = charbuf - coding->charbuf;
2697 }
2698
2699
/* Store in CODES the leading code(s) of the emacs-mule byte sequence
   for the charset whose emacs-mule id is ID.  CODES must have room
   for two elements.

   An id below 0xA0 is its own, single leading code; CODES[1] is then
   set to 0.  Any other id is stored in CODES[1], preceded in
   CODES[0] by 0x9A (ID < 0xE0), 0x9B (ID < 0xF0), 0x9C (ID < 0xF5)
   or 0x9D (otherwise).

   ID and CODES are evaluated more than once, so they must be free of
   side effects.  The expansion is a single statement that does NOT
   end in a semicolon: the caller supplies it, which keeps the macro
   usable as the body of an unbraced `if' that has an `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2713
2714
/* Encode the elements of CODING's charbuf (coding->charbuf_used of
   them) into emacs-mule byte sequences, written at
   coding->destination + coding->produced.

   ASCII characters are emitted as one byte and eight-bit characters
   as their raw byte.  Any other character is emitted as the leading
   code(s) of its charset (see EMACS_MULE_LEADING_CODES) followed by
   one or two position bytes with the high bit set.  A character that
   belongs to no charset of the list is replaced by
   coding->default_char.  A charset annotation in the input selects a
   preferred charset for the characters that follow it.

   coding->produced_char and coding->produced are updated, the result
   CODING_RESULT_SUCCESS is recorded, and 0 is always returned.  */
2715 static int
2716 encode_coding_emacs_mule (coding)
2717      struct coding_system *coding;
2718 {
2719   int multibytep = coding->dst_multibyte;
2720   int *charbuf = coding->charbuf;
2721   int *charbuf_end = charbuf + coding->charbuf_used;
2722   unsigned char *dst = coding->destination + coding->produced;
2723   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2724   int safe_room = 8;
2725   int produced_chars = 0;
2726   Lisp_Object attrs, charset_list;
2727   int c;
2728   int preferred_charset_id = -1;
2729
2730   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works from Vemacs_mule_charset_list; store
          that list in the coding system's attributes if it is not
          already there.  */
2731   if (! EQ (charset_list, Vemacs_mule_charset_list))
2732     {
2733       CODING_ATTR_CHARSET_LIST (attrs)
2734         = charset_list = Vemacs_mule_charset_list;
2735     }
2736
2737   while (charbuf < charbuf_end)
2738     {
2739       ASSURE_DESTINATION (safe_room);
2740       c = *charbuf++;
2741
2742       if (c < 0)
2743         {
2744           /* Handle an annotation.  -C is the total number of
                  elements of the annotation (this one included), and
                  the element that follows tells its kind.  */
2745           switch (*charbuf)
2746             {
2747             case CODING_ANNOTATE_COMPOSITION_MASK:
2748               /* Not yet implemented.  */
2749               break;
2750             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): this index is relative to the kind
                      element at charbuf[0]; confirm it against the
                      layout written by ADD_CHARSET_DATA.  A charset
                      outside CHARSET_LIST is not used as preferred.  */
2751               preferred_charset_id = charbuf[3];
2752               if (preferred_charset_id >= 0
2753                   && NILP (Fmemq (make_number (preferred_charset_id),
2754                                   charset_list)))
2755                 preferred_charset_id = -1;
2756               break;
2757             default:
2758               abort ();
2759             }
               /* Skip the rest of the annotation.  */
2760           charbuf += -c - 1;
2761           continue;
2762         }
2763
2764       if (ASCII_CHAR_P (c))
2765         EMIT_ONE_ASCII_BYTE (c);
2766       else if (CHAR_BYTE8_P (c))
2767         {
2768           c = CHAR_TO_BYTE8 (c);
2769           EMIT_ONE_BYTE (c);
2770         }
2771       else
2772         {
2773           struct charset *charset;
2774           unsigned code;
2775           int dimension;
2776           int emacs_mule_id;
2777           unsigned char leading_codes[2];
2778
               /* Use the preferred charset if it contains C; otherwise
                  look C up in the charset list.  */
2779           if (preferred_charset_id >= 0)
2780             {
2781               charset = CHARSET_FROM_ID (preferred_charset_id);
2782               if (CHAR_CHARSET_P (c, charset))
2783                 code = ENCODE_CHAR (charset, c);
2784               else
2785                 charset = char_charset (c, charset_list, &code);
2786             }
2787           else
2788             charset = char_charset (c, charset_list, &code);
2789           if (! charset)
2790             {
                   /* C cannot be encoded: substitute the default
                      character of the coding system.  */
2791               c = coding->default_char;
2792               if (ASCII_CHAR_P (c))
2793                 {
2794                   EMIT_ONE_ASCII_BYTE (c);
2795                   continue;
2796                 }
                   /* NOTE(review): the result of this second lookup is
                      used below without a check for a null charset.  */
2797               charset = char_charset (c, charset_list, &code);
2798             }
2799           dimension = CHARSET_DIMENSION (charset);
2800           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2801           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2802           EMIT_ONE_BYTE (leading_codes[0]);
2803           if (leading_codes[1])
2804             EMIT_ONE_BYTE (leading_codes[1]);
               /* The code point follows with the high bit of each of
                  its bytes set: one byte for a charset of dimension
                  1, two bytes otherwise.  */
2805           if (dimension == 1)
2806             EMIT_ONE_BYTE (code | 0x80);
2807           else
2808             {
2809               code |= 0x8080;
2810               EMIT_ONE_BYTE (code >> 8);
2811               EMIT_ONE_BYTE (code & 0xFF);
2812             }
2813         }
2814     }
2815   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2816   coding->produced_char += produced_chars;
2817   coding->produced = dst - coding->destination;
2818   return 0;
2819 }
2820
2821 \f
2822 /*** 7. ISO2022 handlers ***/
2823
2824 /* The following note describes the coding system ISO2022 briefly.
2825    Since the intention of this note is to help understand the
2826    functions in this file, some parts are NOT ACCURATE or are OVERLY
2827    SIMPLIFIED.  For thorough understanding, please refer to the
2828    original document of ISO2022.  This is equivalent to the standard
2829    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2830
2831    ISO2022 provides many mechanisms to encode several character sets
2832    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2833    is encoded using bytes less than 128.  This may make the encoded
2834    text a little bit longer, but the text passes more easily through
2835    several types of gateway, some of which strip off the MSB (Most
2836    Significant Bit).
2837
2838    There are two kinds of character sets: control character sets and
2839    graphic character sets.  The former contain control characters such
2840    as `newline' and `escape' to provide control functions (control
2841    functions are also provided by escape sequences).  The latter
2842    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2843    two control character sets and many graphic character sets.
2844
2845    Graphic character sets are classified into one of the following
2846    four classes, according to the number of bytes (DIMENSION) and
2847    number of characters in one dimension (CHARS) of the set:
2848    - DIMENSION1_CHARS94
2849    - DIMENSION1_CHARS96
2850    - DIMENSION2_CHARS94
2851    - DIMENSION2_CHARS96
2852
2853    In addition, each character set is assigned an identification tag,
2854    unique for each set, called the "final character" (denoted as <F>
2855    hereafter).  The <F> of each character set is decided by ECMA(*)
2856    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2857    (0x30..0x3F are for private use only).
2858
2859    Note (*): ECMA = European Computer Manufacturers Association
2860
2861    Here are examples of graphic character sets [NAME(<F>)]:
2862         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2863         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2864         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2865         o DIMENSION2_CHARS96 -- none for the moment
2866
2867    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2868         C0 [0x00..0x1F] -- control character plane 0
2869         GL [0x20..0x7F] -- graphic character plane 0
2870         C1 [0x80..0x9F] -- control character plane 1
2871         GR [0xA0..0xFF] -- graphic character plane 1
2872
2873    A control character set is directly designated and invoked to C0 or
2874    C1 by an escape sequence.  The most common case is that:
2875    - ISO646's  control character set is designated/invoked to C0, and
2876    - ISO6429's control character set is designated/invoked to C1,
2877    and usually these designations/invocations are omitted in encoded
2878    text.  In a 7-bit environment, only C0 can be used, and a control
2879    character for C1 is encoded by an appropriate escape sequence to
2880    fit into the environment.  All control characters for C1 are
2881    defined to have corresponding escape sequences.
2882
2883    A graphic character set is at first designated to one of four
2884    graphic registers (G0 through G3), then these graphic registers are
2885    invoked to GL or GR.  These designations and invocations can be
2886    done independently.  The most common case is that G0 is invoked to
2887    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2888    these invocations and designations are omitted in encoded text.
2889    In a 7-bit environment, only GL can be used.
2890
2891    When a graphic character set of CHARS94 is invoked to GL, codes
2892    0x20 and 0x7F of the GL area work as control characters SPACE and
2893    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2894    be used.
2895
2896    There are two ways of invocation: locking-shift and single-shift.
2897    With locking-shift, the invocation lasts until the next different
2898    invocation, whereas with single-shift, the invocation affects the
2899    following character only and doesn't affect the locking-shift
2900    state.  Invocations are done by the following control characters or
2901    escape sequences:
2902
2903    ----------------------------------------------------------------------
2904    abbrev  function                  cntrl escape seq   description
2905    ----------------------------------------------------------------------
2906    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2907    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2908    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2909    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2910    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2911    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2912    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2913    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2914    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2915    ----------------------------------------------------------------------
2916    (*) These are not used by any known coding system.
2917
2918    Control characters for these functions are defined by macros
2919    ISO_CODE_XXX in `coding.h'.
2920
2921    Designations are done by the following escape sequences:
2922    ----------------------------------------------------------------------
2923    escape sequence      description
2924    ----------------------------------------------------------------------
2925    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2926    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2927    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2928    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2929    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2930    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2931    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2932    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2933    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2934    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2935    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2936    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2937    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2938    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2939    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2940    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2941    ----------------------------------------------------------------------
2942
2943    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2944    of dimension 1, chars 94, and final character <F>, etc...
2945
2946    Note (*): Although these designations are not allowed in ISO2022,
2947    Emacs accepts them on decoding, and produces them on encoding
2948    CHARS96 character sets in a coding system which is characterized as
2949    7-bit environment, non-locking-shift, and non-single-shift.
2950
2951    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2952    '(' must be omitted.  We refer to this as "short-form" hereafter.
2953
2954    Now you may notice that there are a lot of ways of encoding the
2955    same multilingual text in ISO2022.  Actually, there exist many
2956    coding systems such as Compound Text (used in X11's inter client
2957    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2958    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2959    localized platforms), and all of these are variants of ISO2022.
2960
2961    In addition to the above, Emacs handles two more kinds of escape
2962    sequences: ISO6429's direction specification and Emacs' private
2963    sequence for specifying character composition.
2964
2965    ISO6429's direction specification takes the following form:
2966         o CSI ']'      -- end of the current direction
2967         o CSI '0' ']'  -- end of the current direction
2968         o CSI '1' ']'  -- start of left-to-right text
2969         o CSI '2' ']'  -- start of right-to-left text
2970    The control character CSI (0x9B: control sequence introducer) is
2971    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2972
2973    Character composition specification takes the following form:
2974         o ESC '0' -- start relative composition
2975         o ESC '1' -- end composition
2976         o ESC '2' -- start rule-base composition (*)
2977         o ESC '3' -- start relative composition with alternate chars  (**)
2978         o ESC '4' -- start rule-base composition with alternate chars  (**)
2979   Since these are not standard escape sequences of any ISO standard,
2980   the use of them with these meanings is restricted to Emacs only.
2981
2982   (*) This form is used only in Emacs 20.7 and older versions,
2983   but newer versions can safely decode it.
2984   (**) This form is used only in Emacs 21.1 and newer versions,
2985   and older versions can't decode it.
2986
2987   Here's a list of example usages of these composition escape
2988   sequences (categorized by `enum composition_method').
2989
2990   COMPOSITION_RELATIVE:
2991         ESC 0 CHAR [ CHAR ] ESC 1
2992   COMPOSITION_WITH_RULE:
2993         ESC 2 CHAR [ RULE CHAR ] ESC 1
2994   COMPOSITION_WITH_ALTCHARS:
2995         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2996   COMPOSITION_WITH_RULE_ALTCHARS:
2997         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2998
/* Table of 256 `enum iso_code_class_type' values -- presumably one
   for each possible byte value, indexed by the byte (TODO confirm;
   neither its initialization nor any use is visible nearby).  */
2999 enum iso_code_class_type iso_code_class[256];
3000
/* Return nonzero if the charset whose id is ID may be used with
   CODING (a pointer to a coding system): ID must lie in the range
   0 .. CODING->max_charset_id and the ID'th byte of
   CODING->safe_charsets must not be 255, the value with which
   setup_iso_safe_charsets fills the table before marking the usable
   charsets.

   A negative ID (as returned by a failed charset lookup) yields 0
   instead of indexing before the start of the table.  ID and CODING
   are evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
3004
3005
/* Nonzero if CODING_ISO_INITIAL yields a non-negative value for
   index 1 of the coding system registered for detection category
   CATEGORY (presumably the initial designation of graphic register
   G1, the one shift-out invokes -- confirm against that macro).  */
#define SHIFT_OUT_OK(category)  \
  (! (CODING_ISO_INITIAL (&coding_categories[category], 1) < 0))
3008
3009 static void
3010 setup_iso_safe_charsets (attrs)
3011      Lisp_Object attrs;
3012 {
3013   Lisp_Object charset_list, safe_charsets;
3014   Lisp_Object request;
3015   Lisp_Object reg_usage;
3016   Lisp_Object tail;
3017   int reg94, reg96;
3018   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
3019   int max_charset_id;
3020
3021   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3022   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
3023       && ! EQ (charset_list, Viso_2022_charset_list))
3024     {
3025       CODING_ATTR_CHARSET_LIST (attrs)
3026         = charset_list = Viso_2022_charset_list;
3027       ASET (attrs, coding_attr_safe_charsets, Qnil);
3028     }
3029
3030   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3031     return;
3032
3033   max_charset_id = 0;
3034   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3035     {
3036       int id = XINT (XCAR (tail));
3037       if (max_charset_id < id)
3038         max_charset_id = id;
3039     }
3040
3041   safe_charsets = make_uninit_string (max_charset_id + 1);
3042   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3043   request = AREF (attrs, coding_attr_iso_request);
3044   reg_usage = AREF (attrs, coding_attr_iso_usage);
3045   reg94 = XINT (XCAR (reg_usage));
3046   reg96 = XINT (XCDR (reg_usage));
3047
3048   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3049     {
3050       Lisp_Object id;
3051       Lisp_Object reg;
3052       struct charset *charset;
3053
3054       id = XCAR (tail);
3055       charset = CHARSET_FROM_ID (XINT (id));
3056       reg = Fcdr (Fassq (id, request));
3057       if (! NILP (reg))
3058         SSET (safe_charsets, XINT (id), XINT (reg));
3059       else if (charset->iso_chars_96)
3060         {
3061           if (reg96 < 4)
3062             SSET (safe_charsets, XINT (id), reg96);
3063         }
3064       else
3065         {
3066           if (reg94 < 4)
3067             SSET (safe_charsets, XINT (id), reg94);
3068         }
3069     }
3070   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3071 }
3072
3073
3074 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3075    Check if a text is encoded in one of the ISO-2022 based coding systems.
3076    If it is, return 1, else return 0.  */
3077
3078 static int
3079 detect_coding_iso_2022 (coding, detect_info)
3080      struct coding_system *coding;
3081      struct coding_detection_info *detect_info;
3082 {
3083   const unsigned char *src = coding->source, *src_base = src;
3084   const unsigned char *src_end = coding->source + coding->src_bytes;
3085   int multibytep = coding->src_multibyte;
3086   int single_shifting = 0;
3087   int id;
3088   int c, c1;
3089   int consumed_chars = 0;
3090   int i;
3091   int rejected = 0;
3092   int found = 0;
3093   int composition_count = -1;
3094
3095   detect_info->checked |= CATEGORY_MASK_ISO;
3096
3097   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3098     {
3099       struct coding_system *this = &(coding_categories[i]);
3100       Lisp_Object attrs, val;
3101
3102       if (this->id < 0)
3103         continue;
3104       attrs = CODING_ID_ATTRS (this->id);
3105       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3106           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3107         setup_iso_safe_charsets (attrs);
3108       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3109       this->max_charset_id = SCHARS (val) - 1;
3110       this->safe_charsets = SDATA (val);
3111     }
3112
3113   /* A coding system of this category is always ASCII compatible.  */
3114   src += coding->head_ascii;
3115
3116   while (rejected != CATEGORY_MASK_ISO)
3117     {
3118       src_base = src;
3119       ONE_MORE_BYTE (c);
3120       switch (c)
3121         {
3122         case ISO_CODE_ESC:
3123           if (inhibit_iso_escape_detection)
3124             break;
3125           single_shifting = 0;
3126           ONE_MORE_BYTE (c);
3127           if (c >= '(' && c <= '/')
3128             {
3129               /* Designation sequence for a charset of dimension 1.  */
3130               ONE_MORE_BYTE (c1);
3131               if (c1 < ' ' || c1 >= 0x80
3132                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3133                 /* Invalid designation sequence.  Just ignore.  */
3134                 break;
3135             }
3136           else if (c == '$')
3137             {
3138               /* Designation sequence for a charset of dimension 2.  */
3139               ONE_MORE_BYTE (c);
3140               if (c >= '@' && c <= 'B')
3141                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3142                 id = iso_charset_table[1][0][c];
3143               else if (c >= '(' && c <= '/')
3144                 {
3145                   ONE_MORE_BYTE (c1);
3146                   if (c1 < ' ' || c1 >= 0x80
3147                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3148                     /* Invalid designation sequence.  Just ignore.  */
3149                     break;
3150                 }
3151               else
3152                 /* Invalid designation sequence.  Just ignore it.  */
3153                 break;
3154             }
3155           else if (c == 'N' || c == 'O')
3156             {
3157               /* ESC <Fe> for SS2 or SS3.  */
3158               single_shifting = 1;
3159               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3160               break;
3161             }
3162           else if (c == '1')
3163             {
3164               /* End of composition.  */
3165               if (composition_count < 0
3166                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3167                 /* Invalid */
3168                 break;
3169               composition_count = -1;
3170               found |= CATEGORY_MASK_ISO;
3171             }
3172           else if (c >= '0' && c <= '4')
3173             {
3174               /* ESC <Fp> for start/end composition.  */
3175               composition_count = 0;
3176               break;
3177             }
3178           else
3179             {
3180               /* Invalid escape sequence.  Just ignore it.  */
3181               break;
3182             }
3183
3184           /* We found a valid designation sequence for CHARSET.  */
3185           rejected |= CATEGORY_MASK_ISO_8BIT;
3186           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3187                               id))
3188             found |= CATEGORY_MASK_ISO_7;
3189           else
3190             rejected |= CATEGORY_MASK_ISO_7;
3191           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3192                               id))
3193             found |= CATEGORY_MASK_ISO_7_TIGHT;
3194           else
3195             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3196           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3197                               id))
3198             found |= CATEGORY_MASK_ISO_7_ELSE;
3199           else
3200             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3201           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3202                               id))
3203             found |= CATEGORY_MASK_ISO_8_ELSE;
3204           else
3205             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3206           break;
3207
3208         case ISO_CODE_SO:
3209         case ISO_CODE_SI:
3210           /* Locking shift out/in.  */
3211           if (inhibit_iso_escape_detection)
3212             break;
3213           single_shifting = 0;
3214           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3215           break;
3216
3217         case ISO_CODE_CSI:
3218           /* Control sequence introducer.  */
3219           single_shifting = 0;
3220           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3221           found |= CATEGORY_MASK_ISO_8_ELSE;
3222           goto check_extra_latin;
3223
3224         case ISO_CODE_SS2:
3225         case ISO_CODE_SS3:
3226           /* Single shift.   */
3227           if (inhibit_iso_escape_detection)
3228             break;
3229           single_shifting = 0;
3230           rejected |= CATEGORY_MASK_ISO_7BIT;
3231           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3232               & CODING_ISO_FLAG_SINGLE_SHIFT)
3233             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3234           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3235               & CODING_ISO_FLAG_SINGLE_SHIFT)
3236             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3237           if (single_shifting)
3238             break;
3239           goto check_extra_latin;
3240
3241         default:
3242           if (c < 0)
3243             continue;
3244           if (c < 0x80)
3245             {
3246               if (composition_count >= 0)
3247                 composition_count++;
3248               single_shifting = 0;
3249               break;
3250             }
3251           if (c >= 0xA0)
3252             {
3253               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3254               found |= CATEGORY_MASK_ISO_8_1;
3255               /* Check the length of succeeding codes of the range
3256                  0xA0..0FF.  If the byte length is even, we include
3257                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3258                  only when we are not single shifting.  */
3259               if (! single_shifting
3260                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3261                 {
3262                   int i = 1;
3263                   while (src < src_end)
3264                     {
3265                       src_base = src;
3266                       ONE_MORE_BYTE (c);
3267                       if (c < 0xA0)
3268                         {
3269                           src = src_base;
3270                           break;
3271                         }
3272                       i++;
3273                     }
3274
3275                   if (i & 1 && src < src_end)
3276                     {
3277                       rejected |= CATEGORY_MASK_ISO_8_2;
3278                       if (composition_count >= 0)
3279                         composition_count += i;
3280                     }
3281                   else
3282                     {
3283                       found |= CATEGORY_MASK_ISO_8_2;
3284                       if (composition_count >= 0)
3285                         composition_count += i / 2;
3286                     }
3287                 }
3288               break;
3289             }
3290         check_extra_latin:
3291           single_shifting = 0;
3292           if (! VECTORP (Vlatin_extra_code_table)
3293               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3294             {
3295               rejected = CATEGORY_MASK_ISO;
3296               break;
3297             }
3298           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3299               & CODING_ISO_FLAG_LATIN_EXTRA)
3300             found |= CATEGORY_MASK_ISO_8_1;
3301           else
3302             rejected |= CATEGORY_MASK_ISO_8_1;
3303           rejected |= CATEGORY_MASK_ISO_8_2;
3304         }
3305     }
3306   detect_info->rejected |= CATEGORY_MASK_ISO;
3307   return 0;
3308
3309  no_more_source:
3310   detect_info->rejected |= rejected;
3311   detect_info->found |= (found & ~rejected);
3312   return 1;
3313 }
3314
3315
3316 /* Record in CODING the designation of a charset to graphic register
3317    REG.  The charset is looked up with ISO_CHARSET_TABLE (DIM,
        CHARS_96, FINAL), FINAL being the final byte of the designation
        sequence.

        If FINAL is outside '0'..127, the lookup yields a negative id, or
        the charset fails SAFE_CHARSET_P for CODING, REG is marked with -2
        and CHARS_96 is set to -1.  Otherwise the charset id is stored in
        REG, after replacing JISX0201-Roman by ASCII when
        CODING_ISO_FLAG_USE_ROMAN is set and JISX0208.1978 by JISX0208
        when CODING_ISO_FLAG_USE_OLDJIS is set.

        CHARS_96 is assigned to, so it must be an lvalue; a value of -1 on
        exit tells the caller that the escape sequence should be kept.
        `coding' is a variable of the scope where the macro is expanded.  */
3318 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3319   do {                                                                  \
3320     int id, prev;                                                       \
3321                                                                         \
3322     if (final < '0' || final >= 128                                     \
3323         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3324         || !SAFE_CHARSET_P (coding, id))                                \
3325       {                                                                 \
3326         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3327         chars_96 = -1;                                                  \
3328         break;                                                          \
3329       }                                                                 \
3330     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3331     if (id == charset_jisx0201_roman)                                   \
3332       {                                                                 \
3333         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3334           id = charset_ascii;                                           \
3335       }                                                                 \
3336     else if (id == charset_jisx0208_1978)                               \
3337       {                                                                 \
3338         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3339           id = charset_jisx0208;                                        \
3340       }                                                                 \
3341     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3342     /* If there was an invalid designation to REG previously, and this  \
3343        designation is ASCII to REG, we should keep this designation     \
3344        sequence.  */                                                    \
3345     if (prev == -2 && id == charset_ascii)                              \
3346       chars_96 = -1;                                                    \
3347   } while (0)
3348
3349
3350 /* Handle these composition sequences (ALT: alternate char):
3351
3352    (1) relative composition: ESC 0 CHAR ... ESC 1
3353    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3354    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3355    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3356
3357    When the start sequence (ESC 0/2/3/4) is found, this annotation
3358    header is produced.
3359
3360         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3361
3362    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3363    produced until the end sequence (ESC 1) is found:
3364
3365    (1) CHAR ... CHAR
3366    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3367    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3368    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3369
3370    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3371    annotation header are updated as below:
3372
3373    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3374    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3375    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3376    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3377
3378    If an error is found while composing, the five slots of the
3379    annotation header are changed to:
3380
3381         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3382
3383    and the sequence [ -2 DECODED-RULE ] is changed to the original
3384    byte sequence as below:
3385         o the original byte sequence is B: [ B -1 ]
3386         o the original byte sequence is B1 B2: [ B1 B2 ]
3387    and the sequence [ -1 -1 ] is changed to the original byte
3388    sequence:
3389         [ ESC '0' ]
3390 */
3391
3392 /* Decode a composition rule whose first byte is C1 (a variable of
3393    the expansion site), reading one more byte from the source when
3394    the rule is in the two-byte format.  Set RULE to the encoded
3395    composition rule and NBYTES to the number of bytes it occupied.

        When C1 - 32 is below 81 the rule is in the one-byte format used
        before ver.21: it packs two values as GREF * 9 + NREF, and a
        value of 4 is remapped to 10 before encoding.  Larger values
        select the two-byte format; a rule that encodes to a
        non-negative value then gets 0x100 added so that it can be told
        apart from the old format.

        If the rule is invalid, RULE is left with some negative value.
        When C1 is below 32 the macro exits before NBYTES is assigned.
        NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
        can transfer control out of this macro when the source runs
        out -- confirm.  */
3396
3397 #define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
3398   do {                                                                  \
3399     rule = c1 - 32;                                                     \
3400     if (rule < 0)                                                       \
3401       break;                                                            \
3402     if (rule < 81)              /* old format (before ver.21) */        \
3403       {                                                                 \
3404         int gref = (rule) / 9;                                          \
3405         int nref = (rule) % 9;                                          \
3406         if (gref == 4) gref = 10;                                       \
3407         if (nref == 4) nref = 10;                                       \
3408         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3409         nbytes = 1;                                                     \
3410       }                                                                 \
3411     else                        /* new format (after ver.21) */         \
3412       {                                                                 \
3413         int c;                                                          \
3414                                                                         \
3415         ONE_MORE_BYTE (c);                                              \
3416         rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32);             \
3417         if (rule >= 0)                                                  \
3418           rule += 0x100;   /* to distinguish it from the old format */  \
3419         nbytes = 2;                                                     \
3420       }                                                                 \
3421   } while (0)
3422
/* Turn the encoded composition rule RULE back into the byte form that
   DECODE_COMPOSITION_RULE reads, storing it in charbuf[idx] and
   charbuf[idx + 1].

   A rule below 0x100 is in the old one-byte format: the byte goes to
   charbuf[idx], charbuf[idx + 1] is filled with -1, and new_chars
   grows by one.  Otherwise the two bytes of the new format are stored
   and new_chars grows by two.

   `charbuf', `idx' and `new_chars' are variables of the expansion
   site.  RULE is evaluated only before the two slots are written, so
   it may itself be one of those slots.  NOTE(review): the divisor 12
   presumably mirrors COMPOSITION_ENCODE_RULE, which is defined
   elsewhere -- confirm.  */
3423 #define ENCODE_COMPOSITION_RULE(rule)                           \
3424   do {                                                          \
3425     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3426                                                                 \
3427     if (rule < 0x100)           /* old format */                \
3428       {                                                         \
3429         if (gref == 10) gref = 4;                               \
3430         if (nref == 10) nref = 4;                               \
3431         charbuf[idx] = 32 + gref * 9 + nref;                    \
3432         charbuf[idx + 1] = -1;                                  \
3433         new_chars++;                                            \
3434       }                                                         \
3435     else                                /* new format */        \
3436       {                                                         \
3437         charbuf[idx] = 32 + 81 + gref;                          \
3438         charbuf[idx + 1] = 32 + nref;                           \
3439         new_chars += 2;                                         \
3440       }                                                         \
3441   } while (0)
3442
3443 /* Finish the current composition as invalid.

        CHARBUF points just past the data stored so far for the
        composition, so the annotation header sits at
        CHARBUF[-CMP_STATUS->length].  The header and any rule or
        separator markers after it are rewritten in place so that the
        buffer holds the original byte sequence again, and
        CMP_STATUS->state is reset to COMPOSING_NO.

        Return CMP_STATUS->nchars plus the number of slots restored from
        rule markers and `ESC 0' separators; MAYBE_FINISH_COMPOSITION
        adds this value to char_offset.  */
3444
3445 static int finish_composition P_ ((int *, struct composition_status *));
3446
3447 static int
3448 finish_composition (charbuf, cmp_status)
3449      int *charbuf;
3450      struct composition_status *cmp_status;
3451 {
       /* `idx' and `new_chars' are referenced by name inside
          ENCODE_COMPOSITION_RULE; do not rename them.  */
3452   int idx = - cmp_status->length;
3453   int new_chars;
3454
3455   /* Recover the original ESC sequence in the first two header slots.  */
3456   charbuf[idx++] = ISO_CODE_ESC;
3457   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3458                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3459                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3460                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3461                     : '4');
       /* Overwrite the remaining three header slots.  NOTE(review):
          presumably the consumer of CHARBUF skips these negative
          entries -- confirm.  */
3462   charbuf[idx++] = -2;
3463   charbuf[idx++] = 0;
3464   charbuf[idx++] = -1;
3465   new_chars = cmp_status->nchars;
       /* Only the data of the methods compared here is scanned for
          markers.  NOTE(review): this comparison relies on the order of
          the method constants -- confirm against their definition.  */
3466   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3467     for (; idx < 0; idx++)
3468       {
3469         int elt = charbuf[idx];
3470
3471         if (elt == -2)
3472           {
                 /* [ -2 RULE ]: put the rule's original byte(s) back into
                    these two slots, then step over the second one.  */
3473             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3474             idx++;
3475           }
3476         else if (elt == -1)
3477           {
                 /* [ -1 -1 ]: the separator stands for `ESC 0'.  */
3478             charbuf[idx++] = ISO_CODE_ESC;
3479             charbuf[idx] = '0';
3480             new_chars += 2;
3481           }
3482       }
3483   cmp_status->state = COMPOSING_NO;
3484   return new_chars;
3485 }
3486
3487 /* If characters are under composition, finish the composition as
        invalid with finish_composition and advance char_offset by the
        count it returns.  `cmp_status', `charbuf' and `char_offset' are
        variables of the expansion site.  */
3488 #define MAYBE_FINISH_COMPOSITION()                              \
3489   do {                                                          \
3490     if (cmp_status->state != COMPOSING_NO)                      \
3491       char_offset += finish_composition (charbuf, cmp_status);  \
3492   } while (0)
3493
3494 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
        C1 is the byte that followed ESC.
3495
3496    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3497    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3498    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3499    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3500
        If C1 is '0' while the components of an ESC 3 or ESC 4
        composition are being read, the sequence is the separator between
        components and composed chars: store the pair [ -1 -1 ], grow the
        recorded length by two and switch to COMPOSING_CHAR.

        Otherwise finish any pending composition as invalid, record the
        new method and initial state, reset the counters, set
        coding->annotated, and
3501    produce this annotation sequence now (five slots, the same number
        that finish_composition rewrites on error):
3502
3503    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        `cmp_status', `charbuf', `char_offset' and `coding' are variables
        of the expansion site.
3504 */
3505
3506 #define DECODE_COMPOSITION_START(c1)                                       \
3507   do {                                                                     \
3508     if (c1 == '0'                                                          \
3509         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3510              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3511             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3512                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3513       {                                                                    \
3514         *charbuf++ = -1;                                                   \
3515         *charbuf++= -1;                                                    \
3516         cmp_status->state = COMPOSING_CHAR;                                \
3517         cmp_status->length += 2;                                           \
3518       }                                                                    \
3519     else                                                                   \
3520       {                                                                    \
3521         MAYBE_FINISH_COMPOSITION ();                                       \
3522         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3523                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3524                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3525                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3526         cmp_status->state                                                  \
3527           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3528         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3529         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3530         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3531         coding->annotated = 1;                                             \
3532       }                                                                    \
3533   } while (0)
3534
3535
3536 /* Handle composition end sequence ESC 1.

        The sequence is invalid when no CHAR has been stored, or when the
        state does not fit the method: COMPOSITION_WITH_RULE must not end
        in COMPOSING_CHAR state, and every other method must end in it.
        In that case the pending composition is finished as invalid and
        control goes to the label `invalid_code' of the expansion site.

        Otherwise the annotation header at charbuf[-cmp_status->length]
        is completed: slot 0 (the negated LENGTH) is lowered by the slots
        used for ALT components (ncomps + 2 for
        COMPOSITION_WITH_ALTCHARS, ncomps * 3 for
        COMPOSITION_WITH_RULE_ALTCHARS), slot 2 receives the number of
        composed chars, char_offset advances by that number, and the
        state returns to COMPOSING_NO.  */
3537
3538 #define DECODE_COMPOSITION_END()                                        \
3539   do {                                                                  \
3540     if (cmp_status->nchars == 0                                         \
3541         || ((cmp_status->state == COMPOSING_CHAR)                       \
3542             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3543       {                                                                 \
3544         MAYBE_FINISH_COMPOSITION ();                                    \
3545         goto invalid_code;                                              \
3546       }                                                                 \
3547     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3548       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3549     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3550       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3551     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3552     char_offset += cmp_status->nchars;                                  \
3553     cmp_status->state = COMPOSING_NO;                                   \
3554   } while (0)
3555
3556 /* Store a composition rule RULE in charbuf, and update cmp_status.
        The rule is written as the pair [ -2 RULE ], the recorded length
        grows by two, and the state is decremented.  NOTE(review): the
        decrement presumably steps from a *_RULE state back to the
        matching *_CHAR state, which relies on the order of the state
        constants -- confirm against their definition.  `charbuf' and
        `cmp_status' are variables of the expansion site.  */
3557
3558 #define STORE_COMPOSITION_RULE(rule)    \
3559   do {                                  \
3560     *charbuf++ = -2;                    \
3561     *charbuf++ = rule;                  \
3562     cmp_status->length += 2;            \
3563     cmp_status->state--;                \
3564   } while (0)
3565
3566 /* Store a composed char or a component char C in charbuf, and update
3567    cmp_status: the recorded length grows by one, and the char is
        counted in nchars when the state is COMPOSING_CHAR, in ncomps
        otherwise.

        For COMPOSITION_WITH_RULE, and for COMPOSITION_WITH_RULE_ALTCHARS
        while in COMPOSING_COMPONENT_CHAR state, the state is then
        incremented.  NOTE(review): the increment presumably moves to
        the matching *_RULE state so that the next item is read as a
        rule; this relies on the order of the state constants --
        confirm against their definition.  `charbuf' and `cmp_status'
        are variables of the expansion site.  */
3568
3569 #define STORE_COMPOSITION_CHAR(c)                                       \
3570   do {                                                                  \
3571     *charbuf++ = (c);                                                   \
3572     cmp_status->length++;                                               \
3573     if (cmp_status->state == COMPOSING_CHAR)                            \
3574       cmp_status->nchars++;                                             \
3575     else                                                                \
3576       cmp_status->ncomps++;                                             \
3577     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3578         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3579             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3580       cmp_status->state++;                                              \
3581   } while (0)
3582
3583
3584 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3585
3586 static void
3587 decode_coding_iso_2022 (coding)
3588      struct coding_system *coding;
3589 {
3590   const unsigned char *src = coding->source + coding->consumed;
3591   const unsigned char *src_end = coding->source + coding->src_bytes;
3592   const unsigned char *src_base;
3593   int *charbuf = coding->charbuf + coding->charbuf_used;
3594   /* We may produce two annocations (charset and composition) in one
3595      loop and one more charset annocation at the end.  */
3596   int *charbuf_end
3597     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3598   int consumed_chars = 0, consumed_chars_base;
3599   int multibytep = coding->src_multibyte;
3600   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3601   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3602   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3603   int charset_id_2, charset_id_3;
3604   struct charset *charset;
3605   int c;
3606   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3607   Lisp_Object attrs, charset_list;
3608   int char_offset = coding->produced_char;
3609   int last_offset = char_offset;
3610   int last_id = charset_ascii;
3611   int eol_crlf =
3612     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3613   int byte_after_cr = -1;
3614   int i;
3615
3616   CODING_GET_INFO (coding, attrs, charset_list);
3617   setup_iso_safe_charsets (attrs);
3618   /* Charset list may have been changed.  */
3619   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3620   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3621
3622   if (cmp_status->state != COMPOSING_NO)
3623     {
3624       for (i = 0; i < cmp_status->length; i++)
3625         *charbuf++ = cmp_status->carryover[i];
3626       coding->annotated = 1;
3627     }
3628
3629   while (1)
3630     {
3631       int c1, c2, c3;
3632
3633       src_base = src;
3634       consumed_chars_base = consumed_chars;
3635
3636       if (charbuf >= charbuf_end)
3637         {
3638           if (byte_after_cr >= 0)
3639             src_base--;
3640           break;
3641         }
3642
3643       if (byte_after_cr >= 0)
3644         c1 = byte_after_cr, byte_after_cr = -1;
3645       else
3646         ONE_MORE_BYTE (c1);
3647       if (c1 < 0)
3648         goto invalid_code;
3649
3650       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3651         {
3652           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3653           char_offset++;
3654           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3655           continue;
3656         }
3657
3658       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3659         {
3660           if (c1 == ISO_CODE_ESC)
3661             {
3662               if (src + 1 >= src_end)
3663                 goto no_more_source;
3664               *charbuf++ = ISO_CODE_ESC;
3665               char_offset++;
3666               if (src[0] == '%' && src[1] == '@')
3667                 {
3668                   src += 2;
3669                   consumed_chars += 2;
3670                   char_offset += 2;
3671                   /* We are sure charbuf can contain two more chars. */
3672                   *charbuf++ = '%';
3673                   *charbuf++ = '@';
3674                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3675                 }
3676             }
3677           else
3678             {
3679               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3680               char_offset++;
3681             }
3682           continue;
3683         }
3684
3685       if ((cmp_status->state == COMPOSING_RULE
3686            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3687           && c1 != ISO_CODE_ESC)
3688         {
3689           int rule, nbytes;
3690
3691           DECODE_COMPOSITION_RULE (rule, nbytes);
3692           if (rule < 0)
3693             goto invalid_code;
3694           STORE_COMPOSITION_RULE (rule);
3695           continue;
3696         }
3697
3698       /* We produce at most one character.  */
3699       switch (iso_code_class [c1])
3700         {
3701         case ISO_0x20_or_0x7F:
3702           if (charset_id_0 < 0
3703               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3704             /* This is SPACE or DEL.  */
3705             charset = CHARSET_FROM_ID (charset_ascii);
3706           else
3707             charset = CHARSET_FROM_ID (charset_id_0);
3708           break;
3709
3710         case ISO_graphic_plane_0:
3711           if (charset_id_0 < 0)
3712             charset = CHARSET_FROM_ID (charset_ascii);
3713           else
3714             charset = CHARSET_FROM_ID (charset_id_0);
3715           break;
3716
3717         case ISO_0xA0_or_0xFF:
3718           if (charset_id_1 < 0
3719               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3720               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3721             goto invalid_code;
3722           /* This is a graphic character, we fall down ... */
3723
3724         case ISO_graphic_plane_1:
3725           if (charset_id_1 < 0)
3726             goto invalid_code;
3727           charset = CHARSET_FROM_ID (charset_id_1);
3728           break;
3729
3730         case ISO_control_0:
3731           if (eol_crlf && c1 == '\r')
3732             ONE_MORE_BYTE (byte_after_cr);
3733           MAYBE_FINISH_COMPOSITION ();
3734           charset = CHARSET_FROM_ID (charset_ascii);
3735           break;
3736
3737         case ISO_control_1:
3738           goto invalid_code;
3739
3740         case ISO_shift_out:
3741           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3742               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3743             goto invalid_code;
3744           CODING_ISO_INVOCATION (coding, 0) = 1;
3745           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3746           continue;
3747
3748         case ISO_shift_in:
3749           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3750             goto invalid_code;
3751           CODING_ISO_INVOCATION (coding, 0) = 0;
3752           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3753           continue;
3754
3755         case ISO_single_shift_2_7:
3756           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3757             goto invalid_code;
3758         case ISO_single_shift_2:
3759           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3760             goto invalid_code;
3761           /* SS2 is handled as an escape sequence of ESC 'N' */
3762           c1 = 'N';
3763           goto label_escape_sequence;
3764
3765         case ISO_single_shift_3:
3766           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3767             goto invalid_code;
3768           /* SS2 is handled as an escape sequence of ESC 'O' */
3769           c1 = 'O';
3770           goto label_escape_sequence;
3771
3772         case ISO_control_sequence_introducer:
3773           /* CSI is handled as an escape sequence of ESC '[' ...  */
3774           c1 = '[';
3775           goto label_escape_sequence;
3776
3777         case ISO_escape:
3778           ONE_MORE_BYTE (c1);
3779         label_escape_sequence:
3780           /* Escape sequences handled here are invocation,
3781              designation, direction specification, and character
3782              composition specification.  */
3783           switch (c1)
3784             {
3785             case '&':           /* revision of following character set */
3786               ONE_MORE_BYTE (c1);
3787               if (!(c1 >= '@' && c1 <= '~'))
3788                 goto invalid_code;
3789               ONE_MORE_BYTE (c1);
3790               if (c1 != ISO_CODE_ESC)
3791                 goto invalid_code;
3792               ONE_MORE_BYTE (c1);
3793               goto label_escape_sequence;
3794
3795             case '$':           /* designation of 2-byte character set */
3796               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3797                 goto invalid_code;
3798               {
3799                 int reg, chars96;
3800
3801                 ONE_MORE_BYTE (c1);
3802                 if (c1 >= '@' && c1 <= 'B')
3803                   {     /* designation of JISX0208.1978, GB2312.1980,
3804                            or JISX0208.1980 */
3805                     reg = 0, chars96 = 0;
3806                   }
3807                 else if (c1 >= 0x28 && c1 <= 0x2B)
3808                   { /* designation of DIMENSION2_CHARS94 character set */
3809                     reg = c1 - 0x28, chars96 = 0;
3810                     ONE_MORE_BYTE (c1);
3811                   }
3812                 else if (c1 >= 0x2C && c1 <= 0x2F)
3813                   { /* designation of DIMENSION2_CHARS96 character set */
3814                     reg = c1 - 0x2C, chars96 = 1;
3815                     ONE_MORE_BYTE (c1);
3816                   }
3817                 else
3818                   goto invalid_code;
3819                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3820                 /* We must update these variables now.  */
3821                 if (reg == 0)
3822                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3823                 else if (reg == 1)
3824                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3825                 if (chars96 < 0)
3826                   goto invalid_code;
3827               }
3828               continue;
3829
3830             case 'n':           /* invocation of locking-shift-2 */
3831               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3832                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3833                 goto invalid_code;
3834               CODING_ISO_INVOCATION (coding, 0) = 2;
3835               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3836               continue;
3837
3838             case 'o':           /* invocation of locking-shift-3 */
3839               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3840                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3841                 goto invalid_code;
3842               CODING_ISO_INVOCATION (coding, 0) = 3;
3843               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3844               continue;
3845
3846             case 'N':           /* invocation of single-shift-2 */
3847               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3848                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3849                 goto invalid_code;
3850               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3851               if (charset_id_2 < 0)
3852                 charset = CHARSET_FROM_ID (charset_ascii);
3853               else
3854                 charset = CHARSET_FROM_ID (charset_id_2);
3855               ONE_MORE_BYTE (c1);
3856               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3857                 goto invalid_code;
3858               break;
3859
3860             case 'O':           /* invocation of single-shift-3 */
3861               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3862                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3863                 goto invalid_code;
3864               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3865               if (charset_id_3 < 0)
3866                 charset = CHARSET_FROM_ID (charset_ascii);
3867               else
3868                 charset = CHARSET_FROM_ID (charset_id_3);
3869               ONE_MORE_BYTE (c1);
3870               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3871                 goto invalid_code;
3872               break;
3873
3874             case '0': case '2': case '3': case '4': /* start composition */
3875               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3876                 goto invalid_code;
3877               if (last_id != charset_ascii)
3878                 {
3879                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3880                   last_id = charset_ascii;
3881                   last_offset = char_offset;
3882                 }
3883               DECODE_COMPOSITION_START (c1);
3884               continue;
3885
3886             case '1':           /* end composition */
3887               if (cmp_status->state == COMPOSING_NO)
3888                 goto invalid_code;
3889               DECODE_COMPOSITION_END ();
3890               continue;
3891
3892             case '[':           /* specification of direction */
3893               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3894                 goto invalid_code;
3895               /* For the moment, nested direction is not supported.
3896                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3897                  left-to-right, and nozero means right-to-left.  */
3898               ONE_MORE_BYTE (c1);
3899               switch (c1)
3900                 {
3901                 case ']':       /* end of the current direction */
3902                   coding->mode &= ~CODING_MODE_DIRECTION;
3903
3904                 case '0':       /* end of the current direction */
3905                 case '1':       /* start of left-to-right direction */
3906                   ONE_MORE_BYTE (c1);
3907                   if (c1 == ']')
3908                     coding->mode &= ~CODING_MODE_DIRECTION;
3909                   else
3910                     goto invalid_code;
3911                   break;
3912
3913                 case '2':       /* start of right-to-left direction */
3914                   ONE_MORE_BYTE (c1);
3915                   if (c1 == ']')
3916                     coding->mode |= CODING_MODE_DIRECTION;
3917                   else
3918                     goto invalid_code;
3919                   break;
3920
3921                 default:
3922                   goto invalid_code;
3923                 }
3924               continue;
3925
3926             case '%':
3927               ONE_MORE_BYTE (c1);
3928               if (c1 == '/')
3929                 {
3930                   /* CTEXT extended segment:
3931                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3932                      We keep these bytes as is for the moment.
3933                      They may be decoded by post-read-conversion.  */
3934                   int dim, M, L;
3935                   int size;
3936
3937                   ONE_MORE_BYTE (dim);
3938                   if (dim < '0' || dim > '4')
3939                     goto invalid_code;
3940                   ONE_MORE_BYTE (M);
3941                   if (M < 128)
3942                     goto invalid_code;
3943                   ONE_MORE_BYTE (L);
3944                   if (L < 128)
3945                     goto invalid_code;
3946                   size = ((M - 128) * 128) + (L - 128);
3947                   if (charbuf + 6 > charbuf_end)
3948                     goto break_loop;
3949                   *charbuf++ = ISO_CODE_ESC;
3950                   *charbuf++ = '%';
3951                   *charbuf++ = '/';
3952                   *charbuf++ = dim;
3953                   *charbuf++ = BYTE8_TO_CHAR (M);
3954                   *charbuf++ = BYTE8_TO_CHAR (L);
3955                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3956                 }
3957               else if (c1 == 'G')
3958                 {
3959                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3960                      ESC % G --UTF-8-BYTES-- ESC % @
3961                      We keep these bytes as is for the moment.
3962                      They may be decoded by post-read-conversion.  */
3963                   if (charbuf + 3 > charbuf_end)
3964                     goto break_loop;
3965                   *charbuf++ = ISO_CODE_ESC;
3966                   *charbuf++ = '%';
3967                   *charbuf++ = 'G';
3968                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3969                 }
3970               else
3971                 goto invalid_code;
3972               continue;
3973               break;
3974
3975             default:
3976               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3977                 goto invalid_code;
3978               {
3979                 int reg, chars96;
3980
3981                 if (c1 >= 0x28 && c1 <= 0x2B)
3982                   { /* designation of DIMENSION1_CHARS94 character set */
3983                     reg = c1 - 0x28, chars96 = 0;
3984                     ONE_MORE_BYTE (c1);
3985                   }
3986                 else if (c1 >= 0x2C && c1 <= 0x2F)
3987                   { /* designation of DIMENSION1_CHARS96 character set */
3988                     reg = c1 - 0x2C, chars96 = 1;
3989                     ONE_MORE_BYTE (c1);
3990                   }
3991                 else
3992                   goto invalid_code;
3993                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3994                 /* We must update these variables now.  */
3995                 if (reg == 0)
3996                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3997                 else if (reg == 1)
3998                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3999                 if (chars96 < 0)
4000                   goto invalid_code;
4001               }
4002               continue;
4003             }
4004         }
4005
4006       if (cmp_status->state == COMPOSING_NO
4007           && charset->id != charset_ascii
4008           && last_id != charset->id)
4009         {
4010           if (last_id != charset_ascii)
4011             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4012           last_id = charset->id;
4013           last_offset = char_offset;
4014         }
4015
4016       /* Now we know CHARSET and 1st position code C1 of a character.
4017          Produce a decoded character while getting 2nd and 3rd
4018          position codes C2, C3 if necessary.  */
4019       if (CHARSET_DIMENSION (charset) > 1)
4020         {
4021           ONE_MORE_BYTE (c2);
4022           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
4023               || ((c1 & 0x80) != (c2 & 0x80)))
4024             /* C2 is not in a valid range.  */
4025             goto invalid_code;
4026           if (CHARSET_DIMENSION (charset) == 2)
4027             c1 = (c1 << 8) | c2;
4028           else
4029             {
4030               ONE_MORE_BYTE (c3);
4031               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
4032                   || ((c1 & 0x80) != (c3 & 0x80)))
4033                 /* C3 is not in a valid range.  */
4034                 goto invalid_code;
4035               c1 = (c1 << 16) | (c2 << 8) | c2;
4036             }
4037         }
4038       c1 &= 0x7F7F7F;
4039       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4040       if (c < 0)
4041         {
4042           MAYBE_FINISH_COMPOSITION ();
4043           for (; src_base < src; src_base++, char_offset++)
4044             {
4045               if (ASCII_BYTE_P (*src_base))
4046                 *charbuf++ = *src_base;
4047               else
4048                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4049             }
4050         }
4051       else if (cmp_status->state == COMPOSING_NO)
4052         {
4053           *charbuf++ = c;
4054           char_offset++;
4055         }
4056       else if ((cmp_status->state == COMPOSING_CHAR
4057                 ? cmp_status->nchars
4058                 : cmp_status->ncomps)
4059                >= MAX_COMPOSITION_COMPONENTS)
4060         {
4061           /* Too long composition.  */
4062           MAYBE_FINISH_COMPOSITION ();
4063           *charbuf++ = c;
4064           char_offset++;
4065         }
4066       else
4067         STORE_COMPOSITION_CHAR (c);
4068       continue;
4069
4070     invalid_code:
4071       MAYBE_FINISH_COMPOSITION ();
4072       src = src_base;
4073       consumed_chars = consumed_chars_base;
4074       ONE_MORE_BYTE (c);
4075       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4076       char_offset++;
4077       coding->errors++;
4078       continue;
4079
4080     break_loop:
4081       break;
4082     }
4083
4084  no_more_source:
4085   if (cmp_status->state != COMPOSING_NO)
4086     {
4087       if (coding->mode & CODING_MODE_LAST_BLOCK)
4088         MAYBE_FINISH_COMPOSITION ();
4089       else
4090         {
4091           charbuf -= cmp_status->length;
4092           for (i = 0; i < cmp_status->length; i++)
4093             cmp_status->carryover[i] = charbuf[i];
4094         }
4095     }
4096   else if (last_id != charset_ascii)
4097     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4098   coding->consumed_char += consumed_chars_base;
4099   coding->consumed = src_base - coding->source;
4100   coding->charbuf_used = charbuf - coding->charbuf;
4101 }
4102
4103
4104 /* ISO2022 encoding stuff.  */
4105
4106 /*
4107    It is not enough to say just "ISO2022" on encoding, we have to
4108    specify more details.  In Emacs, each coding system of ISO2022
4109    variant has the following specifications:
4110         1. Initial designation to G0 thru G3.
4111         2. Allows short-form designation?
4112         3. ASCII should be designated to G0 before control characters?
4113         4. ASCII should be designated to G0 at end of line?
4114         5. 7-bit environment or 8-bit environment?
4115         6. Use locking-shift?
4116         7. Use Single-shift?
4117    And the following two are only for Japanese:
4118         8. Use ASCII in place of JIS0201-1976-Roman?
4119         9. Use JISX0208-1983 in place of JISX0208-1978?
4120    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4121    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4122    details.
4123 */
4124
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  The sequence consists of an optional revision prefix
   (ESC '&' followed by '@' + revision, produced only when CODING has
   CODING_ISO_FLAG_REVISION set and the revision of CHARSET is not
   negative), then ESC, the intermediate byte(s), and finally the
   <final-char> of CHARSET.

   For a charset whose dimension is not 1, '$' is emitted first.  If
   such a charset is a 94-charset whose <final-char> is '@', 'A', or
   'B', REG is 0, and CODING does not have CODING_ISO_FLAG_LONG_FORM
   set, the register-selecting intermediate byte is omitted
   (short-form designation).  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char iso_final = CHARSET_ISO_FINAL (charset);              \
    /* Intermediate bytes selecting G0..G3, indexed by REG.  */         \
    char *inter_94 = "()*+";                                            \
    char *inter_96 = ",-./";                                            \
    int iso_revision                                                    \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (iso_revision >= 0)                                              \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + iso_revision);                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (inter_96[reg]);                          \
        else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM  \
                 || reg != 0                                            \
                 || iso_final < '@' || iso_final > 'B')                 \
          EMIT_ONE_ASCII_BYTE (inter_94[reg]);                          \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int inter_byte = (CHARSET_ISO_CHARS_96 (charset)                \
                          ? inter_96[reg] : inter_94[reg]);             \
        EMIT_ONE_ASCII_BYTE (inter_byte);                               \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (iso_final);                                    \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4172
4173
/* The two macros below emit the ISO2022 single-shift functions.
   Single-shift-2 is produced as the escape sequence ESC 'N' and
   single-shift-3 as ESC 'O' when CODING has CODING_ISO_FLAG_SEVEN_BITS
   set; otherwise the single control codes ISO_CODE_SS2 and
   ISO_CODE_SS3 are produced.  Either macro then marks CODING as being
   in the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4196
4197
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING which graphic register is now invoked to graphic
   plane 0.  */

/* Shift-in: emit ISO_CODE_SI and invoke graphic register 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* Shift-out: emit ISO_CODE_SO and invoke graphic register 1.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Locking-shift-2: emit ESC 'n' and invoke graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Locking-shift-3: emit ESC 'o' and invoke graphic register 3.  The
   final byte must be 'o', not 'n': ESC 'n' is locking-shift-2, and
   the ISO2022 decoder in this file treats ESC 'o' as the invocation
   of locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4228
4229
/* Emit the one-byte code C1 of a character that belongs to the
   DIMENSION1 character set CHARSET.  If CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, CHARSET is
   first replaced by JISX0201-Roman.

   The body loops until the byte has been produced: when CHARSET is
   neither being single-shifted nor invoked to graphic plane 0 or 1,
   encode_invocation_designation is called to emit the necessary
   designation and/or invocation codes and the loop is retried.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_ascii                                             \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))     \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* A pending single shift applies to exactly this character.  */   \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_ONE_BYTE (c1 | 0x80);                                    \
        else                                                            \
          EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    /* Invoked to graphic plane 0: emit with the 8th bit cleared.  */   \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
        break;                                                          \
      }                                                                 \
    /* Invoked to graphic plane 1: emit with the 8th bit set.  */       \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_ONE_BYTE (c1 | 0x80);                                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it     \
       (designating it to some graphic register first if needed),      \
       then go around the loop again to produce the character.  */     \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4272
4273
/* Emit the two-byte code C1, C2 of a character that belongs to the
   DIMENSION2 character set CHARSET.  If CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, CHARSET is
   first replaced by JISX0208.1978.

   The body loops until the bytes have been produced: when CHARSET is
   neither being single-shifted nor invoked to graphic plane 0 or 1,
   encode_invocation_designation is called to emit the necessary
   designation and/or invocation codes and the loop is retried.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* A pending single shift applies to exactly this character.  */   \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    /* Invoked to graphic plane 0: emit with the 8th bits cleared.  */  \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    /* Invoked to graphic plane 1: emit with the 8th bits set.  */      \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it     \
       (designating it to some graphic register first if needed),      \
       then go around the loop again to produce the character.  */     \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4316
4317
/* Encode character C in CHARSET and emit the result.  The code
   obtained from ENCODE_CHAR is emitted as a single byte when the
   dimension of CHARSET is 1; otherwise it is split into its
   second-lowest and lowest bytes and emitted as a two-byte code.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4327
4328
4329 /* Produce designation and invocation codes at a place pointed by DST
4330    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4331    Return new DST.  */
4332
4333 unsigned char *
4334 encode_invocation_designation (charset, coding, dst, p_nchars)
4335      struct charset *charset;
4336      struct coding_system *coding;
4337      unsigned char *dst;
4338      int *p_nchars;
4339 {
4340   int multibytep = coding->dst_multibyte;
4341   int produced_chars = *p_nchars;
4342   int reg;                      /* graphic register number */
4343   int id = CHARSET_ID (charset);
4344
4345   /* At first, check designations.  */
4346   for (reg = 0; reg < 4; reg++)
4347     if (id == CODING_ISO_DESIGNATION (coding, reg))
4348       break;
4349
4350   if (reg >= 4)
4351     {
4352       /* CHARSET is not yet designated to any graphic registers.  */
4353       /* At first check the requested designation.  */
4354       reg = CODING_ISO_REQUEST (coding, id);
4355       if (reg < 0)
4356         /* Since CHARSET requests no special designation, designate it
4357            to graphic register 0.  */
4358         reg = 0;
4359
4360       ENCODE_DESIGNATION (charset, reg, coding);
4361     }
4362
4363   if (CODING_ISO_INVOCATION (coding, 0) != reg
4364       && CODING_ISO_INVOCATION (coding, 1) != reg)
4365     {
4366       /* Since the graphic register REG is not invoked to any graphic
4367          planes, invoke it to graphic plane 0.  */
4368       switch (reg)
4369         {
4370         case 0:                 /* graphic register 0 */
4371           ENCODE_SHIFT_IN;
4372           break;
4373
4374         case 1:                 /* graphic register 1 */
4375           ENCODE_SHIFT_OUT;
4376           break;
4377
4378         case 2:                 /* graphic register 2 */
4379           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4380             ENCODE_SINGLE_SHIFT_2;
4381           else
4382             ENCODE_LOCKING_SHIFT_2;
4383           break;
4384
4385         case 3:                 /* graphic register 3 */
4386           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4387             ENCODE_SINGLE_SHIFT_3;
4388           else
4389             ENCODE_LOCKING_SHIFT_3;
4390           break;
4391         }
4392     }
4393
4394   *p_nchars = produced_chars;
4395   return dst;
4396 }
4397
/* The following three macros produce codes for indicating direction
   of text.  */

/* Emit a control sequence introducer: the escape sequence ESC '['
   when CODING has CODING_ISO_FLAG_SEVEN_BITS set, otherwise the
   single control code ISO_CODE_CSI.  The flag is tested with `&',
   like every other CODING_ISO_FLAG_SEVEN_BITS test in this file;
   comparing the whole flag word with `==' would select the 7-bit
   form only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)


/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by '2' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)


/* Emit the sequence that returns to left-to-right text: a control
   sequence introducer followed by '0' ']'.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4421
4422
/* Produce the invocation and designation codes that bring the
   graphic planes and registers of CODING back to their initial
   state: shift-in if graphic plane 0 does not currently invoke
   register 0, then, for each of the four graphic registers that has
   a non-negative initial charset different from its current
   designation, a designation of that initial charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg = 0;                                                        \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        if (CODING_ISO_INITIAL (coding, reg) >= 0                       \
            && (CODING_ISO_DESIGNATION (coding, reg)                    \
                != CODING_ISO_INITIAL (coding, reg)))                   \
          {                                                             \
            charset                                                     \
              = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));     \
            ENCODE_DESIGNATION (charset, reg, coding);                  \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
4441
4442
4443 /* Produce designation sequences of charsets in the line started from
4444    SRC to a place pointed by DST, and return updated DST.
4445
4446    If the current block ends before any end-of-line, we may fail to
4447    find all the necessary designations.  */
4448
4449 static unsigned char *
4450 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4451      struct coding_system *coding;
4452      int *charbuf, *charbuf_end;
4453      unsigned char *dst;
4454 {
4455   struct charset *charset;
4456   /* Table of charsets to be designated to each graphic register.  */
4457   int r[4];
4458   int c, found = 0, reg;
4459   int produced_chars = 0;
4460   int multibytep = coding->dst_multibyte;
4461   Lisp_Object attrs;
4462   Lisp_Object charset_list;
4463
4464   attrs = CODING_ID_ATTRS (coding->id);
4465   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4466   if (EQ (charset_list, Qiso_2022))
4467     charset_list = Viso_2022_charset_list;
4468
4469   for (reg = 0; reg < 4; reg++)
4470     r[reg] = -1;
4471
4472   while (found < 4)
4473     {
4474       int id;
4475
4476       c = *charbuf++;
4477       if (c == '\n')
4478         break;
4479       charset = char_charset (c, charset_list, NULL);
4480       id = CHARSET_ID (charset);
4481       reg = CODING_ISO_REQUEST (coding, id);
4482       if (reg >= 0 && r[reg] < 0)
4483         {
4484           found++;
4485           r[reg] = id;
4486         }
4487     }
4488
4489   if (found)
4490     {
4491       for (reg = 0; reg < 4; reg++)
4492         if (r[reg] >= 0
4493             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4494           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4495     }
4496
4497   return dst;
4498 }
4499
4500 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
     /* Encode the elements of CODING->charbuf into CODING->destination
        as ISO-2022 text, producing designation and reset sequences as
        directed by CODING_ISO_FLAGS (CODING).  The exit at the end of
        the function records CODING_RESULT_SUCCESS, updates
        CODING->produced and CODING->produced_char, and returns 0.
        NOTE(review): ASSURE_DESTINATION, EMIT_* and ENCODE_* are macros
        defined outside this function; they presumably use the locals
        dst, dst_end, produced_chars, multibytep, charbuf and coding by
        name and may leave the function early -- confirm before renaming
        or reordering anything here.  */
4501
4502 static int
4503 encode_coding_iso_2022 (coding)
4504      struct coding_system *coding;
4505 {
4506   int multibytep = coding->dst_multibyte;
4507   int *charbuf = coding->charbuf;
4508   int *charbuf_end = charbuf + coding->charbuf_used;
4509   unsigned char *dst = coding->destination + coding->produced;
4510   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4511   int safe_room = 16;
       /* Nonzero if designations are to be produced at the beginning of
          a line and CODING_ISO_BOL says we are at such a position (it
          is stored back from this variable at the end).  */
4512   int bol_designation
4513     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4514        && CODING_ISO_BOL (coding));
4515   int produced_chars = 0;
4516   Lisp_Object attrs, eol_type, charset_list;
4517   int ascii_compatible;
4518   int c;
       /* Charset id taken from the latest charset annotation, or -1.  */
4519   int preferred_charset_id = -1;
4520
4521   CODING_GET_INFO (coding, attrs, charset_list);
4522   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
       /* A vector-valued eol_type is treated like Qunix.  */
4523   if (VECTORP (eol_type))
4524     eol_type = Qunix;
4525
4526   setup_iso_safe_charsets (attrs);
4527   /* Charset list may have been changed.  */
4528   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4529   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4530
       /* ASCII characters are emitted as they are only if the coding
          system is marked ASCII compatible and uses neither designation
          nor locking shift.  */
4531   ascii_compatible
4532     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4533        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4534                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4535
4536   while (charbuf < charbuf_end)
4537     {
4538       ASSURE_DESTINATION (safe_room);
4539
4540       if (bol_designation)
4541         {
4542           unsigned char *dst_prev = dst;
4543
4544           /* We have to produce designation sequences if any now.  */
4545           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4546           bol_designation = 0;
4547           /* We are sure that designation sequences are all ASCII bytes.  */
4548           produced_chars += dst - dst_prev;
4549         }
4550
4551       c = *charbuf++;
4552
4553       if (c < 0)
4554         {
               /* A negative element starts an annotation.  Counting that
                  element itself, -C elements are skipped below.  */
4555           /* Handle an annotation.  */
4556           switch (*charbuf)
4557             {
4558             case CODING_ANNOTATE_COMPOSITION_MASK:
4559               /* Not yet implemented.  */
4560               break;
4561             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Honor the annotated charset only if it is a member
                      of the charset list of this coding system.  */
4562               preferred_charset_id = charbuf[2];
4563               if (preferred_charset_id >= 0
4564                   && NILP (Fmemq (make_number (preferred_charset_id),
4565                                   charset_list)))
4566                 preferred_charset_id = -1;
4567               break;
4568             default:
4569               abort ();
4570             }
4571           charbuf += -c - 1;
4572           continue;
4573         }
4574
4575       /* Now encode the character C.  */
4576       if (c < 0x20 || c == 0x7F)
4577         {
4578           if (c == '\n'
4579               || (c == '\r' && EQ (eol_type, Qmac)))
4580             {
                   /* End of line: depending on the flags, reset the
                      state, restore the initial designations, and
                      request designation at the next line start.  */
4581               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4582                 ENCODE_RESET_PLANE_AND_REGISTER ();
4583               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4584                 {
4585                   int i;
4586
4587                   for (i = 0; i < 4; i++)
4588                     CODING_ISO_DESIGNATION (coding, i)
4589                       = CODING_ISO_INITIAL (coding, i);
4590                 }
4591               bol_designation
4592                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4593             }
4594           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4595             ENCODE_RESET_PLANE_AND_REGISTER ();
4596           EMIT_ONE_ASCII_BYTE (c);
4597         }
4598       else if (ASCII_CHAR_P (c))
4599         {
4600           if (ascii_compatible)
4601             EMIT_ONE_ASCII_BYTE (c);
4602           else
4603             {
4604               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4605               ENCODE_ISO_CHARACTER (charset, c);
4606             }
4607         }
4608       else if (CHAR_BYTE8_P (c))
4609         {
4610           c = CHAR_TO_BYTE8 (c);
4611           EMIT_ONE_BYTE (c);
4612         }
4613       else
4614         {
4615           struct charset *charset;
4616
               /* Try the annotated charset first; fall back to a search
                  of the charset list if it does not contain C.  */
4617           if (preferred_charset_id >= 0)
4618             {
4619               charset = CHARSET_FROM_ID (preferred_charset_id);
4620               if (! CHAR_CHARSET_P (c, charset))
4621                 charset = char_charset (c, charset_list, NULL);
4622             }
4623           else
4624             charset = char_charset (c, charset_list, NULL);
4625           if (!charset)
4626             {
                   /* No charset of the list contains C.  Encode a
                      substitute: CODING_INHIBIT_CHARACTER_SUBSTITUTION
                      as ASCII in safe-encoding mode, otherwise the
                      default character of the coding system.  */
4627               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4628                 {
4629                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4630                   charset = CHARSET_FROM_ID (charset_ascii);
4631                 }
4632               else
4633                 {
4634                   c = coding->default_char;
4635                   charset = char_charset (c, charset_list, NULL);
4636                 }
4637             }
4638           ENCODE_ISO_CHARACTER (charset, c);
4639         }
4640     }
4641
       /* In the last block, finish with a reset if the coding system
          resets at end of line.  */
4642   if (coding->mode & CODING_MODE_LAST_BLOCK
4643       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4644     {
4645       ASSURE_DESTINATION (safe_room);
4646       ENCODE_RESET_PLANE_AND_REGISTER ();
4647     }
4648   record_conversion_result (coding, CODING_RESULT_SUCCESS);
       /* Carry the beginning-of-line state over to the next call.  */
4649   CODING_ISO_BOL (coding) = bol_designation;
4650   coding->produced_char += produced_chars;
4651   coding->produced = dst - coding->destination;
4652   return 0;
4653 }
4654
4655 \f
4656 /*** 8. SJIS and BIG5 handlers ***/
4657
4658 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4659    quite widely.  So, for the moment, Emacs supports them in the bare
4660    C code.  But, in the future, they may be supported only by CCL.  */
4661
4662 /* SJIS is a coding system encoding three character sets: ASCII, right
4663    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4664    as is.  A character of charset katakana-jisx0201 is encoded by
4665    "position-code + 0x80".  A character of charset japanese-jisx0208
4666    is encoded in two bytes, with the two position-codes divided and
4667    shifted so that they fit in the ranges below.
4668
4669    --- CODE RANGE of SJIS ---
4670    (character set)      (range)
4671    ASCII                0x00 .. 0x7F
4672    KATAKANA-JISX0201    0xA0 .. 0xDF
4673    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4674             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4675    -------------------------------
4676
4677 */
4678
4679 /* BIG5 is a coding system encoding two character sets: ASCII and
4680    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4681    character set and is encoded in two bytes.
4682
4683    --- CODE RANGE of BIG5 ---
4684    (character set)      (range)
4685    ASCII                0x00 .. 0x7F
4686    Big5 (1st byte)      0xA1 .. 0xFE
4687         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4688    --------------------------
4689
4690   */
4691
4692 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4693    Check if a text is encoded in SJIS.  If an invalid byte sequence is
4694    found, set CATEGORY_MASK_SJIS in DETECT_INFO->rejected and return 0.
        Otherwise return 1 after merging into DETECT_INFO->found the mask
        recorded for any non-ASCII SJIS sequence that was seen.  */
4695
4696 static int
4697 detect_coding_sjis (coding, detect_info)
4698      struct coding_system *coding;
4699      struct coding_detection_info *detect_info;
4700 {
4701   const unsigned char *src = coding->source, *src_base;
4702   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* NOTE(review): multibytep and consumed_chars are not referenced
          directly below; presumably ONE_MORE_BYTE uses them by name --
          confirm before removing them.  */
4703   int multibytep = coding->src_multibyte;
4704   int consumed_chars = 0;
4705   int found = 0;
4706   int c;
4707   Lisp_Object attrs, charset_list;
4708   int max_first_byte_of_2_byte_code;
4709
4710   CODING_GET_INFO (coding, attrs, charset_list);
       /* With more than three charsets in the list, lead bytes up to
          0xFC are accepted; otherwise only those up to 0xEF.  */
4711   max_first_byte_of_2_byte_code
4712     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4713
4714   detect_info->checked |= CATEGORY_MASK_SJIS;
4715   /* A coding system of this category is always ASCII compatible.  */
4716   src += coding->head_ascii;
4717
4718   while (1)
4719     {
4720       src_base = src;
4721       ONE_MORE_BYTE (c);
4722       if (c < 0x80)
4723         continue;
4724       if ((c >= 0x81 && c <= 0x9F)
4725           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4726         {
               /* Lead byte of a two-byte code; check the trail byte.  */
4727           ONE_MORE_BYTE (c);
4728           if (c < 0x40 || c == 0x7F || c > 0xFC)
4729             break;
4730           found = CATEGORY_MASK_SJIS;
4731         }
4732       else if (c >= 0xA0 && c < 0xE0)
4733         found = CATEGORY_MASK_SJIS;
4734       else
4735         break;
4736     }
       /* The loop is left by `break' only on an invalid byte.  */
4737   detect_info->rejected |= CATEGORY_MASK_SJIS;
4738   return 0;
4739
       /* NOTE(review): nothing in this function jumps here explicitly;
          presumably ONE_MORE_BYTE does so when SRC reaches SRC_END --
          confirm.  If part of a sequence was consumed (SRC_BASE < SRC)
          and this is the last block, the text is rejected.  */
4740  no_more_source:
4741   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4742     {
4743       detect_info->rejected |= CATEGORY_MASK_SJIS;
4744       return 0;
4745     }
4746   detect_info->found |= found;
4747   return 1;
4748 }
4749
4750 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4751    Check if a text is encoded in BIG5.  If an invalid byte sequence is
4752    found, set CATEGORY_MASK_BIG5 in DETECT_INFO->rejected and return 0.
        Otherwise return 1 after merging into DETECT_INFO->found the mask
        recorded for any two-byte BIG5 sequence that was seen.  */
4753
4754 static int
4755 detect_coding_big5 (coding, detect_info)
4756      struct coding_system *coding;
4757      struct coding_detection_info *detect_info;
4758 {
4759   const unsigned char *src = coding->source, *src_base;
4760   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* NOTE(review): multibytep and consumed_chars are not referenced
          directly below; presumably ONE_MORE_BYTE uses them by name --
          confirm before removing them.  */
4761   int multibytep = coding->src_multibyte;
4762   int consumed_chars = 0;
4763   int found = 0;
4764   int c;
4765
4766   detect_info->checked |= CATEGORY_MASK_BIG5;
4767   /* A coding system of this category is always ASCII compatible.  */
4768   src += coding->head_ascii;
4769
4770   while (1)
4771     {
4772       src_base = src;
4773       ONE_MORE_BYTE (c);
4774       if (c < 0x80)
4775         continue;
4776       if (c >= 0xA1)
4777         {
4778           ONE_MORE_BYTE (c);
               /* An invalid trail byte rejects the category in the same
                  way as an invalid lead byte does; leave the loop so
                  that the rejection below is recorded instead of
                  returning with DETECT_INFO->rejected untouched.  */
4779           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4780             break;
4781           found = CATEGORY_MASK_BIG5;
4782         }
4783       else
4784         break;
4785     }
       /* The loop is left by `break' only on an invalid byte.  */
4786   detect_info->rejected |= CATEGORY_MASK_BIG5;
4787   return 0;
4788
       /* NOTE(review): nothing in this function jumps here explicitly;
          presumably ONE_MORE_BYTE does so when SRC reaches SRC_END --
          confirm.  If part of a sequence was consumed (SRC_BASE < SRC)
          and this is the last block, the text is rejected.  */
4789  no_more_source:
4790   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4791     {
4792       detect_info->rejected |= CATEGORY_MASK_BIG5;
4793       return 0;
4794     }
4795   detect_info->found |= found;
4796   return 1;
4797 }
4798
4799 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4800    Decode SJIS text from CODING->source into CODING->charbuf.  */
4801
4802 static void
4803 decode_coding_sjis (coding)
4804      struct coding_system *coding;
4805 {
4806   const unsigned char *src = coding->source + coding->consumed;
4807   const unsigned char *src_end = coding->source + coding->src_bytes;
4808   const unsigned char *src_base;
4809   int *charbuf = coding->charbuf + coding->charbuf_used;
4810   /* We may produce one charset annotation in one loop and one more at
4811      the end.  */
4812   int *charbuf_end
4813     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4814   int consumed_chars = 0, consumed_chars_base;
4815   int multibytep = coding->src_multibyte;
4816   struct charset *charset_roman, *charset_kanji, *charset_kana;
4817   struct charset *charset_kanji2;
4818   Lisp_Object attrs, charset_list, val;
4819   int char_offset = coding->produced_char;
4820   int last_offset = char_offset;
4821   int last_id = charset_ascii;
4822   int eol_crlf =
4823     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte read ahead after a '\r' when EOL_CRLF is set, or -1
          if there is none pending.  */
4824   int byte_after_cr = -1;
4825
4826   CODING_GET_INFO (coding, attrs, charset_list);
4827
       /* The charset list supplies, in this order, the roman, kana and
          kanji charsets, and optionally a second kanji charset.  */
4828   val = charset_list;
4829   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4830   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4831   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4832   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4833
4834   while (1)
4835     {
4836       int c, c1;
4837       struct charset *charset;
4838
           /* Remember where this character starts so that an incomplete
              or invalid sequence can be rolled back.  */
4839       src_base = src;
4840       consumed_chars_base = consumed_chars;
4841
4842       if (charbuf >= charbuf_end)
4843         {
               /* No room left.  A byte read ahead after '\r' has not
                  been decoded yet, so report it as not consumed.  */
4844           if (byte_after_cr >= 0)
4845             src_base--;
4846           break;
4847         }
4848
4849       if (byte_after_cr >= 0)
4850         c = byte_after_cr, byte_after_cr = -1;
4851       else
4852         ONE_MORE_BYTE (c);
4853       if (c < 0)
4854         goto invalid_code;
4855       if (c < 0x80)
4856         {
4857           if (eol_crlf && c == '\r')
4858             ONE_MORE_BYTE (byte_after_cr);
4859           charset = charset_roman;
4860         }
4861       else if (c == 0x80 || c == 0xA0)
4862         goto invalid_code;
4863       else if (c >= 0xA1 && c <= 0xDF)
4864         {
4865           /* SJIS -> JISX0201-Kana */
4866           c &= 0x7F;
4867           charset = charset_kana;
4868         }
4869       else if (c <= 0xEF)
4870         {
4871           /* SJIS -> JISX0208 */
4872           ONE_MORE_BYTE (c1);
4873           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4874             goto invalid_code;
4875           c = (c << 8) | c1;
4876           SJIS_TO_JIS (c);
4877           charset = charset_kanji;
4878         }
4879       else if (c <= 0xFC && charset_kanji2)
4880         {
4881           /* SJIS -> JISX0213-2 */
4882           ONE_MORE_BYTE (c1);
4883           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4884             goto invalid_code;
4885           c = (c << 8) | c1;
4886           SJIS_TO_JIS2 (c);
4887           charset = charset_kanji2;
4888         }
4889       else
4890         goto invalid_code;
           /* On a switch to a different non-ASCII charset, hand the run
              of the previous non-ASCII charset to ADD_CHARSET_DATA
              (presumably a charset annotation -- confirm) and start a
              new run here.  */
4891       if (charset->id != charset_ascii
4892           && last_id != charset->id)
4893         {
4894           if (last_id != charset_ascii)
4895             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4896           last_id = charset->id;
4897           last_offset = char_offset;
4898         }
4899       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4900       *charbuf++ = c;
4901       char_offset++;
4902       continue;
4903
4904     invalid_code:
           /* Go back to the start of the offending sequence, re-read its
              first byte only, store it via BYTE8_TO_CHAR (or as -C when
              C is negative), and count one error.  */
4905       src = src_base;
4906       consumed_chars = consumed_chars_base;
4907       ONE_MORE_BYTE (c);
4908       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4909       char_offset++;
4910       coding->errors++;
4911     }
4912
       /* Reached by falling out of the loop, and presumably also from
          ONE_MORE_BYTE when the source is exhausted -- confirm.  Only
          the input up to SRC_BASE is reported as consumed.  */
4913  no_more_source:
4914   if (last_id != charset_ascii)
4915     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4916   coding->consumed_char += consumed_chars_base;
4917   coding->consumed = src_base - coding->source;
4918   coding->charbuf_used = charbuf - coding->charbuf;
4919 }
4920
4921 static void
4922 decode_coding_big5 (coding)
4923      struct coding_system *coding;
4924 {
       /* Decode BIG5 text from CODING->source into CODING->charbuf.  */
4925   const unsigned char *src = coding->source + coding->consumed;
4926   const unsigned char *src_end = coding->source + coding->src_bytes;
4927   const unsigned char *src_base;
4928   int *charbuf = coding->charbuf + coding->charbuf_used;
4929   /* We may produce one charset annotation in one loop and one more at
4930      the end.  */
4931   int *charbuf_end
4932     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4933   int consumed_chars = 0, consumed_chars_base;
4934   int multibytep = coding->src_multibyte;
4935   struct charset *charset_roman, *charset_big5;
4936   Lisp_Object attrs, charset_list, val;
4937   int char_offset = coding->produced_char;
4938   int last_offset = char_offset;
4939   int last_id = charset_ascii;
4940   int eol_crlf =
4941     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte read ahead after a '\r' when EOL_CRLF is set, or -1
          if there is none pending.  */
4942   int byte_after_cr = -1;
4943
4944   CODING_GET_INFO (coding, attrs, charset_list);
       /* The charset list supplies the roman charset first and the Big5
          charset second.  */
4945   val = charset_list;
4946   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4947   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4948
4949   while (1)
4950     {
4951       int c, c1;
4952       struct charset *charset;
4953
           /* Remember where this character starts so that an incomplete
              or invalid sequence can be rolled back.  */
4954       src_base = src;
4955       consumed_chars_base = consumed_chars;
4956
4957       if (charbuf >= charbuf_end)
4958         {
               /* No room left.  A byte read ahead after '\r' has not
                  been decoded yet, so report it as not consumed.  */
4959           if (byte_after_cr >= 0)
4960             src_base--;
4961           break;
4962         }
4963
4964       if (byte_after_cr >= 0)
4965         c = byte_after_cr, byte_after_cr = -1;
4966       else
4967         ONE_MORE_BYTE (c);
4968
4969       if (c < 0)
4970         goto invalid_code;
4971       if (c < 0x80)
4972         {
4973           if (eol_crlf && c == '\r')
4974             ONE_MORE_BYTE (byte_after_cr);
4975           charset = charset_roman;
4976         }
4977       else
4978         {
4979           /* BIG5 -> Big5 */
               /* Lead byte must be 0xA1..0xFE; trail byte must be
                  0x40..0x7E or 0xA1..0xFE.  */
4980           if (c < 0xA1 || c > 0xFE)
4981             goto invalid_code;
4982           ONE_MORE_BYTE (c1);
4983           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4984             goto invalid_code;
4985           c = c << 8 | c1;
4986           charset = charset_big5;
4987         }
           /* On a switch to a different non-ASCII charset, hand the run
              of the previous non-ASCII charset to ADD_CHARSET_DATA
              (presumably a charset annotation -- confirm) and start a
              new run here.  */
4988       if (charset->id != charset_ascii
4989           && last_id != charset->id)
4990         {
4991           if (last_id != charset_ascii)
4992             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4993           last_id = charset->id;
4994           last_offset = char_offset;
4995         }
4996       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4997       *charbuf++ = c;
4998       char_offset++;
4999       continue;
5000
5001     invalid_code:
           /* Go back to the start of the offending sequence, re-read its
              first byte only, store it via BYTE8_TO_CHAR (or as -C when
              C is negative), and count one error.  */
5002       src = src_base;
5003       consumed_chars = consumed_chars_base;
5004       ONE_MORE_BYTE (c);
5005       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
5006       char_offset++;
5007       coding->errors++;
5008     }
5009
       /* Reached by falling out of the loop, and presumably also from
          ONE_MORE_BYTE when the source is exhausted -- confirm.  Only
          the input up to SRC_BASE is reported as consumed.  */
5010  no_more_source:
5011   if (last_id != charset_ascii)
5012     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5013   coding->consumed_char += consumed_chars_base;
5014   coding->consumed = src_base - coding->source;
5015   coding->charbuf_used = charbuf - coding->charbuf;
5016 }
5017
5018 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
5019    Encode the characters in CODING->charbuf as SJIS text.  The first
5020    three elements of the charset list of CODING are used as the
5021    roman, kana and kanji charsets, and an optional fourth element as
5022    a second kanji charset.  A character satisfying CHAR_BYTE8_P is
5023    emitted as the byte given by CHAR_TO_BYTE8.  A character of any
5024    other charset is emitted as the low 7 bits of its code.  */
5025
5026 static int
5027 encode_coding_sjis (coding)
5028      struct coding_system *coding;
5029 {
       /* NOTE(review): ASSURE_DESTINATION and EMIT_* are macros defined
          outside this function; they presumably use dst, dst_end,
          produced_chars, multibytep and coding by name and may return
          early -- confirm before renaming or reordering.  */
5030   int multibytep = coding->dst_multibyte;
5031   int *charbuf = coding->charbuf;
5032   int *charbuf_end = charbuf + coding->charbuf_used;
5033   unsigned char *dst = coding->destination + coding->produced;
5034   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5035   int safe_room = 4;
5036   int produced_chars = 0;
5037   Lisp_Object attrs, charset_list, val;
5038   int ascii_compatible;
5039   struct charset *charset_roman, *charset_kanji, *charset_kana;
5040   struct charset *charset_kanji2;
5041   int c;
5042
5043   CODING_GET_INFO (coding, attrs, charset_list);
5044   val = charset_list;
5045   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5046   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5047   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5048   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
5049
5050   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5051
5052   while (charbuf < charbuf_end)
5053     {
5054       ASSURE_DESTINATION (safe_room);
5055       c = *charbuf++;
5056       /* Now encode the character C.  */
5057       if (ASCII_CHAR_P (c) && ascii_compatible)
5058         EMIT_ONE_ASCII_BYTE (c);
5059       else if (CHAR_BYTE8_P (c))
5060         {
5061           c = CHAR_TO_BYTE8 (c);
5062           EMIT_ONE_BYTE (c);
5063         }
5064       else
5065         {
5066           unsigned code;
5067           struct charset *charset = char_charset (c, charset_list, &code);
5068
5069           if (!charset)
5070             {
                   /* No charset of the list contains C.  Use
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION as an ASCII
                      code in safe-encoding mode, otherwise encode the
                      default character of the coding system instead.  */
5071               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5072                 {
5073                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5074                   charset = CHARSET_FROM_ID (charset_ascii);
5075                 }
5076               else
5077                 {
5078                   c = coding->default_char;
5079                   charset = char_charset (c, charset_list, &code);
5080                 }
5081             }
5082           if (code == CHARSET_INVALID_CODE (charset))
5083             abort ();
5084           if (charset == charset_kanji)
5085             {
5086               int c1, c2;
5087               JIS_TO_SJIS (code);
5088               c1 = code >> 8, c2 = code & 0xFF;
5089               EMIT_TWO_BYTES (c1, c2);
5090             }
5091           else if (charset == charset_kana)
5092             EMIT_ONE_BYTE (code | 0x80);
5093           else if (charset_kanji2 && charset == charset_kanji2)
5094             {
5095               int c1, c2;
5096
                   /* Only the rows tested here are converted with
                      JIS_TO_SJIS2; a code in any other row is emitted
                      as a single 7-bit byte.  */
5097               c1 = code >> 8;
5098               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5099                   || c1 == 0x28
5100                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5101                 {
5102                   JIS_TO_SJIS2 (code);
5103                   c1 = code >> 8, c2 = code & 0xFF;
5104                   EMIT_TWO_BYTES (c1, c2);
5105                 }
5106               else
5107                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5108             }
5109           else
5110             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5111         }
5112     }
5113   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5114   coding->produced_char += produced_chars;
5115   coding->produced = dst - coding->destination;
5116   return 0;
5117 }
5118
5119 static int
5120 encode_coding_big5 (coding)
5121      struct coding_system *coding;
5122 {
       /* Encode the characters in CODING->charbuf as BIG5 text.  The
          first two elements of the charset list of CODING are used as
          the roman and Big5 charsets.  The exit at the end records
          CODING_RESULT_SUCCESS and returns 0.
          NOTE(review): ASSURE_DESTINATION and EMIT_* are macros defined
          outside this function; they presumably use dst, dst_end,
          produced_chars, multibytep and coding by name and may return
          early -- confirm before renaming or reordering.  */
5123   int multibytep = coding->dst_multibyte;
5124   int *charbuf = coding->charbuf;
5125   int *charbuf_end = charbuf + coding->charbuf_used;
5126   unsigned char *dst = coding->destination + coding->produced;
5127   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5128   int safe_room = 4;
5129   int produced_chars = 0;
5130   Lisp_Object attrs, charset_list, val;
5131   int ascii_compatible;
5132   struct charset *charset_roman, *charset_big5;
5133   int c;
5134
5135   CODING_GET_INFO (coding, attrs, charset_list);
5136   val = charset_list;
5137   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5138   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5139   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5140
5141   while (charbuf < charbuf_end)
5142     {
5143       ASSURE_DESTINATION (safe_room);
5144       c = *charbuf++;
5145       /* Now encode the character C.  */
5146       if (ASCII_CHAR_P (c) && ascii_compatible)
5147         EMIT_ONE_ASCII_BYTE (c);
5148       else if (CHAR_BYTE8_P (c))
5149         {
5150           c = CHAR_TO_BYTE8 (c);
5151           EMIT_ONE_BYTE (c);
5152         }
5153       else
5154         {
5155           unsigned code;
5156           struct charset *charset = char_charset (c, charset_list, &code);
5157
5158           if (! charset)
5159             {
                   /* No charset of the list contains C.  Use
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION as an ASCII
                      code in safe-encoding mode, otherwise encode the
                      default character of the coding system instead.  */
5160               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5161                 {
5162                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5163                   charset = CHARSET_FROM_ID (charset_ascii);
5164                 }
5165               else
5166                 {
5167                   c = coding->default_char;
5168                   charset = char_charset (c, charset_list, &code);
5169                 }
5170             }
5171           if (code == CHARSET_INVALID_CODE (charset))
5172             abort ();
5173           if (charset == charset_big5)
5174             {
5175               int c1, c2;
5176
                   /* A Big5 code is emitted high byte first.  */
5177               c1 = code >> 8, c2 = code & 0xFF;
5178               EMIT_TWO_BYTES (c1, c2);
5179             }
5180           else
5181             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5182         }
5183     }
5184   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5185   coding->produced_char += produced_chars;
5186   coding->produced = dst - coding->destination;
5187   return 0;
5188 }
5189
5190 \f
5191 /*** 9. CCL handlers ***/
5192
5193 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5194    Check if a text is encoded in a coding system of which
5195    encoder/decoder are written in CCL program.  If a byte that is not
5196    marked valid is found, set CATEGORY_MASK_CCL in DETECT_INFO->rejected
        and return 0.  Otherwise return 1 after merging into
        DETECT_INFO->found the mask recorded for any byte whose entry in
        the valid-byte table is greater than 1.  */
5197
5198 static int
5199 detect_coding_ccl (coding, detect_info)
5200      struct coding_system *coding;
5201      struct coding_detection_info *detect_info;
5202 {
5203   const unsigned char *src = coding->source, *src_base;
5204   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* NOTE(review): multibytep and consumed_chars are not referenced
          directly below; presumably ONE_MORE_BYTE uses them by name --
          confirm before removing them.  */
5205   int multibytep = coding->src_multibyte;
5206   int consumed_chars = 0;
5207   int found = 0;
5208   unsigned char *valids;
       /* Saved now because CODING is redirected just below.  */
5209   int head_ascii = coding->head_ascii;
5210   Lisp_Object attrs;
5211
5212   detect_info->checked |= CATEGORY_MASK_CCL;
5213
       /* From here on CODING is the coding system registered for the
          CCL category, not the one passed in; SRC and SRC_END were
          taken from the argument above.  */
5214   coding = &coding_categories[coding_category_ccl];
5215   valids = CODING_CCL_VALIDS (coding);
5216   attrs = CODING_ID_ATTRS (coding->id);
5217   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5218     src += head_ascii;
5219
5220   while (1)
5221     {
5222       int c;
5223
5224       src_base = src;
5225       ONE_MORE_BYTE (c);
5226       if (c < 0 || ! valids[c])
5227         break;
5228       if ((valids[c] > 1))
5229         found = CATEGORY_MASK_CCL;
5230     }
5231   detect_info->rejected |= CATEGORY_MASK_CCL;
5232   return 0;
5233
       /* NOTE(review): nothing in this function jumps here explicitly;
          presumably ONE_MORE_BYTE does so when SRC reaches SRC_END --
          confirm.  */
5234  no_more_source:
5235   detect_info->found |= found;
5236   return 1;
5237 }
5238
5239 static void
5240 decode_coding_ccl (coding)
5241      struct coding_system *coding;
5242 {
       /* Decode CODING->source into CODING->charbuf by running the CCL
          program of CODING on chunks of at most 1024 source elements,
          then record a conversion result derived from the final CCL
          status.  */
5243   const unsigned char *src = coding->source + coding->consumed;
5244   const unsigned char *src_end = coding->source + coding->src_bytes;
5245   int *charbuf = coding->charbuf + coding->charbuf_used;
5246   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5247   int consumed_chars = 0;
5248   int multibytep = coding->src_multibyte;
5249   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5250   int source_charbuf[1024];
       /* For multibyte source, source_byteidx[K] is the byte offset
          from SRC of the K-th element of source_charbuf; one extra
          slot holds the offset just past the last element.  */
5251   int source_byteidx[1025];
5252   Lisp_Object attrs, charset_list;
5253
5254   CODING_GET_INFO (coding, attrs, charset_list);
5255
5256   while (1)
5257     {
5258       const unsigned char *p = src;
5259       int i = 0;
5260
           /* Fill source_charbuf with characters (multibyte source) or
              with raw bytes (unibyte source).  */
5261       if (multibytep)
5262         {
5263           while (i < 1024 && p < src_end)
5264             {
5265               source_byteidx[i] = p - src;
5266               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5267             }
5268           source_byteidx[i] = p - src;
5269         }
5270       else
5271         while (i < 1024 && p < src_end)
5272           source_charbuf[i++] = *p++;
5273
5274       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5275         ccl->last_block = 1;
5276       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5277                   charset_list);
5278       charbuf += ccl->produced;
           /* ccl->consumed counts elements of source_charbuf; for
              multibyte source convert it back to a byte count.  */
5279       if (multibytep)
5280         src += source_byteidx[ccl->consumed];
5281       else
5282         src += ccl->consumed;
5283       consumed_chars += ccl->consumed;
           /* Go on only if more source remains and the program stopped
              merely because this chunk ran out.  */
5284       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5285         break;
5286     }
5287
5288   switch (ccl->status)
5289     {
5290     case CCL_STAT_SUSPEND_BY_SRC:
5291       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5292       break;
5293     case CCL_STAT_SUSPEND_BY_DST:
5294       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5295       break;
5296     case CCL_STAT_QUIT:
5297     case CCL_STAT_INVALID_CMD:
5298       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5299       break;
5300     default:
5301       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5302       break;
5303     }
5304   coding->consumed_char += consumed_chars;
5305   coding->consumed = src - coding->source;
5306   coding->charbuf_used = charbuf - coding->charbuf;
5307 }
5308
5309 static int
5310 encode_coding_ccl (coding)
5311      struct coding_system *coding;
5312 {
       /* Encode CODING->charbuf into CODING->destination by running the
          CCL program of CODING, at most 1024 output elements per run,
          then record a conversion result derived from the final CCL
          status.  The exit at the end returns 0.
          NOTE(review): ASSURE_DESTINATION and EMIT_ONE_BYTE are macros
          defined outside this function; they presumably use dst,
          dst_end, produced_chars, multibytep and coding by name and may
          return early -- confirm before renaming or reordering.  */
5313   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5314   int multibytep = coding->dst_multibyte;
5315   int *charbuf = coding->charbuf;
5316   int *charbuf_end = charbuf + coding->charbuf_used;
5317   unsigned char *dst = coding->destination + coding->produced;
5318   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5319   int destination_charbuf[1024];
5320   int i, produced_chars = 0;
5321   Lisp_Object attrs, charset_list;
5322
5323   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program that this is its last input when all
          source characters have been consumed in the last block.  */
5324   if (coding->consumed_char == coding->src_chars
5325       && coding->mode & CODING_MODE_LAST_BLOCK)
5326     ccl->last_block = 1;
5327
5328   while (charbuf < charbuf_end)
5329     {
5330       ccl_driver (ccl, charbuf, destination_charbuf,
5331                   charbuf_end - charbuf, 1024, charset_list);
           /* Each produced element is written as one byte (its low
              8 bits).  For a multibyte destination twice as much room
              is reserved and EMIT_ONE_BYTE does the writing.  */
5332       if (multibytep)
5333         {
5334           ASSURE_DESTINATION (ccl->produced * 2);
5335           for (i = 0; i < ccl->produced; i++)
5336             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5337         }
5338       else
5339         {
5340           ASSURE_DESTINATION (ccl->produced);
5341           for (i = 0; i < ccl->produced; i++)
5342             *dst++ = destination_charbuf[i] & 0xFF;
5343           produced_chars += ccl->produced;
5344         }
5345       charbuf += ccl->consumed;
5346       if (ccl->status == CCL_STAT_QUIT
5347           || ccl->status == CCL_STAT_INVALID_CMD)
5348         break;
5349     }
5350
5351   switch (ccl->status)
5352     {
5353     case CCL_STAT_SUSPEND_BY_SRC:
5354       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5355       break;
5356     case CCL_STAT_SUSPEND_BY_DST:
5357       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5358       break;
5359     case CCL_STAT_QUIT:
5360     case CCL_STAT_INVALID_CMD:
5361       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5362       break;
5363     default:
5364       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5365       break;
5366     }
5367
5368   coding->produced_char += produced_chars;
5369   coding->produced = dst - coding->destination;
5370   return 0;
5371 }
5372
5373
5374 \f
5375 /*** 10, 11. no-conversion handlers ***/
5376
5377 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
     /* Mark the whole source of CODING as consumed without converting
        it (CODING->chars_at_source is set to 1).  When DOS line ends
        are being decoded and the source ends with '\r', leave that
        byte unconsumed and report CODING_RESULT_INSUFFICIENT_SRC;
        otherwise report CODING_RESULT_SUCCESS.  */
5378
5379 static void
5380 decode_coding_raw_text (coding)
5381      struct coding_system *coding;
5382 {
5383   int eol_crlf =
5384     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5385
5386   coding->chars_at_source = 1;
5387   coding->consumed_char = coding->src_chars;
5388   coding->consumed = coding->src_bytes;
       /* Look at the last source byte only if there is one; with an
          empty source the index src_bytes - 1 would be out of range.  */
5389   if (eol_crlf && coding->src_bytes > 0 && coding->source[coding->src_bytes - 1] == '\r')
5390     {
5391       coding->consumed_char--;
5392       coding->consumed--;
5393       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5394     }
5395   else
5396     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5397 }
5398
5399 static int
5400 encode_coding_raw_text (coding)
5401      struct coding_system *coding;
5402 {
5403   int multibytep = coding->dst_multibyte;
5404   int *charbuf = coding->charbuf;
5405   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5406   unsigned char *dst = coding->destination + coding->produced;
5407   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5408   int produced_chars = 0;
5409   int c;
5410
5411   if (multibytep)
5412     {
5413       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5414
5415       if (coding->src_multibyte)
5416         while (charbuf < charbuf_end)
5417           {
5418             ASSURE_DESTINATION (safe_room);
5419             c = *charbuf++;
5420             if (ASCII_CHAR_P (c))
5421               EMIT_ONE_ASCII_BYTE (c);
5422             else if (CHAR_BYTE8_P (c))
5423               {
5424                 c = CHAR_TO_BYTE8 (c);
5425                 EMIT_ONE_BYTE (c);
5426               }
5427             else
5428               {
5429                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5430
5431                 CHAR_STRING_ADVANCE (c, p1);
5432                 while (p0 < p1)
5433                   {
5434                     EMIT_ONE_BYTE (*p0);
5435                     p0++;
5436                   }
5437               }
5438           }
5439       else
5440         while (charbuf < charbuf_end)
5441           {
5442             ASSURE_DESTINATION (safe_room);
5443             c = *charbuf++;
5444             EMIT_ONE_BYTE (c);
5445           }
5446     }
5447   else
5448     {
5449       if (coding->src_multibyte)
5450         {
5451           int safe_room = MAX_MULTIBYTE_LENGTH;
5452
5453           while (charbuf < charbuf_end)
5454             {
5455               ASSURE_DESTINATION (safe_room);
5456               c = *charbuf++;
5457               if (ASCII_CHAR_P (c))
5458                 *dst++ = c;
5459               else if (CHAR_BYTE8_P (c))
5460                 *dst++ = CHAR_TO_BYTE8 (c);
5461               else
5462                 CHAR_STRING_ADVANCE (c, dst);
5463             }
5464         }
5465       else
5466         {
5467           ASSURE_DESTINATION (charbuf_end - charbuf);
5468           while (charbuf < charbuf_end && dst < dst_end)
5469             *dst++ = *charbuf++;
5470         }
5471       produced_chars = dst - (coding->destination + coding->produced);
5472     }
5473   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5474   coding->produced_char += produced_chars;
5475   coding->produced = dst - coding->destination;
5476   return 0;
5477 }
5478
5479 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5480    Check if a text is encoded in a charset-based coding system.  If it
5481    is, return 1, else return 0.  */
5482
5483 static int
5484 detect_coding_charset (coding, detect_info)
5485      struct coding_system *coding;
5486      struct coding_detection_info *detect_info;
5487 {
5488   const unsigned char *src = coding->source, *src_base;
5489   const unsigned char *src_end = coding->source + coding->src_bytes;
5490   int multibytep = coding->src_multibyte;
5491   int consumed_chars = 0;
5492   Lisp_Object attrs, valids, name;
5493   int found = 0;
5494   int head_ascii = coding->head_ascii;
5495   int check_latin_extra = 0;
5496
5497   detect_info->checked |= CATEGORY_MASK_CHARSET;
5498
5499   coding = &coding_categories[coding_category_charset];
5500   attrs = CODING_ID_ATTRS (coding->id);
5501   valids = AREF (attrs, coding_attr_charset_valids);
5502   name = CODING_ID_NAME (coding->id);
5503   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5504                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5505       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5506                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5507     check_latin_extra = 1;
5508
5509   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5510     src += head_ascii;
5511
5512   while (1)
5513     {
5514       int c;
5515       Lisp_Object val;
5516       struct charset *charset;
5517       int dim, idx;
5518
5519       src_base = src;
5520       ONE_MORE_BYTE (c);
5521       if (c < 0)
5522         continue;
5523       val = AREF (valids, c);
5524       if (NILP (val))
5525         break;
5526       if (c >= 0x80)
5527         {
5528           if (c < 0xA0
5529               && check_latin_extra
5530               && (!VECTORP (Vlatin_extra_code_table)
5531                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5532             break;
5533           found = CATEGORY_MASK_CHARSET;
5534         }
5535       if (INTEGERP (val))
5536         {
5537           charset = CHARSET_FROM_ID (XFASTINT (val));
5538           dim = CHARSET_DIMENSION (charset);
5539           for (idx = 1; idx < dim; idx++)
5540             {
5541               if (src == src_end)
5542                 goto too_short;
5543               ONE_MORE_BYTE (c);
5544               if (c < charset->code_space[(dim - 1 - idx) * 2]
5545                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5546                 break;
5547             }
5548           if (idx < dim)
5549             break;
5550         }
5551       else
5552         {
5553           idx = 1;
5554           for (; CONSP (val); val = XCDR (val))
5555             {
5556               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5557               dim = CHARSET_DIMENSION (charset);
5558               while (idx < dim)
5559                 {
5560                   if (src == src_end)
5561                     goto too_short;
5562                   ONE_MORE_BYTE (c);
5563                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5564                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5565                     break;
5566                   idx++;
5567                 }
5568               if (idx == dim)
5569                 {
5570                   val = Qnil;
5571                   break;
5572                 }
5573             }
5574           if (CONSP (val))
5575             break;
5576         }
5577     }
5578  too_short:
5579   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5580   return 0;
5581
5582  no_more_source:
5583   detect_info->found |= found;
5584   return 1;
5585 }
5586
/* Decoder for charset-based coding systems.  Read bytes from the
   source of CODING starting at coding->consumed, and append the
   decoded characters (plus charset annotations) to coding->charbuf.
   A byte sequence that can't be decoded is stored one byte at a time
   and counted in coding->errors.  On exit coding->consumed,
   coding->consumed_char and coding->charbuf_used are updated.  */

static void
decode_coding_charset (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attrs, charset_list, valids;
  /* CHAR_OFFSET counts the characters produced so far; LAST_OFFSET
     and LAST_ID describe the run of non-ASCII charset characters
     still waiting for its annotation.  */
  int char_offset = coding->produced_char;
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR, or -1 if there is none.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  valids = AREF (attrs, coding_attr_charset_valids);

  /* NOTE(review): the label `no_more_source' below has no explicit
     goto in this function; it is presumably reached from
     ONE_MORE_BYTE when the source is exhausted.  */
  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      /* Remember where this character starts; these values are what
         is reported as consumed, and what invalid_code rewinds to.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left in charbuf.  A byte read ahead after CR is
             given back by stepping SRC_BASE one byte backward.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          /* Use the byte that was read ahead in the previous
             iteration.  */
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      /* VAL tells which charset(s) the first byte C may start: a
         charset ID, or a list of charset IDs.  Anything else means C
         is invalid here.  */
      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          /* Collect the remaining bytes of the code point, most
             significant byte first.  */
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* LEN and CODE carry over between charsets, so only the
                 bytes a larger dimension needs are read in addition.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          /* A run of a different non-ASCII charset starts here;
             annotate the run that has just ended, if any.  */
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of this sequence, read just one byte
         again, and store it as is: an ASCII byte as itself, another
         byte through BYTE8_TO_CHAR, and a negative value with its
         sign flipped.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Annotate the last pending run, then report what was consumed up
     to the start of the last unfinished character.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5715
5716 static int
5717 encode_coding_charset (coding)
5718      struct coding_system *coding;
5719 {
5720   int multibytep = coding->dst_multibyte;
5721   int *charbuf = coding->charbuf;
5722   int *charbuf_end = charbuf + coding->charbuf_used;
5723   unsigned char *dst = coding->destination + coding->produced;
5724   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5725   int safe_room = MAX_MULTIBYTE_LENGTH;
5726   int produced_chars = 0;
5727   Lisp_Object attrs, charset_list;
5728   int ascii_compatible;
5729   int c;
5730
5731   CODING_GET_INFO (coding, attrs, charset_list);
5732   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5733
5734   while (charbuf < charbuf_end)
5735     {
5736       struct charset *charset;
5737       unsigned code;
5738
5739       ASSURE_DESTINATION (safe_room);
5740       c = *charbuf++;
5741       if (ascii_compatible && ASCII_CHAR_P (c))
5742         EMIT_ONE_ASCII_BYTE (c);
5743       else if (CHAR_BYTE8_P (c))
5744         {
5745           c = CHAR_TO_BYTE8 (c);
5746           EMIT_ONE_BYTE (c);
5747         }
5748       else
5749         {
5750           charset = char_charset (c, charset_list, &code);
5751           if (charset)
5752             {
5753               if (CHARSET_DIMENSION (charset) == 1)
5754                 EMIT_ONE_BYTE (code);
5755               else if (CHARSET_DIMENSION (charset) == 2)
5756                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5757               else if (CHARSET_DIMENSION (charset) == 3)
5758                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5759               else
5760                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5761                                  (code >> 8) & 0xFF, code & 0xFF);
5762             }
5763           else
5764             {
5765               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5766                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5767               else
5768                 c = coding->default_char;
5769               EMIT_ONE_BYTE (c);
5770             }
5771         }
5772     }
5773
5774   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5775   coding->produced_char += produced_chars;
5776   coding->produced = dst - coding->destination;
5777   return 0;
5778 }
5779
5780 \f
/*** 10. C library functions ***/
5782
5783 /* Setup coding context CODING from information about CODING_SYSTEM.
5784    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5785    CODING_SYSTEM is invalid, signal an error.  */
5786
5787 void
5788 setup_coding_system (coding_system, coding)
5789      Lisp_Object coding_system;
5790      struct coding_system *coding;
5791 {
5792   Lisp_Object attrs;
5793   Lisp_Object eol_type;
5794   Lisp_Object coding_type;
5795   Lisp_Object val;
5796
5797   if (NILP (coding_system))
5798     coding_system = Qundecided;
5799
5800   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5801
5802   attrs = CODING_ID_ATTRS (coding->id);
5803   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5804
5805   coding->mode = 0;
5806   coding->head_ascii = -1;
5807   if (VECTORP (eol_type))
5808     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5809                             | CODING_REQUIRE_DETECTION_MASK);
5810   else if (! EQ (eol_type, Qunix))
5811     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5812                             | CODING_REQUIRE_ENCODING_MASK);
5813   else
5814     coding->common_flags = 0;
5815   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5816     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5817   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5818     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5819   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5820     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5821
5822   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5823   coding->max_charset_id = SCHARS (val) - 1;
5824   coding->safe_charsets = SDATA (val);
5825   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5826   coding->carryover_bytes = 0;
5827
5828   coding_type = CODING_ATTR_TYPE (attrs);
5829   if (EQ (coding_type, Qundecided))
5830     {
5831       coding->detector = NULL;
5832       coding->decoder = decode_coding_raw_text;
5833       coding->encoder = encode_coding_raw_text;
5834       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5835     }
5836   else if (EQ (coding_type, Qiso_2022))
5837     {
5838       int i;
5839       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5840
5841       /* Invoke graphic register 0 to plane 0.  */
5842       CODING_ISO_INVOCATION (coding, 0) = 0;
5843       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5844       CODING_ISO_INVOCATION (coding, 1)
5845         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5846       /* Setup the initial status of designation.  */
5847       for (i = 0; i < 4; i++)
5848         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5849       /* Not single shifting initially.  */
5850       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5851       /* Beginning of buffer should also be regarded as bol. */
5852       CODING_ISO_BOL (coding) = 1;
5853       coding->detector = detect_coding_iso_2022;
5854       coding->decoder = decode_coding_iso_2022;
5855       coding->encoder = encode_coding_iso_2022;
5856       if (flags & CODING_ISO_FLAG_SAFE)
5857         coding->mode |= CODING_MODE_SAFE_ENCODING;
5858       coding->common_flags
5859         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5860             | CODING_REQUIRE_FLUSHING_MASK);
5861       if (flags & CODING_ISO_FLAG_COMPOSITION)
5862         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5863       if (flags & CODING_ISO_FLAG_DESIGNATION)
5864         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5865       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5866         {
5867           setup_iso_safe_charsets (attrs);
5868           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5869           coding->max_charset_id = SCHARS (val) - 1;
5870           coding->safe_charsets = SDATA (val);
5871         }
5872       CODING_ISO_FLAGS (coding) = flags;
5873       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5874       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5875       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5876       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5877     }
5878   else if (EQ (coding_type, Qcharset))
5879     {
5880       coding->detector = detect_coding_charset;
5881       coding->decoder = decode_coding_charset;
5882       coding->encoder = encode_coding_charset;
5883       coding->common_flags
5884         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5885     }
5886   else if (EQ (coding_type, Qutf_8))
5887     {
5888       val = AREF (attrs, coding_attr_utf_bom);
5889       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5890                                    : EQ (val, Qt) ? utf_with_bom
5891                                    : utf_without_bom);
5892       coding->detector = detect_coding_utf_8;
5893       coding->decoder = decode_coding_utf_8;
5894       coding->encoder = encode_coding_utf_8;
5895       coding->common_flags
5896         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5897       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5898         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5899     }
5900   else if (EQ (coding_type, Qutf_16))
5901     {
5902       val = AREF (attrs, coding_attr_utf_bom);
5903       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5904                                     : EQ (val, Qt) ? utf_with_bom
5905                                     : utf_without_bom);
5906       val = AREF (attrs, coding_attr_utf_16_endian);
5907       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5908                                        : utf_16_little_endian);
5909       CODING_UTF_16_SURROGATE (coding) = 0;
5910       coding->detector = detect_coding_utf_16;
5911       coding->decoder = decode_coding_utf_16;
5912       coding->encoder = encode_coding_utf_16;
5913       coding->common_flags
5914         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5915       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5916         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5917     }
5918   else if (EQ (coding_type, Qccl))
5919     {
5920       coding->detector = detect_coding_ccl;
5921       coding->decoder = decode_coding_ccl;
5922       coding->encoder = encode_coding_ccl;
5923       coding->common_flags
5924         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5925             | CODING_REQUIRE_FLUSHING_MASK);
5926     }
5927   else if (EQ (coding_type, Qemacs_mule))
5928     {
5929       coding->detector = detect_coding_emacs_mule;
5930       coding->decoder = decode_coding_emacs_mule;
5931       coding->encoder = encode_coding_emacs_mule;
5932       coding->common_flags
5933         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5934       coding->spec.emacs_mule.full_support = 1;
5935       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5936           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5937         {
5938           Lisp_Object tail, safe_charsets;
5939           int max_charset_id = 0;
5940
5941           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5942                tail = XCDR (tail))
5943             if (max_charset_id < XFASTINT (XCAR (tail)))
5944               max_charset_id = XFASTINT (XCAR (tail));
5945           safe_charsets = make_uninit_string (max_charset_id + 1);
5946           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5947           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5948                tail = XCDR (tail))
5949             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5950           coding->max_charset_id = max_charset_id;
5951           coding->safe_charsets = SDATA (safe_charsets);
5952           coding->spec.emacs_mule.full_support = 1;
5953         }
5954       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5955       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5956     }
5957   else if (EQ (coding_type, Qshift_jis))
5958     {
5959       coding->detector = detect_coding_sjis;
5960       coding->decoder = decode_coding_sjis;
5961       coding->encoder = encode_coding_sjis;
5962       coding->common_flags
5963         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5964     }
5965   else if (EQ (coding_type, Qbig5))
5966     {
5967       coding->detector = detect_coding_big5;
5968       coding->decoder = decode_coding_big5;
5969       coding->encoder = encode_coding_big5;
5970       coding->common_flags
5971         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5972     }
5973   else                          /* EQ (coding_type, Qraw_text) */
5974     {
5975       coding->detector = NULL;
5976       coding->decoder = decode_coding_raw_text;
5977       coding->encoder = encode_coding_raw_text;
5978       if (! EQ (eol_type, Qunix))
5979         {
5980           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5981           if (! VECTORP (eol_type))
5982             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5983         }
5984
5985     }
5986
5987   return;
5988 }
5989
5990 /* Return a list of charsets supported by CODING.  */
5991
5992 Lisp_Object
5993 coding_charset_list (coding)
5994      struct coding_system *coding;
5995 {
5996   Lisp_Object attrs, charset_list;
5997
5998   CODING_GET_INFO (coding, attrs, charset_list);
5999   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6000     {
6001       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6002
6003       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6004         charset_list = Viso_2022_charset_list;
6005     }
6006   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6007     {
6008       charset_list = Vemacs_mule_charset_list;
6009     }
6010   return charset_list;
6011 }
6012
6013
6014 /* Return a list of charsets supported by CODING-SYSTEM.  */
6015
6016 Lisp_Object
6017 coding_system_charset_list (coding_system)
6018      Lisp_Object coding_system;
6019 {
6020   int id;
6021   Lisp_Object attrs, charset_list;
6022
6023   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
6024   attrs = CODING_ID_ATTRS (id);
6025
6026   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6027     {
6028       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6029
6030       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6031         charset_list = Viso_2022_charset_list;
6032       else
6033         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6034     }
6035   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6036     {
6037       charset_list = Vemacs_mule_charset_list;
6038     }
6039   else
6040     {
6041       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6042     }
6043   return charset_list;
6044 }
6045
6046
6047 /* Return raw-text or one of its subsidiaries that has the same
6048    eol_type as CODING-SYSTEM.  */
6049
6050 Lisp_Object
6051 raw_text_coding_system (coding_system)
6052      Lisp_Object coding_system;
6053 {
6054   Lisp_Object spec, attrs;
6055   Lisp_Object eol_type, raw_text_eol_type;
6056
6057   if (NILP (coding_system))
6058     return Qraw_text;
6059   spec = CODING_SYSTEM_SPEC (coding_system);
6060   attrs = AREF (spec, 0);
6061
6062   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6063     return coding_system;
6064
6065   eol_type = AREF (spec, 2);
6066   if (VECTORP (eol_type))
6067     return Qraw_text;
6068   spec = CODING_SYSTEM_SPEC (Qraw_text);
6069   raw_text_eol_type = AREF (spec, 2);
6070   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6071           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6072           : AREF (raw_text_eol_type, 2));
6073 }
6074
6075
6076 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
6077    does, return one of the subsidiary that has the same eol-spec as
6078    PARENT.  Otherwise, return CODING_SYSTEM.  If PARENT is nil,
6079    inherit end-of-line format from the system's setting
6080    (system_eol_type).  */
6081
6082 Lisp_Object
6083 coding_inherit_eol_type (coding_system, parent)
6084      Lisp_Object coding_system, parent;
6085 {
6086   Lisp_Object spec, eol_type;
6087
6088   if (NILP (coding_system))
6089     coding_system = Qraw_text;
6090   spec = CODING_SYSTEM_SPEC (coding_system);
6091   eol_type = AREF (spec, 2);
6092   if (VECTORP (eol_type))
6093     {
6094       Lisp_Object parent_eol_type;
6095
6096       if (! NILP (parent))
6097         {
6098           Lisp_Object parent_spec;
6099
6100           parent_spec = CODING_SYSTEM_SPEC (parent);
6101           parent_eol_type = AREF (parent_spec, 2);
6102         }
6103       else
6104         parent_eol_type = system_eol_type;
6105       if (EQ (parent_eol_type, Qunix))
6106         coding_system = AREF (eol_type, 0);
6107       else if (EQ (parent_eol_type, Qdos))
6108         coding_system = AREF (eol_type, 1);
6109       else if (EQ (parent_eol_type, Qmac))
6110         coding_system = AREF (eol_type, 2);
6111     }
6112   return coding_system;
6113 }
6114
6115 /* Emacs has a mechanism to automatically detect a coding system if it
6116    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6117    it's impossible to distinguish some coding systems accurately
6118    because they use the same range of codes.  So, at first, coding
   systems are categorized into the following categories:
6120
6121    o coding-category-emacs-mule
6122
6123         The category for a coding system which has the same code range
6124         as Emacs' internal format.  Assigned the coding-system (Lisp
6125         symbol) `emacs-mule' by default.
6126
6127    o coding-category-sjis
6128
6129         The category for a coding system which has the same code range
6130         as SJIS.  Assigned the coding-system (Lisp
6131         symbol) `japanese-shift-jis' by default.
6132
6133    o coding-category-iso-7
6134
6135         The category for a coding system which has the same code range
6136         as ISO2022 of 7-bit environment.  This doesn't use any locking
6137         shift and single shift functions.  This can encode/decode all
6138         charsets.  Assigned the coding-system (Lisp symbol)
6139         `iso-2022-7bit' by default.
6140
6141    o coding-category-iso-7-tight
6142
6143         Same as coding-category-iso-7 except that this can
6144         encode/decode only the specified charsets.
6145
6146    o coding-category-iso-8-1
6147
6148         The category for a coding system which has the same code range
6149         as ISO2022 of 8-bit environment and graphic plane 1 used only
6150         for DIMENSION1 charset.  This doesn't use any locking shift
6151         and single shift functions.  Assigned the coding-system (Lisp
6152         symbol) `iso-latin-1' by default.
6153
6154    o coding-category-iso-8-2
6155
6156         The category for a coding system which has the same code range
6157         as ISO2022 of 8-bit environment and graphic plane 1 used only
6158         for DIMENSION2 charset.  This doesn't use any locking shift
6159         and single shift functions.  Assigned the coding-system (Lisp
6160         symbol) `japanese-iso-8bit' by default.
6161
6162    o coding-category-iso-7-else
6163
6164         The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
6166         single shift functions.  Assigned the coding-system (Lisp
6167         symbol) `iso-2022-7bit-lock' by default.
6168
6169    o coding-category-iso-8-else
6170
6171         The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
6173         single shift functions.  Assigned the coding-system (Lisp
6174         symbol) `iso-2022-8bit-ss2' by default.
6175
6176    o coding-category-big5
6177
6178         The category for a coding system which has the same code range
6179         as BIG5.  Assigned the coding-system (Lisp symbol)
6180         `cn-big5' by default.
6181
6182    o coding-category-utf-8
6183
6184         The category for a coding system which has the same code range
6185         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6186         symbol) `utf-8' by default.
6187
6188    o coding-category-utf-16-be
6189
6190         The category for a coding system in which a text has an
6191         Unicode signature (cf. Unicode Standard) in the order of BIG
6192         endian at the head.  Assigned the coding-system (Lisp symbol)
6193         `utf-16-be' by default.
6194
6195    o coding-category-utf-16-le
6196
6197         The category for a coding system in which a text has an
6198         Unicode signature (cf. Unicode Standard) in the order of
6199         LITTLE endian at the head.  Assigned the coding-system (Lisp
6200         symbol) `utf-16-le' by default.
6201
6202    o coding-category-ccl
6203
6204         The category for a coding system of which encoder/decoder is
6205         written in CCL programs.  The default value is nil, i.e., no
6206         coding system is assigned.
6207
6208    o coding-category-binary
6209
6210         The category for a coding system not categorized in any of the
6211         above.  Assigned the coding-system (Lisp symbol)
6212         `no-conversion' by default.
6213
6214    Each of them is a Lisp symbol and the value is an actual
6215    `coding-system's (this is also a Lisp symbol) assigned by a user.
6216    What Emacs does actually is to detect a category of coding system.
6217    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6218    decide only one possible category, it selects a category of the
6219    highest priority.  Priorities of categories are also specified by a
6220    user in a Lisp variable `coding-category-list'.
6221
6222 */
6223
/* Kinds of end-of-line found by detect_eol: none, LF, CR, or CRLF.
   The non-zero values are distinct bits.  */
#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     1
#define EOL_SEEN_CR     2
#define EOL_SEEN_CRLF   4
6228
6229 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6230    SOURCE is encoded.  If CATEGORY is one of
6231    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6232    two-byte, else they are encoded by one-byte.
6233
6234    Return one of EOL_SEEN_XXX.  */
6235
6236 #define MAX_EOL_CHECK_COUNT 3
6237
6238 static int
6239 detect_eol (source, src_bytes, category)
6240      const unsigned char *source;
6241      EMACS_INT src_bytes;
6242      enum coding_category category;
6243 {
6244   const unsigned char *src = source, *src_end = src + src_bytes;
6245   unsigned char c;
6246   int total  = 0;
6247   int eol_seen = EOL_SEEN_NONE;
6248
6249   if ((1 << category) & CATEGORY_MASK_UTF_16)
6250     {
6251       int msb, lsb;
6252
6253       msb = category == (coding_category_utf_16_le
6254                          | coding_category_utf_16_le_nosig);
6255       lsb = 1 - msb;
6256
6257       while (src + 1 < src_end)
6258         {
6259           c = src[lsb];
6260           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6261             {
6262               int this_eol;
6263
6264               if (c == '\n')
6265                 this_eol = EOL_SEEN_LF;
6266               else if (src + 3 >= src_end
6267                        || src[msb + 2] != 0
6268                        || src[lsb + 2] != '\n')
6269                 this_eol = EOL_SEEN_CR;
6270               else
6271                 {
6272                   this_eol = EOL_SEEN_CRLF;
6273                   src += 2;
6274                 }
6275
6276               if (eol_seen == EOL_SEEN_NONE)
6277                 /* This is the first end-of-line.  */
6278                 eol_seen = this_eol;
6279               else if (eol_seen != this_eol)
6280                 {
6281                   /* The found type is different from what found before.
6282                      Allow for stray ^M characters in DOS EOL files.  */
6283                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6284                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6285                     eol_seen = EOL_SEEN_CRLF;
6286                   else
6287                     {
6288                       eol_seen = EOL_SEEN_LF;
6289                       break;
6290                     }
6291                 }
6292               if (++total == MAX_EOL_CHECK_COUNT)
6293                 break;
6294             }
6295           src += 2;
6296         }
6297     }
6298   else
6299     {
6300       while (src < src_end)
6301         {
6302           c = *src++;
6303           if (c == '\n' || c == '\r')
6304             {
6305               int this_eol;
6306
6307               if (c == '\n')
6308                 this_eol = EOL_SEEN_LF;
6309               else if (src >= src_end || *src != '\n')
6310                 this_eol = EOL_SEEN_CR;
6311               else
6312                 this_eol = EOL_SEEN_CRLF, src++;
6313
6314               if (eol_seen == EOL_SEEN_NONE)
6315                 /* This is the first end-of-line.  */
6316                 eol_seen = this_eol;
6317               else if (eol_seen != this_eol)
6318                 {
6319                   /* The found type is different from what found before.
6320                      Allow for stray ^M characters in DOS EOL files.  */
6321                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6322                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6323                     eol_seen = EOL_SEEN_CRLF;
6324                   else
6325                     {
6326                       eol_seen = EOL_SEEN_LF;
6327                       break;
6328                     }
6329                 }
6330               if (++total == MAX_EOL_CHECK_COUNT)
6331                 break;
6332             }
6333         }
6334     }
6335   return eol_seen;
6336 }
6337
6338
6339 static Lisp_Object
6340 adjust_coding_eol_type (coding, eol_seen)
6341      struct coding_system *coding;
6342      int eol_seen;
6343 {
6344   Lisp_Object eol_type;
6345
6346   eol_type = CODING_ID_EOL_TYPE (coding->id);
6347   if (eol_seen & EOL_SEEN_LF)
6348     {
6349       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6350       eol_type = Qunix;
6351     }
6352   else if (eol_seen & EOL_SEEN_CRLF)
6353     {
6354       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6355       eol_type = Qdos;
6356     }
6357   else if (eol_seen & EOL_SEEN_CR)
6358     {
6359       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6360       eol_type = Qmac;
6361     }
6362   return eol_type;
6363 }
6364
6365 /* Detect how a text specified in CODING is encoded.  If a coding
6366    system is detected, update fields of CODING by the detected coding
6367    system.  */
6368
6369 void
6370 detect_coding (coding)
6371      struct coding_system *coding;
6372 {
6373   const unsigned char *src, *src_end;
6374   int saved_mode = coding->mode;
6375
6376   coding->consumed = coding->consumed_char = 0;
6377   coding->produced = coding->produced_char = 0;
6378   coding_set_source (coding);
6379
6380   src_end = coding->source + coding->src_bytes;
6381   coding->head_ascii = 0;
6382
6383   /* If we have not yet decided the text encoding type, detect it
6384      now.  */
6385   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6386     {
6387       int c, i;
6388       struct coding_detection_info detect_info;
6389       int null_byte_found = 0, eight_bit_found = 0;
6390
6391       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6392       for (src = coding->source; src < src_end; src++)
6393         {
6394           c = *src;
6395           if (c & 0x80)
6396             {
6397               eight_bit_found = 1;
6398               if (null_byte_found)
6399                 break;
6400             }
6401           else if (c < 0x20)
6402             {
6403               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6404                   && ! inhibit_iso_escape_detection
6405                   && ! detect_info.checked)
6406                 {
6407                   if (detect_coding_iso_2022 (coding, &detect_info))
6408                     {
6409                       /* We have scanned the whole data.  */
6410                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6411                         {
6412                           /* We didn't find an 8-bit code.  We may
6413                              have found a null-byte, but it's very
6414                              rare that a binary file confirm to
6415                              ISO-2022.  */
6416                           src = src_end;
6417                           coding->head_ascii = src - coding->source;
6418                         }
6419                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6420                       break;
6421                     }
6422                 }
6423               else if (! c && !inhibit_null_byte_detection)
6424                 {
6425                   null_byte_found = 1;
6426                   if (eight_bit_found)
6427                     break;
6428                 }
6429               if (! eight_bit_found)
6430                 coding->head_ascii++;
6431             }
6432           else if (! eight_bit_found)
6433             coding->head_ascii++;
6434         }
6435
6436       if (null_byte_found || eight_bit_found
6437           || coding->head_ascii < coding->src_bytes
6438           || detect_info.found)
6439         {
6440           enum coding_category category;
6441           struct coding_system *this;
6442
6443           if (coding->head_ascii == coding->src_bytes)
6444             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6445             for (i = 0; i < coding_category_raw_text; i++)
6446               {
6447                 category = coding_priorities[i];
6448                 this = coding_categories + category;
6449                 if (detect_info.found & (1 << category))
6450                   break;
6451               }
6452           else
6453             {
6454               if (null_byte_found)
6455                 {
6456                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6457                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6458                 }
6459               for (i = 0; i < coding_category_raw_text; i++)
6460                 {
6461                   category = coding_priorities[i];
6462                   this = coding_categories + category;
6463                   if (this->id < 0)
6464                     {
6465                       /* No coding system of this category is defined.  */
6466                       detect_info.rejected |= (1 << category);
6467                     }
6468                   else if (category >= coding_category_raw_text)
6469                     continue;
6470                   else if (detect_info.checked & (1 << category))
6471                     {
6472                       if (detect_info.found & (1 << category))
6473                         break;
6474                     }
6475                   else if ((*(this->detector)) (coding, &detect_info)
6476                            && detect_info.found & (1 << category))
6477                     {
6478                       if (category == coding_category_utf_16_auto)
6479                         {
6480                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6481                             category = coding_category_utf_16_le;
6482                           else
6483                             category = coding_category_utf_16_be;
6484                         }
6485                       break;
6486                     }
6487                 }
6488             }
6489
6490           if (i < coding_category_raw_text)
6491             setup_coding_system (CODING_ID_NAME (this->id), coding);
6492           else if (null_byte_found)
6493             setup_coding_system (Qno_conversion, coding);
6494           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6495                    == CATEGORY_MASK_ANY)
6496             setup_coding_system (Qraw_text, coding);
6497           else if (detect_info.rejected)
6498             for (i = 0; i < coding_category_raw_text; i++)
6499               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6500                 {
6501                   this = coding_categories + coding_priorities[i];
6502                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6503                   break;
6504                 }
6505         }
6506     }
6507   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6508            == coding_category_utf_8_auto)
6509     {
6510       Lisp_Object coding_systems;
6511       struct coding_detection_info detect_info;
6512
6513       coding_systems
6514         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6515       detect_info.found = detect_info.rejected = 0;
6516       coding->head_ascii = 0;
6517       if (CONSP (coding_systems)
6518           && detect_coding_utf_8 (coding, &detect_info))
6519         {
6520           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6521             setup_coding_system (XCAR (coding_systems), coding);
6522           else
6523             setup_coding_system (XCDR (coding_systems), coding);
6524         }
6525     }
6526   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6527            == coding_category_utf_16_auto)
6528     {
6529       Lisp_Object coding_systems;
6530       struct coding_detection_info detect_info;
6531
6532       coding_systems
6533         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6534       detect_info.found = detect_info.rejected = 0;
6535       coding->head_ascii = 0;
6536       if (CONSP (coding_systems)
6537           && detect_coding_utf_16 (coding, &detect_info))
6538         {
6539           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6540             setup_coding_system (XCAR (coding_systems), coding);
6541           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6542             setup_coding_system (XCDR (coding_systems), coding);
6543         }
6544     }
6545   coding->mode = saved_mode;
6546 }
6547
6548
6549 static void
6550 decode_eol (coding)
6551      struct coding_system *coding;
6552 {
6553   Lisp_Object eol_type;
6554   unsigned char *p, *pbeg, *pend;
6555
6556   eol_type = CODING_ID_EOL_TYPE (coding->id);
6557   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6558     return;
6559
6560   if (NILP (coding->dst_object))
6561     pbeg = coding->destination;
6562   else
6563     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6564   pend = pbeg + coding->produced;
6565
6566   if (VECTORP (eol_type))
6567     {
6568       int eol_seen = EOL_SEEN_NONE;
6569
6570       for (p = pbeg; p < pend; p++)
6571         {
6572           if (*p == '\n')
6573             eol_seen |= EOL_SEEN_LF;
6574           else if (*p == '\r')
6575             {
6576               if (p + 1 < pend && *(p + 1) == '\n')
6577                 {
6578                   eol_seen |= EOL_SEEN_CRLF;
6579                   p++;
6580                 }
6581               else
6582                 eol_seen |= EOL_SEEN_CR;
6583             }
6584         }
6585       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6586       if ((eol_seen & EOL_SEEN_CRLF) != 0
6587           && (eol_seen & EOL_SEEN_CR) != 0
6588           && (eol_seen & EOL_SEEN_LF) == 0)
6589         eol_seen = EOL_SEEN_CRLF;
6590       else if (eol_seen != EOL_SEEN_NONE
6591           && eol_seen != EOL_SEEN_LF
6592           && eol_seen != EOL_SEEN_CRLF
6593           && eol_seen != EOL_SEEN_CR)
6594         eol_seen = EOL_SEEN_LF;
6595       if (eol_seen != EOL_SEEN_NONE)
6596         eol_type = adjust_coding_eol_type (coding, eol_seen);
6597     }
6598
6599   if (EQ (eol_type, Qmac))
6600     {
6601       for (p = pbeg; p < pend; p++)
6602         if (*p == '\r')
6603           *p = '\n';
6604     }
6605   else if (EQ (eol_type, Qdos))
6606     {
6607       int n = 0;
6608
6609       if (NILP (coding->dst_object))
6610         {
6611           /* Start deleting '\r' from the tail to minimize the memory
6612              movement.  */
6613           for (p = pend - 2; p >= pbeg; p--)
6614             if (*p == '\r')
6615               {
6616                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6617                 n++;
6618               }
6619         }
6620       else
6621         {
6622           int pos_byte = coding->dst_pos_byte;
6623           int pos = coding->dst_pos;
6624           int pos_end = pos + coding->produced_char - 1;
6625
6626           while (pos < pos_end)
6627             {
6628               p = BYTE_POS_ADDR (pos_byte);
6629               if (*p == '\r' && p[1] == '\n')
6630                 {
6631                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6632                   n++;
6633                   pos_end--;
6634                 }
6635               pos++;
6636               if (coding->dst_multibyte)
6637                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6638               else
6639                 pos_byte++;
6640             }
6641         }
6642       coding->produced -= n;
6643       coding->produced_char -= n;
6644     }
6645 }
6646
6647
6648 /* Return a translation table (or list of them) from coding system
6649    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6650    decoding (ENCODEP is zero). */
6651
6652 static Lisp_Object
6653 get_translation_table (attrs, encodep, max_lookup)
6654      Lisp_Object attrs;
6655      int encodep, *max_lookup;
6656 {
6657   Lisp_Object standard, translation_table;
6658   Lisp_Object val;
6659
6660   if (NILP (Venable_character_translation))
6661     {
6662       if (max_lookup)
6663         *max_lookup = 0;
6664       return Qnil;
6665     }
6666   if (encodep)
6667     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6668       standard = Vstandard_translation_table_for_encode;
6669   else
6670     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6671       standard = Vstandard_translation_table_for_decode;
6672   if (NILP (translation_table))
6673     translation_table = standard;
6674   else
6675     {
6676       if (SYMBOLP (translation_table))
6677         translation_table = Fget (translation_table, Qtranslation_table);
6678       else if (CONSP (translation_table))
6679         {
6680           translation_table = Fcopy_sequence (translation_table);
6681           for (val = translation_table; CONSP (val); val = XCDR (val))
6682             if (SYMBOLP (XCAR (val)))
6683               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6684         }
6685       if (CHAR_TABLE_P (standard))
6686         {
6687           if (CONSP (translation_table))
6688             translation_table = nconc2 (translation_table,
6689                                         Fcons (standard, Qnil));
6690           else
6691             translation_table = Fcons (translation_table,
6692                                        Fcons (standard, Qnil));
6693         }
6694     }
6695
6696   if (max_lookup)
6697     {
6698       *max_lookup = 1;
6699       if (CHAR_TABLE_P (translation_table)
6700           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6701         {
6702           val = XCHAR_TABLE (translation_table)->extras[1];
6703           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6704             *max_lookup = XFASTINT (val);
6705         }
6706       else if (CONSP (translation_table))
6707         {
6708           Lisp_Object tail, val;
6709
6710           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6711             if (CHAR_TABLE_P (XCAR (tail))
6712                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6713               {
6714                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6715                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6716                   *max_lookup = XFASTINT (val);
6717               }
6718         }
6719     }
6720   return translation_table;
6721 }
6722
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables.

   C and TRANS must be lvalues.  When a table maps C to a character, C
   is replaced by that character and TRANS is set to Qnil.  When a
   table maps C to some other non-nil value, TRANS is set to that
   value; in a list, the lookup stops at the first such table.
   Otherwise TRANS is Qnil.

   TABLE is evaluated exactly once, and the temporaries have names
   ending in `_' so that they cannot capture identifiers used in the
   arguments.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    Lisp_Object lookup_table_ = (table);                                \
                                                                        \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (lookup_table_))                                   \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (lookup_table_, (c));                  \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (lookup_table_))                                     \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = lookup_table_; CONSP (lookup_tail_);        \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), (c));      \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6747
6748
6749 /* Return a translation of character(s) at BUF according to TRANS.
6750    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6751    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6752    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6753    translation is found, and Qnil if not found..
6754    If BUF is too short to lookup characters in FROM, return Qt.  */
6755
6756 static Lisp_Object
6757 get_translation (trans, buf, buf_end)
6758      Lisp_Object trans;
6759      int *buf, *buf_end;
6760 {
6761
6762   if (INTEGERP (trans))
6763     return trans;
6764   for (; CONSP (trans); trans = XCDR (trans))
6765     {
6766       Lisp_Object val = XCAR (trans);
6767       Lisp_Object from = XCAR (val);
6768       int len = ASIZE (from);
6769       int i;
6770
6771       for (i = 0; i < len; i++)
6772         {
6773           if (buf + i == buf_end)
6774             return Qt;
6775           if (XINT (AREF (from, i)) != buf[i])
6776             break;
6777         }
6778       if (i == len)
6779         return val;
6780     }
6781   return Qnil;
6782 }
6783
6784
/* Write characters to the destination of CODING, starting at
   CODING->destination + CODING->produced.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf: each non-negative element is a character, which is
   translated through TRANSLATION_TABLE, and each negative element
   starts an annotation datum, which is skipped.  Otherwise the bytes
   at CODING->source are copied, converting between unibyte and
   multibyte form when the source and the destination differ in that
   respect.

   LAST_BLOCK nonzero means that no more input will follow, so a
   translation that would need more characters than are available is
   not waited for.

   CODING->produced and CODING->produced_char are advanced by what was
   written.  The return value is the number of elements of
   CODING->charbuf left unprocessed (always 0 when
   CODING->chars_at_source is nonzero).  */

static int
produce_chars (coding, translation_table, last_block)
     struct coding_system *coding;
     Lisp_Object translation_table;
     int last_block;
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  EMACS_INT produced;
  EMACS_INT produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      /* When source and destination are the same object, the output
         must not run past the part of the source already consumed.  */
      if (EQ (coding->src_object, coding->dst_object))
        {
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf, i;

          if (c >= 0)
            {
              /* Number of source characters consumed and of
                 characters produced by this step.  */
              int from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      /* TRANS is ([FROM-CHAR ...] . TO).  */
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  /* Qt means the buffer is too short to decide; stop
                     and wait for more input unless this is the last
                     block.  */
                  else if (EQ (trans, Qt) && ! last_block)
                    break;
                }

              /* Make room for TO_NCHARS characters of maximum
                 length.  */
              if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
                {
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              for (i = 0; i < to_nchars; i++)
                {
                  /* The first character is already in C; the rest
                     come from the vector TRANS.  */
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* Multibyte source, unibyte destination.
                 NOTE(review): ONE_MORE_BYTE is defined elsewhere in
                 this file; it presumably uses the locals src,
                 src_end, src_base, multibytep and consumed_chars, and
                 jumps to no_more_source at the end of the source --
                 confirm against its definition.  */
              int multibytep = 1;
              EMACS_INT consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      /* When converting in place, the space freed by
                         the bytes just read may be enough.  */
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          EMACS_INT offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          /* Recompute the source pointers from the
                             saved offset.  */
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->src_bytes;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            /* Unibyte source, multibyte destination.  */
            while (src < src_end)
              {
                int multibytep = 1;
                int c = *src++;

                /* Keep room for two bytes.  */
                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        EMACS_INT offset = src - coding->source;
                        EMACS_INT more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        /* Recompute the source pointers from the
                           saved offset.  */
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->src_bytes;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                /* EMIT_ONE_BYTE is defined elsewhere in this file.  */
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Source and destination agree on multibyteness; copy the
             bytes as they are.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              EMACS_INT require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  EMACS_INT offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->src_bytes;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes written by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
6980
6981 /* Compose text in CODING->object according to the annotation data at
6982    CHARBUF.  CHARBUF is an array:
6983      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6984  */
6985
6986 static INLINE void
6987 produce_composition (coding, charbuf, pos)
6988      struct coding_system *coding;
6989      int *charbuf;
6990      EMACS_INT pos;
6991 {
6992   int len;
6993   EMACS_INT to;
6994   enum composition_method method;
6995   Lisp_Object components;
6996
6997   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6998   to = pos + charbuf[2];
6999   method = (enum composition_method) (charbuf[4]);
7000
7001   if (method == COMPOSITION_RELATIVE)
7002     components = Qnil;
7003   else
7004     {
7005       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7006       int i, j;
7007
7008       if (method == COMPOSITION_WITH_RULE)
7009         len = charbuf[2] * 3 - 2;
7010       charbuf += MAX_ANNOTATION_LENGTH;
7011       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7012       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7013         {
7014           if (charbuf[i] >= 0)
7015             args[j] = make_number (charbuf[i]);
7016           else
7017             {
7018               i++;
7019               args[j] = make_number (charbuf[i] % 0x100);
7020             }
7021         }
7022       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7023     }
7024   compose_text (pos, to, components, Qnil, coding->dst_object);
7025 }
7026
7027
7028 /* Put `charset' property on text in CODING->object according to
7029    the annotation data at CHARBUF.  CHARBUF is an array:
7030      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7031  */
7032
7033 static INLINE void
7034 produce_charset (coding, charbuf, pos)
7035      struct coding_system *coding;
7036      int *charbuf;
7037      EMACS_INT pos;
7038 {
7039   EMACS_INT from = pos - charbuf[2];
7040   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7041
7042   Fput_text_property (make_number (from), make_number (pos),
7043                       Qcharset, CHARSET_NAME (charset),
7044                       coding->dst_object);
7045 }
7046
7047
/* Number of elements first requested for the conversion work
   area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca, i.e. on the
   stack of the function that uses this macro, and record its length
   (in elements) in CODING->charbuf_size.  If an allocation yields
   NULL, retry with half the length as long as the length stays above
   1024.

   If no work area could be allocated, record
   CODING_RESULT_INSUFFICIENT_MEM and RETURN CODING->result FROM THE
   ENCLOSING FUNCTION; the macro may therefore be used only in a
   function that returns that value.

   NOTE(review): alloca normally does not report failure by returning
   NULL, so the retry loop and the failure branch may never take
   effect -- confirm for the supported platforms.

   The temporary has a name ending in `_' so that it cannot capture an
   identifier used in CODING.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int charbuf_len_ = CHARBUF_SIZE;                                    \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (charbuf_len_ > 1024)                                         \
      {                                                                 \
        (coding)->charbuf                                               \
          = (int *) alloca (sizeof (int) * charbuf_len_);               \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        charbuf_len_ >>= 1;                                             \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = charbuf_len_;                              \
  } while (0)
7069
7070
7071 static void
7072 produce_annotation (coding, pos)
7073      struct coding_system *coding;
7074      EMACS_INT pos;
7075 {
7076   int *charbuf = coding->charbuf;
7077   int *charbuf_end = charbuf + coding->charbuf_used;
7078
7079   if (NILP (coding->dst_object))
7080     return;
7081
7082   while (charbuf < charbuf_end)
7083     {
7084       if (*charbuf >= 0)
7085         pos++, charbuf++;
7086       else
7087         {
7088           int len = -*charbuf;
7089
7090           if (len > 2)
7091             switch (charbuf[1])
7092               {
7093               case CODING_ANNOTATE_COMPOSITION_MASK:
7094                 produce_composition (coding, charbuf, pos);
7095                 break;
7096               case CODING_ANNOTATE_CHARSET_MASK:
7097                 produce_charset (coding, charbuf, pos);
7098                 break;
7099               }
7100           charbuf += len;
7101         }
7102     }
7103 }
7104
/* Decode the data at CODING->src_object into CODING->dst_object.
   CODING->src_object is a buffer, a string, or nil.
   CODING->dst_object is a buffer.

   If CODING->src_object is a buffer, it must be the current buffer.
   In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer; otherwise, the source text is in the
   gap area of the buffer, and CODING->src_pos specifies the offset of
   the text from GPT (which must be the same as PT).  If this is the
   same buffer as CODING->dst_object, CODING->src_pos must be
   negative.

   If CODING->src_object is a string, CODING->src_pos is an index to
   that string.

   If CODING->src_object is nil, CODING->source must already point to
   the non-relocatable memory area.  In this case, CODING->src_pos is
   an offset from CODING->source.

   The decoded data is inserted at the current point of the buffer
   CODING->dst_object.

   On return CODING->consumed equals CODING->src_bytes whenever any
   source bytes were left undecoded (they are either flushed as raw
   bytes or stashed in CODING->carryover, see below).  The value
   returned is CODING->result.
*/

static int
decode_coding (coding)
     struct coding_system *coding;
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  struct ccl_spec cclspec;
  int carryover;
  int i;

  /* The source region of a buffer straddles the gap: move the gap to
     the start of the region (presumably so that the source bytes are
     contiguous in memory).  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  /* Make the destination buffer current with the gap at point, and
     switch off undo recording for the duration of the conversion.
     The saved undo list is put back at the end of this function.  */
  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      if (current_buffer != XBUFFER (coding->dst_object))
        set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);
      undo_list = current_buffer->undo_list;
      current_buffer->undo_list = Qt;
    }

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->errors = 0;

  /* NOTE(review): this macro contains a `return coding->result' for
     the allocation-failure case; taking that path would skip the
     restoration of current_buffer->undo_list done at the end.  */
  ALLOC_CONVERSION_WORK_AREA (coding);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  carryover = 0;
  if (coding->decoder == decode_coding_ccl)
    {
      /* The CCL decoder keeps its state in a spec that lives on this
         function's stack.  */
      coding->spec.ccl = &cclspec;
      setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
    }
  /* Main loop: fill the charbuf with the decoder, emit it with
     produce_chars, then apply annotations.  CARRYOVER is the value
     returned by produce_chars; that many trailing charbuf elements
     are moved to the front of the charbuf and become the initial
     contents for the next pass.  */
  do
    {
      /* Destination position of the text produced by this pass; used
         only for annotations.  */
      EMACS_INT pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  /* Go around again when the destination was too small, or when
     source bytes remain and the last pass ended normally or merely
     hit invalid source.  */
  while (coding->result == CODING_RESULT_INSUFFICIENT_DST
         || (coding->consumed < coding->src_bytes
             && (coding->result == CODING_RESULT_SUCCESS
                 || coding->result == CODING_RESULT_INVALID_SRC)));

  /* Emit whatever is still carried over, this time passing 1 as the
     last argument of produce_chars.  */
  if (carryover > 0)
    {
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  /* Deal with source bytes the decoder did not consume.  */
  coding->carryover_bytes = 0;
  if (coding->consumed < coding->src_bytes)
    {
      int nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  */
          coding->charbuf_used = 0;
          coding->chars_at_source = 0;

          while (nbytes-- > 0)
            {
              int c = *src++;

              /* Bytes with the high bit set are converted with
                 BYTE8_TO_CHAR; other bytes are stored as they are.  */
              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          /* No translation table is applied to the flushed bytes.  */
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  */
          unsigned char *p = coding->carryover;

          /* Defensive clamp: anything beyond the capacity of
             coding->carryover is dropped.  */
          if (nbytes > sizeof coding->carryover)
            nbytes = sizeof coding->carryover;
          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      /* Either way, the whole source now counts as consumed.  */
      coding->consumed = coding->src_bytes;
    }

  /* Convert end-of-line sequences unless the coding system's EOL type
     is `unix' or EOL conversion is inhibited.  */
  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
      && !inhibit_eol_conversion)
    decode_eol (coding);
  if (BUFFERP (coding->dst_object))
    {
      /* Re-enable undo and record the whole decoded text as a single
         insertion.  */
      current_buffer->undo_list = undo_list;
      record_insert (coding->dst_pos, coding->produced_char);
    }
  return coding->result;
}
7255
7256
/* Extract an annotation datum from a composition starting at POS and
   ending before LIMIT of CODING->src_object (buffer or string), store
   the data in BUF, set *STOP to a starting position of the next
   composition (if any) or to LIMIT, and return the address of the
   next element of BUF.

   If such an annotation is not found, set *STOP to a starting
   position of a composition after POS (if any) or to LIMIT, and
   return BUF.

   The components of a non-relative composition may be a vector, a
   string, a single integer or a list; any other type aborts.  */

static INLINE int *
handle_composition_annotation (pos, limit, coding, buf, stop)
     EMACS_INT pos, limit;
     struct coding_system *coding;
     int *buf;
     EMACS_INT *stop;
{
  EMACS_INT start, end;
  Lisp_Object prop;

  /* No composition at or after POS, or it extends past LIMIT:
     nothing more to look at before LIMIT.  */
  if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
      || end > limit)
    *stop = limit;
  /* The nearest composition begins later; come back when we get
     there.  */
  else if (start > pos)
    *stop = start;
  else
    {
      /* Here START <= POS.  Data is stored only for a composition
         that begins exactly at POS.  */
      if (start == pos)
        {
          /* We found a composition.  Store the corresponding
             annotation data in BUF.  */
          int *head = buf;
          enum composition_method method = COMPOSITION_METHOD (prop);
          int nchars = COMPOSITION_LENGTH (prop);

          /* NOTE(review): this macro is expected to write the
             annotation header through BUF and advance it, leaving
             HEAD at the first header element -- confirm against its
             definition.  */
          ADD_COMPOSITION_DATA (buf, nchars, 0, method);
          if (method != COMPOSITION_RELATIVE)
            {
              Lisp_Object components;
              int len, i, i_byte;

              /* Append every component after the header; LEN ends up
                 as the number of elements appended.  */
              components = COMPOSITION_COMPONENTS (prop);
              if (VECTORP (components))
                {
                  len = XVECTOR (components)->size;
                  for (i = 0; i < len; i++)
                    *buf++ = XINT (AREF (components, i));
                }
              else if (STRINGP (components))
                {
                  len = SCHARS (components);
                  i = i_byte = 0;
                  /* NOTE(review): the loop relies on the macro to
                     advance I (and I_BYTE) -- confirm.  */
                  while (i < len)
                    {
                      FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
                      buf++;
                    }
                }
              else if (INTEGERP (components))
                {
                  len = 1;
                  *buf++ = XINT (components);
                }
              else if (CONSP (components))
                {
                  for (len = 0; CONSP (components);
                       len++, components = XCDR (components))
                    *buf++ = XINT (XCAR (components));
                }
              else
                abort ();
              /* An annotation's first element holds its length
                 negated (produce_annotation reads it as -*charbuf),
                 so subtracting LEN presumably extends the annotation
                 to cover the components just appended.  */
              *head -= len;
            }
        }

      /* Look for the next composition, searching from the end of the
         one found above.  */
      if (find_composition (end, limit, &start, &end, &prop,
                            coding->src_object)
          && end <= limit)
        *stop = start;
      else
        *stop = limit;
    }
  return buf;
}
7341
7342
7343 /* Extract an annotation datum from a text property `charset' at POS of
7344    CODING->src_object (buffer of string), store the data in BUF, set
7345    *STOP to the position where the value of `charset' property changes
7346    (limiting by LIMIT), and return the address of the next element of
7347    BUF.
7348
7349    If the property value is nil, set *STOP to the position where the
7350    property value is non-nil (limiting by LIMIT), and return BUF.  */
7351
7352 static INLINE int *
7353 handle_charset_annotation (pos, limit, coding, buf, stop)
7354      EMACS_INT pos, limit;
7355      struct coding_system *coding;
7356      int *buf;
7357      EMACS_INT *stop;
7358 {
7359   Lisp_Object val, next;
7360   int id;
7361
7362   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7363   if (! NILP (val) && CHARSETP (val))
7364     id = XINT (CHARSET_SYMBOL_ID (val));
7365   else
7366     id = -1;
7367   ADD_CHARSET_DATA (buf, 0, id);
7368   next = Fnext_single_property_change (make_number (pos), Qcharset,
7369                                        coding->src_object,
7370                                        make_number (limit));
7371   *stop = XINT (next);
7372   return buf;
7373 }
7374
7375
/* Fill CODING->charbuf with characters read from CODING->source,
   starting CODING->consumed bytes into it, for the encoder to work
   on.

   On the way, convert end-of-line according to the coding system's
   EOL type (unless inhibit_eol_conversion is set), apply
   TRANSLATION_TABLE when it is non-nil, and insert charset
   annotations at the positions where they apply.  MAX_LOOKUP is the
   number of characters, including the current one, handed to
   get_translation for one lookup; it is used only when
   TRANSLATION_TABLE is non-nil.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated and CODING->chars_at_source is
   0.  */

static void
consume_chars (coding, translation_table, max_lookup)
     struct coding_system *coding;
     Lisp_Object translation_table;
     int max_lookup;
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  EMACS_INT pos = coding->src_pos + coding->consumed_char;
  EMACS_INT end_pos = coding->src_pos + coding->src_chars;
  int multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  /* STOP is the next position at which an annotation handler has to
     run (or END_POS); it is the smaller of the two per-kind stops.  */
  EMACS_INT stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  /* The lookahead buffer is needed only when translating.  */
  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  /* A vector EOL type is handled like `unix', i.e. without any EOL
     conversion (presumably it means the type is still undecided).  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  Because the
     flag is cleared here, the composition branch just below is never
     taken and STOP_COMPOSITION always starts out as END_POS.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion: keep slack at the end of the
     charbuf (presumably one slot for the extra '\r' of a DOS line
     ending plus room for one annotation).  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          /* Reaching END_POS is the only way the loop learns that the
             source is exhausted.  */
          if (pos == end_pos)
            break;
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      /* Fetch the next character C and advance SRC and POS.  */
      if (! multibytep)
        {
          EMACS_INT bytes;

          /* Unibyte source: raw-text and CCL encoders get the bytes
             one by one.  For other encoders a valid multibyte
             sequence is read as one character, with POS advanced by
             its byte length; any other byte goes through
             BYTE8_TO_CHAR.  */
          if (coding->encoder == encode_coding_raw_text
              || coding->encoder == encode_coding_ccl)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      /* In selective-display mode a '\r' is treated as a newline.  */
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              /* `dos': put '\r' in front of the '\n' (this is the
                 extra slot reserved above).  Any other non-unix type:
                 replace the '\n' by '\r'.  */
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          /* Number of source characters the translation replaces and
             number of characters it produces.  */
          int from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C plus up to MAX_LOOKUP - 1 following characters,
             reading ahead through P without moving SRC.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              /* (FROM . TO): FROM is a vector of source characters,
                 TO is one character or a vector of characters.  */
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  /* NOTE(review): SRC and POS have already moved past
                     C at this point (and a '\r' may have been stored
                     for `dos'), so this break -- like the one below --
                     leaves C counted as consumed without storing it.
                     Confirm that this is intended.  */
                  if (buf + to_nchars > buf_end)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the remaining source characters covered by the
             translation.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7512
7513
7514 /* Encode the text at CODING->src_object into CODING->dst_object.
7515    CODING->src_object is a buffer or a string.
7516    CODING->dst_object is a buffer or nil.
7517
7518    If CODING->src_object is a buffer, it must be the current buffer.
7519    In this case, if CODING->src_pos is positive, it is a position of
7520    the source text in the buffer, otherwise. the source text is in the
7521    gap area of the buffer, and coding->src_pos specifies the offset of
7522    the text from GPT (which must be the same as PT).  If this is the
7523    same buffer as CODING->dst_object, CODING->src_pos must be
7524    negative and CODING should not have `pre-write-conversion'.
7525
7526    If CODING->src_object is a string, CODING should not have
7527    `pre-write-conversion'.
7528
7529    If CODING->dst_object is a buffer, the encoded data is inserted at
7530    the current point of that buffer.
7531
7532    If CODING->dst_object is nil, the encoded data is placed at the
7533    memory area specified by CODING->destination.  */
7534
7535 static int
7536 encode_coding (coding)
7537      struct coding_system *coding;
7538 {
7539   Lisp_Object attrs;
7540   Lisp_Object translation_table;
7541   int max_lookup;
7542   struct ccl_spec cclspec;
7543
7544   attrs = CODING_ID_ATTRS (coding->id);
7545   if (coding->encoder == encode_coding_raw_text)
7546     translation_table = Qnil, max_lookup = 0;
7547   else
7548     translation_table = get_translation_table (attrs, 1, &max_lookup);
7549
7550   if (BUFFERP (coding->dst_object))
7551     {
7552       set_buffer_internal (XBUFFER (coding->dst_object));
7553       coding->dst_multibyte
7554         = ! NILP (current_buffer->enable_multibyte_characters);
7555     }
7556
7557   coding->consumed = coding->consumed_char = 0;
7558   coding->produced = coding->produced_char = 0;
7559   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7560   coding->errors = 0;
7561
7562   ALLOC_CONVERSION_WORK_AREA (coding);
7563
7564   if (coding->encoder == encode_coding_ccl)
7565     {
7566       coding->spec.ccl = &cclspec;
7567       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7568     }
7569   do {
7570     coding_set_source (coding);
7571     consume_chars (coding, translation_table, max_lookup);
7572     coding_set_destination (coding);
7573     (*(coding->encoder)) (coding);
7574   } while (coding->consumed_char < coding->src_chars);
7575
7576   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7577     insert_from_gap (coding->produced_char, coding->produced);
7578
7579   return (coding->result);
7580 }
7581
7582
/* Name (or base name) of work buffer for code conversion.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed (it is recreated on demand if it is
   found not to be live).  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments it on every call, and
   code_conversion_restore resets it to 0 when the reused work buffer
   is released.  */
static int reused_workbuf_in_use;
7595
7596
7597 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
7598    multibyteness of returning buffer.  */
7599
7600 static Lisp_Object
7601 make_conversion_work_buffer (multibyte)
7602      int multibyte;
7603 {
7604   Lisp_Object name, workbuf;
7605   struct buffer *current;
7606
7607   if (reused_workbuf_in_use++)
7608     {
7609       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7610       workbuf = Fget_buffer_create (name);
7611     }
7612   else
7613     {
7614       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7615         Vcode_conversion_reused_workbuf
7616           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7617       workbuf = Vcode_conversion_reused_workbuf;
7618     }
7619   current = current_buffer;
7620   set_buffer_internal (XBUFFER (workbuf));
7621   /* We can't allow modification hooks to run in the work buffer.  For
7622      instance, directory_files_internal assumes that file decoding
7623      doesn't compile new regexps.  */
7624   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7625   Ferase_buffer ();
7626   current_buffer->undo_list = Qt;
7627   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7628   set_buffer_internal (current);
7629   return workbuf;
7630 }
7631
7632
7633 static Lisp_Object
7634 code_conversion_restore (arg)
7635      Lisp_Object arg;
7636 {
7637   Lisp_Object current, workbuf;
7638   struct gcpro gcpro1;
7639
7640   GCPRO1 (arg);
7641   current = XCAR (arg);
7642   workbuf = XCDR (arg);
7643   if (! NILP (workbuf))
7644     {
7645       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7646         reused_workbuf_in_use = 0;
7647       else if (! NILP (Fbuffer_live_p (workbuf)))
7648         Fkill_buffer (workbuf);
7649     }
7650   set_buffer_internal (XBUFFER (current));
7651   UNGCPRO;
7652   return Qnil;
7653 }
7654
7655 Lisp_Object
7656 code_conversion_save (with_work_buf, multibyte)
7657      int with_work_buf, multibyte;
7658 {
7659   Lisp_Object workbuf = Qnil;
7660
7661   if (with_work_buf)
7662     workbuf = make_conversion_work_buffer (multibyte);
7663   record_unwind_protect (code_conversion_restore,
7664                          Fcons (Fcurrent_buffer (), workbuf));
7665   return workbuf;
7666 }
7667
7668 int
7669 decode_coding_gap (coding, chars, bytes)
7670      struct coding_system *coding;
7671      EMACS_INT chars, bytes;
7672 {
7673   int count = specpdl_ptr - specpdl;
7674   Lisp_Object attrs;
7675
7676   code_conversion_save (0, 0);
7677
7678   coding->src_object = Fcurrent_buffer ();
7679   coding->src_chars = chars;
7680   coding->src_bytes = bytes;
7681   coding->src_pos = -chars;
7682   coding->src_pos_byte = -bytes;
7683   coding->src_multibyte = chars < bytes;
7684   coding->dst_object = coding->src_object;
7685   coding->dst_pos = PT;
7686   coding->dst_pos_byte = PT_BYTE;
7687   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7688
7689   if (CODING_REQUIRE_DETECTION (coding))
7690     detect_coding (coding);
7691
7692   coding->mode |= CODING_MODE_LAST_BLOCK;
7693   current_buffer->text->inhibit_shrinking = 1;
7694   decode_coding (coding);
7695   current_buffer->text->inhibit_shrinking = 0;
7696
7697   attrs = CODING_ID_ATTRS (coding->id);
7698   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7699     {
7700       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7701       Lisp_Object val;
7702
7703       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7704       val = call1 (CODING_ATTR_POST_READ (attrs),
7705                    make_number (coding->produced_char));
7706       CHECK_NATNUM (val);
7707       coding->produced_char += Z - prev_Z;
7708       coding->produced += Z_BYTE - prev_Z_BYTE;
7709     }
7710
7711   unbind_to (count, Qnil);
7712   return coding->result;
7713 }
7714
7715 int
7716 encode_coding_gap (coding, chars, bytes)
7717      struct coding_system *coding;
7718      EMACS_INT chars, bytes;
7719 {
7720   int count = specpdl_ptr - specpdl;
7721
7722   code_conversion_save (0, 0);
7723
7724   coding->src_object = Fcurrent_buffer ();
7725   coding->src_chars = chars;
7726   coding->src_bytes = bytes;
7727   coding->src_pos = -chars;
7728   coding->src_pos_byte = -bytes;
7729   coding->src_multibyte = chars < bytes;
7730   coding->dst_object = coding->src_object;
7731   coding->dst_pos = PT;
7732   coding->dst_pos_byte = PT_BYTE;
7733
7734   encode_coding (coding);
7735
7736   unbind_to (count, Qnil);
7737   return coding->result;
7738 }
7739
7740
7741 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7742    SRC_OBJECT into DST_OBJECT by coding context CODING.
7743
7744    SRC_OBJECT is a buffer, a string, or Qnil.
7745
7746    If it is a buffer, the text is at point of the buffer.  FROM and TO
7747    are positions in the buffer.
7748
7749    If it is a string, the text is at the beginning of the string.
7750    FROM and TO are indices to the string.
7751
7752    If it is nil, the text is at coding->source.  FROM and TO are
7753    indices to coding->source.
7754
7755    DST_OBJECT is a buffer, Qt, or Qnil.
7756
7757    If it is a buffer, the decoded text is inserted at point of the
7758    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7759    is deleted.
7760
7761    If it is Qt, a string is made from the decoded text, and
7762    set in CODING->dst_object.
7763
7764    If it is Qnil, the decoded text is stored at CODING->destination.
7765    The caller must allocate CODING->dst_bytes bytes at
7766    CODING->destination by xmalloc.  If the decoded text is longer than
7767    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7768  */
7769
7770 void
7771 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7772                       dst_object)
7773      struct coding_system *coding;
7774      Lisp_Object src_object;
7775      EMACS_INT from, from_byte, to, to_byte;
7776      Lisp_Object dst_object;
7777 {
7778   int count = specpdl_ptr - specpdl;
7779   unsigned char *destination;
7780   EMACS_INT dst_bytes;
7781   EMACS_INT chars = to - from;
7782   EMACS_INT bytes = to_byte - from_byte;
7783   Lisp_Object attrs;
7784   int saved_pt = -1, saved_pt_byte;
7785   int need_marker_adjustment = 0;
7786   Lisp_Object old_deactivate_mark;
7787
7788   old_deactivate_mark = Vdeactivate_mark;
7789
7790   if (NILP (dst_object))
7791     {
7792       destination = coding->destination;
7793       dst_bytes = coding->dst_bytes;
7794     }
7795
7796   coding->src_object = src_object;
7797   coding->src_chars = chars;
7798   coding->src_bytes = bytes;
7799   coding->src_multibyte = chars < bytes;
7800
7801   if (STRINGP (src_object))
7802     {
7803       coding->src_pos = from;
7804       coding->src_pos_byte = from_byte;
7805     }
7806   else if (BUFFERP (src_object))
7807     {
7808       set_buffer_internal (XBUFFER (src_object));
7809       if (from != GPT)
7810         move_gap_both (from, from_byte);
7811       if (EQ (src_object, dst_object))
7812         {
7813           struct Lisp_Marker *tail;
7814
7815           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7816             {
7817               tail->need_adjustment
7818                 = tail->charpos == (tail->insertion_type ? from : to);
7819               need_marker_adjustment |= tail->need_adjustment;
7820             }
7821           saved_pt = PT, saved_pt_byte = PT_BYTE;
7822           TEMP_SET_PT_BOTH (from, from_byte);
7823           current_buffer->text->inhibit_shrinking = 1;
7824           del_range_both (from, from_byte, to, to_byte, 1);
7825           coding->src_pos = -chars;
7826           coding->src_pos_byte = -bytes;
7827         }
7828       else
7829         {
7830           coding->src_pos = from;
7831           coding->src_pos_byte = from_byte;
7832         }
7833     }
7834
7835   if (CODING_REQUIRE_DETECTION (coding))
7836     detect_coding (coding);
7837   attrs = CODING_ID_ATTRS (coding->id);
7838
7839   if (EQ (dst_object, Qt)
7840       || (! NILP (CODING_ATTR_POST_READ (attrs))
7841           && NILP (dst_object)))
7842     {
7843       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7844       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7845       coding->dst_pos = BEG;
7846       coding->dst_pos_byte = BEG_BYTE;
7847     }
7848   else if (BUFFERP (dst_object))
7849     {
7850       code_conversion_save (0, 0);
7851       coding->dst_object = dst_object;
7852       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7853       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7854       coding->dst_multibyte
7855         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7856     }
7857   else
7858     {
7859       code_conversion_save (0, 0);
7860       coding->dst_object = Qnil;
7861       /* Most callers presume this will return a multibyte result, and they
7862          won't use `binary' or `raw-text' anyway, so let's not worry about
7863          CODING_FOR_UNIBYTE.  */
7864       coding->dst_multibyte = 1;
7865     }
7866
7867   decode_coding (coding);
7868
7869   if (BUFFERP (coding->dst_object))
7870     set_buffer_internal (XBUFFER (coding->dst_object));
7871
7872   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7873     {
7874       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7875       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7876       Lisp_Object val;
7877
7878       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7879       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7880               old_deactivate_mark);
7881       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7882                         make_number (coding->produced_char));
7883       UNGCPRO;
7884       CHECK_NATNUM (val);
7885       coding->produced_char += Z - prev_Z;
7886       coding->produced += Z_BYTE - prev_Z_BYTE;
7887     }
7888
7889   if (EQ (dst_object, Qt))
7890     {
7891       coding->dst_object = Fbuffer_string ();
7892     }
7893   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7894     {
7895       set_buffer_internal (XBUFFER (coding->dst_object));
7896       if (dst_bytes < coding->produced)
7897         {
7898           destination = xrealloc (destination, coding->produced);
7899           if (! destination)
7900             {
7901               record_conversion_result (coding,
7902                                         CODING_RESULT_INSUFFICIENT_MEM);
7903               unbind_to (count, Qnil);
7904               return;
7905             }
7906           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7907             move_gap_both (BEGV, BEGV_BYTE);
7908           bcopy (BEGV_ADDR, destination, coding->produced);
7909           coding->destination = destination;
7910         }
7911     }
7912
7913   if (saved_pt >= 0)
7914     {
7915       /* This is the case of:
7916          (BUFFERP (src_object) && EQ (src_object, dst_object))
7917          As we have moved PT while replacing the original buffer
7918          contents, we must recover it now.  */
7919       set_buffer_internal (XBUFFER (src_object));
7920       current_buffer->text->inhibit_shrinking = 0;
7921       if (saved_pt < from)
7922         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7923       else if (saved_pt < from + chars)
7924         TEMP_SET_PT_BOTH (from, from_byte);
7925       else if (! NILP (current_buffer->enable_multibyte_characters))
7926         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7927                           saved_pt_byte + (coding->produced - bytes));
7928       else
7929         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7930                           saved_pt_byte + (coding->produced - bytes));
7931
7932       if (need_marker_adjustment)
7933         {
7934           struct Lisp_Marker *tail;
7935
7936           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7937             if (tail->need_adjustment)
7938               {
7939                 tail->need_adjustment = 0;
7940                 if (tail->insertion_type)
7941                   {
7942                     tail->bytepos = from_byte;
7943                     tail->charpos = from;
7944                   }
7945                 else
7946                   {
7947                     tail->bytepos = from_byte + coding->produced;
7948                     tail->charpos
7949                       = (NILP (current_buffer->enable_multibyte_characters)
7950                          ? tail->bytepos : from + coding->produced_char);
7951                   }
7952               }
7953         }
7954     }
7955
7956   Vdeactivate_mark = old_deactivate_mark;
7957   unbind_to (count, coding->dst_object);
7958 }
7959
7960
7961 void
7962 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7963                       dst_object)
7964      struct coding_system *coding;
7965      Lisp_Object src_object;
7966      EMACS_INT from, from_byte, to, to_byte;
7967      Lisp_Object dst_object;
7968 {
7969   int count = specpdl_ptr - specpdl;
7970   EMACS_INT chars = to - from;
7971   EMACS_INT bytes = to_byte - from_byte;
7972   Lisp_Object attrs;
7973   int saved_pt = -1, saved_pt_byte;
7974   int need_marker_adjustment = 0;
7975   int kill_src_buffer = 0;
7976   Lisp_Object old_deactivate_mark;
7977
7978   old_deactivate_mark = Vdeactivate_mark;
7979
7980   coding->src_object = src_object;
7981   coding->src_chars = chars;
7982   coding->src_bytes = bytes;
7983   coding->src_multibyte = chars < bytes;
7984
7985   attrs = CODING_ID_ATTRS (coding->id);
7986
7987   if (EQ (src_object, dst_object))
7988     {
7989       struct Lisp_Marker *tail;
7990
7991       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7992         {
7993           tail->need_adjustment
7994             = tail->charpos == (tail->insertion_type ? from : to);
7995           need_marker_adjustment |= tail->need_adjustment;
7996         }
7997     }
7998
7999   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8000     {
8001       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8002       set_buffer_internal (XBUFFER (coding->src_object));
8003       if (STRINGP (src_object))
8004         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8005       else if (BUFFERP (src_object))
8006         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8007       else
8008         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
8009
8010       if (EQ (src_object, dst_object))
8011         {
8012           set_buffer_internal (XBUFFER (src_object));
8013           saved_pt = PT, saved_pt_byte = PT_BYTE;
8014           del_range_both (from, from_byte, to, to_byte, 1);
8015           set_buffer_internal (XBUFFER (coding->src_object));
8016         }
8017
8018       {
8019         Lisp_Object args[3];
8020         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8021
8022         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8023                 old_deactivate_mark);
8024         args[0] = CODING_ATTR_PRE_WRITE (attrs);
8025         args[1] = make_number (BEG);
8026         args[2] = make_number (Z);
8027         safe_call (3, args);
8028         UNGCPRO;
8029       }
8030       if (XBUFFER (coding->src_object) != current_buffer)
8031         kill_src_buffer = 1;
8032       coding->src_object = Fcurrent_buffer ();
8033       if (BEG != GPT)
8034         move_gap_both (BEG, BEG_BYTE);
8035       coding->src_chars = Z - BEG;
8036       coding->src_bytes = Z_BYTE - BEG_BYTE;
8037       coding->src_pos = BEG;
8038       coding->src_pos_byte = BEG_BYTE;
8039       coding->src_multibyte = Z < Z_BYTE;
8040     }
8041   else if (STRINGP (src_object))
8042     {
8043       code_conversion_save (0, 0);
8044       coding->src_pos = from;
8045       coding->src_pos_byte = from_byte;
8046     }
8047   else if (BUFFERP (src_object))
8048     {
8049       code_conversion_save (0, 0);
8050       set_buffer_internal (XBUFFER (src_object));
8051       if (EQ (src_object, dst_object))
8052         {
8053           saved_pt = PT, saved_pt_byte = PT_BYTE;
8054           coding->src_object = del_range_1 (from, to, 1, 1);
8055           coding->src_pos = 0;
8056           coding->src_pos_byte = 0;
8057         }
8058       else
8059         {
8060           if (from < GPT && to >= GPT)
8061             move_gap_both (from, from_byte);
8062           coding->src_pos = from;
8063           coding->src_pos_byte = from_byte;
8064         }
8065     }
8066   else
8067     code_conversion_save (0, 0);
8068
8069   if (BUFFERP (dst_object))
8070     {
8071       coding->dst_object = dst_object;
8072       if (EQ (src_object, dst_object))
8073         {
8074           coding->dst_pos = from;
8075           coding->dst_pos_byte = from_byte;
8076         }
8077       else
8078         {
8079           struct buffer *current = current_buffer;
8080
8081           set_buffer_temp (XBUFFER (dst_object));
8082           coding->dst_pos = PT;
8083           coding->dst_pos_byte = PT_BYTE;
8084           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8085           set_buffer_temp (current);
8086         }
8087       coding->dst_multibyte
8088         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8089     }
8090   else if (EQ (dst_object, Qt))
8091     {
8092       coding->dst_object = Qnil;
8093       coding->dst_bytes = coding->src_chars;
8094       if (coding->dst_bytes == 0)
8095         coding->dst_bytes = 1;
8096       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8097       coding->dst_multibyte = 0;
8098     }
8099   else
8100     {
8101       coding->dst_object = Qnil;
8102       coding->dst_multibyte = 0;
8103     }
8104
8105   encode_coding (coding);
8106
8107   if (EQ (dst_object, Qt))
8108     {
8109       if (BUFFERP (coding->dst_object))
8110         coding->dst_object = Fbuffer_string ();
8111       else
8112         {
8113           coding->dst_object
8114             = make_unibyte_string ((char *) coding->destination,
8115                                    coding->produced);
8116           xfree (coding->destination);
8117         }
8118     }
8119
8120   if (saved_pt >= 0)
8121     {
8122       /* This is the case of:
8123          (BUFFERP (src_object) && EQ (src_object, dst_object))
8124          As we have moved PT while replacing the original buffer
8125          contents, we must recover it now.  */
8126       set_buffer_internal (XBUFFER (src_object));
8127       if (saved_pt < from)
8128         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8129       else if (saved_pt < from + chars)
8130         TEMP_SET_PT_BOTH (from, from_byte);
8131       else if (! NILP (current_buffer->enable_multibyte_characters))
8132         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8133                           saved_pt_byte + (coding->produced - bytes));
8134       else
8135         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8136                           saved_pt_byte + (coding->produced - bytes));
8137
8138       if (need_marker_adjustment)
8139         {
8140           struct Lisp_Marker *tail;
8141
8142           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8143             if (tail->need_adjustment)
8144               {
8145                 tail->need_adjustment = 0;
8146                 if (tail->insertion_type)
8147                   {
8148                     tail->bytepos = from_byte;
8149                     tail->charpos = from;
8150                   }
8151                 else
8152                   {
8153                     tail->bytepos = from_byte + coding->produced;
8154                     tail->charpos
8155                       = (NILP (current_buffer->enable_multibyte_characters)
8156                          ? tail->bytepos : from + coding->produced_char);
8157                   }
8158               }
8159         }
8160     }
8161
8162   if (kill_src_buffer)
8163     Fkill_buffer (coding->src_object);
8164
8165   Vdeactivate_mark = old_deactivate_mark;
8166   unbind_to (count, Qnil);
8167 }
8168
8169
8170 Lisp_Object
8171 preferred_coding_system ()
8172 {
8173   int id = coding_categories[coding_priorities[0]].id;
8174
8175   return CODING_ID_NAME (id);
8176 }
8177
8178 \f
8179 #ifdef emacs
8180 /*** 11. Emacs Lisp library functions ***/
8181
8182 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8183        doc: /* Return t if OBJECT is nil or a coding-system.
8184 See the documentation of `define-coding-system' for information
8185 about coding-system objects.  */)
8186      (object)
8187      Lisp_Object object;
8188 {
8189   if (NILP (object)
8190       || CODING_SYSTEM_ID (object) >= 0)
8191     return Qt;
8192   if (! SYMBOLP (object)
8193       || NILP (Fget (object, Qcoding_system_define_form)))
8194     return Qnil;
8195   return Qt;
8196 }
8197
8198 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8199        Sread_non_nil_coding_system, 1, 1, 0,
8200        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8201      (prompt)
8202      Lisp_Object prompt;
8203 {
8204   Lisp_Object val;
8205   do
8206     {
8207       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8208                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8209     }
8210   while (SCHARS (val) == 0);
8211   return (Fintern (val, Qnil));
8212 }
8213
8214 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8215        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8216 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8217 Ignores case when completing coding systems (all Emacs coding systems
8218 are lower-case).  */)
8219      (prompt, default_coding_system)
8220      Lisp_Object prompt, default_coding_system;
8221 {
8222   Lisp_Object val;
8223   int count = SPECPDL_INDEX ();
8224
8225   if (SYMBOLP (default_coding_system))
8226     default_coding_system = SYMBOL_NAME (default_coding_system);
8227   specbind (Qcompletion_ignore_case, Qt);
8228   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8229                           Qt, Qnil, Qcoding_system_history,
8230                           default_coding_system, Qnil);
8231   unbind_to (count, Qnil);
8232   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8233 }
8234
8235 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8236        1, 1, 0,
8237        doc: /* Check validity of CODING-SYSTEM.
8238 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8239 It is valid if it is nil or a symbol defined as a coding system by the
8240 function `define-coding-system'.  */)
8241   (coding_system)
8242      Lisp_Object coding_system;
8243 {
8244   Lisp_Object define_form;
8245
8246   define_form = Fget (coding_system, Qcoding_system_define_form);
8247   if (! NILP (define_form))
8248     {
8249       Fput (coding_system, Qcoding_system_define_form, Qnil);
8250       safe_eval (define_form);
8251     }
8252   if (!NILP (Fcoding_system_p (coding_system)))
8253     return coding_system;
8254   xsignal1 (Qcoding_system_error, coding_system);
8255 }
8256
8257 \f
8258 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8259    HIGHEST is nonzero, return the coding system of the highest
8260    priority among the detected coding systems.  Otherwise return a
8261    list of detected coding systems sorted by their priorities.  If
8262    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8263    multibyte form but contain only ASCII and eight-bit chars.
8264    Otherwise, the bytes are raw bytes.
8265
8266    CODING-SYSTEM controls the detection as below:
8267
8268    If it is nil, detect both text-format and eol-format.  If the
8269    text-format part of CODING-SYSTEM is already specified
8270    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8271    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8272    detect only text-format.  */
8273
8274 Lisp_Object
8275 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
8276                       coding_system)
8277      const unsigned char *src;
8278      EMACS_INT src_chars, src_bytes;
8279      int highest;
8280      int multibytep;
8281      Lisp_Object coding_system;
8282 {
8283   const unsigned char *src_end = src + src_bytes;
8284   Lisp_Object attrs, eol_type;
8285   Lisp_Object val = Qnil;
8286   struct coding_system coding;
8287   int id;
8288   struct coding_detection_info detect_info;
8289   enum coding_category base_category;
8290   int null_byte_found = 0, eight_bit_found = 0;
8291
       /* A nil CODING_SYSTEM is treated as `undecided'.  */
8292   if (NILP (coding_system))
8293     coding_system = Qundecided;
8294   setup_coding_system (coding_system, &coding);
8295   attrs = CODING_ID_ATTRS (coding.id);
8296   eol_type = CODING_ID_EOL_TYPE (coding.id);
8297   coding_system = CODING_ATTR_BASE_NAME (attrs);
8298
8299   coding.source = src;
8300   coding.src_chars = src_chars;
8301   coding.src_bytes = src_bytes;
8302   coding.src_multibyte = multibytep;
8303   coding.consumed = 0;
8304   coding.mode |= CODING_MODE_LAST_BLOCK;
8305   coding.head_ascii = 0;
8306
8307   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8308
8309   /* At first, detect text-format if necessary.  */
8310   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8311   if (base_category == coding_category_undecided)
8312     {
8313       enum coding_category category;
8314       struct coding_system *this;
8315       int c, i;
8316
8317       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* While scanning, coding.head_ascii counts the 7-bit bytes
              seen before the first 8-bit byte.  */
8318       for (; src < src_end; src++)
8319         {
8320           c = *src;
8321           if (c & 0x80)
8322             {
8323               eight_bit_found = 1;
                   /* Both a null byte and an 8-bit byte have been
                      seen; stop scanning.  */
8324               if (null_byte_found)
8325                 break;
8326             }
8327           else if (c < 0x20)
8328             {
                   /* The ISO-2022 detector is invoked from here only
                      while detect_info.checked is still zero.  */
8329               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8330                   && ! inhibit_iso_escape_detection
8331                   && ! detect_info.checked)
8332                 {
8333                   if (detect_coding_iso_2022 (&coding, &detect_info))
8334                     {
8335                       /* We have scanned the whole data.  */
8336                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8337                         {
8338                           /* We didn't find an 8-bit code.  We may
8339                              have found a null-byte, but it's very
8340                              rare that a binary file conforms to
8341                              ISO-2022.  */
8342                           src = src_end;
8343                           coding.head_ascii = src - coding.source;
8344                         }
8345                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8346                       break;
8347                     }
8348                 }
8349               else if (! c && !inhibit_null_byte_detection)
8350                 {
8351                   null_byte_found = 1;
8352                   if (eight_bit_found)
8353                     break;
8354                 }
                   /* A control byte that did not end the scan still
                      counts toward the leading 7-bit run.  */
8355               if (! eight_bit_found)
8356                 coding.head_ascii++;
8357             }
8358           else if (! eight_bit_found)
8359             coding.head_ascii++;
8360         }
8361
           /* Go through the categories only if the scan saw a null or
              8-bit byte, stopped before the end of the data, or the
              ISO-2022 detector already reported a find.  */
8362       if (null_byte_found || eight_bit_found
8363           || coding.head_ascii < coding.src_bytes
8364           || detect_info.found)
8365         {
8366           if (coding.head_ascii == coding.src_bytes)
8367             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8368             for (i = 0; i < coding_category_raw_text; i++)
8369               {
8370                 category = coding_priorities[i];
8371                 this = coding_categories + category;
8372                 if (detect_info.found & (1 << category))
8373                   break;
8374               }
8375           else
8376             {
8377               if (null_byte_found)
8378                 {
                       /* Mark every category outside
                          CATEGORY_MASK_UTF_16 as already checked and
                          rejected.  */
8379                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8380                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8381                 }
                   /* Visit the categories in priority order.  */
8382               for (i = 0; i < coding_category_raw_text; i++)
8383                 {
8384                   category = coding_priorities[i];
8385                   this = coding_categories + category;
8386
8387                   if (this->id < 0)
8388                     {
8389                       /* No coding system of this category is defined.  */
8390                       detect_info.rejected |= (1 << category);
8391                     }
8392                   else if (category >= coding_category_raw_text)
8393                     continue;
8394                   else if (detect_info.checked & (1 << category))
8395                     {
                           /* Already judged by an earlier detector
                              call; stop if it was found and only the
                              highest-priority answer is wanted.  */
8396                       if (highest
8397                           && (detect_info.found & (1 << category)))
8398                         break;
8399                     }
                       /* Not checked yet: run this category's own
                          detector.  */
8400                   else if ((*(this->detector)) (&coding, &detect_info)
8401                            && highest
8402                            && (detect_info.found & (1 << category)))
8403                     {
                           /* For utf-16-auto, switch CATEGORY to the
                              concrete byte order that was found.  */
8404                       if (category == coding_category_utf_16_auto)
8405                         {
8406                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8407                             category = coding_category_utf_16_le;
8408                           else
8409                             category = coding_category_utf_16_be;
8410                         }
8411                       break;
8412                     }
8413                 }
8414             }
8415         }
8416
           /* Turn the detection state into VAL, a list of coding-system
              ids (converted to symbols further below).  */
8417       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8418           || null_byte_found)
8419         {
               /* Everything was rejected, or a null byte was seen:
                  answer `no-conversion'.  */
8420           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8421           id = CODING_SYSTEM_ID (Qno_conversion);
8422           val = Fcons (make_number (id), Qnil);
8423         }
8424       else if (! detect_info.rejected && ! detect_info.found)
8425         {
               /* Nothing rejected and nothing found: answer with the
                  coding system of the `undecided' category.  */
8426           detect_info.found = CATEGORY_MASK_ANY;
8427           id = coding_categories[coding_category_undecided].id;
8428           val = Fcons (make_number (id), Qnil);
8429         }
8430       else if (highest)
8431         {
8432           if (detect_info.found)
8433             {
                   /* CATEGORY and THIS are as the loop above left
                      them.  */
8434               detect_info.found = 1 << category;
8435               val = Fcons (make_number (this->id), Qnil);
8436             }
8437           else
                 /* Nothing positively found: take the first category
                    in priority order that was not rejected.  */
8438             for (i = 0; i < coding_category_raw_text; i++)
8439               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8440                 {
8441                   detect_info.found = 1 << coding_priorities[i];
8442                   id = coding_categories[coding_priorities[i]].id;
8443                   val = Fcons (make_number (id), Qnil);
8444                   break;
8445                 }
8446         }
8447       else
8448         {
8449           int mask = detect_info.rejected | detect_info.found;
8450           int found = 0;
8451
               /* Both loops walk from lowest to highest priority and
                  push onto the front of VAL, so each group ends up in
                  priority order.  First the categories that were
                  neither rejected nor found...  */
8452           for (i = coding_category_raw_text - 1; i >= 0; i--)
8453             {
8454               category = coding_priorities[i];
8455               if (! (mask & (1 << category)))
8456                 {
8457                   found |= 1 << category;
8458                   id = coding_categories[category].id;
8459                   if (id >= 0)
8460                     val = Fcons (make_number (id), val);
8461                 }
8462             }
               /* ...then the found categories, which therefore come
                  first in the final list.  */
8463           for (i = coding_category_raw_text - 1; i >= 0; i--)
8464             {
8465               category = coding_priorities[i];
8466               if (detect_info.found & (1 << category))
8467                 {
8468                   id = coding_categories[category].id;
8469                   val = Fcons (make_number (id), val);
8470                 }
8471             }
8472           detect_info.found |= found;
8473         }
8474     }
8475   else if (base_category == coding_category_utf_8_auto)
8476     {
           /* Only decide between the with-signature and
              without-signature UTF-8 categories.  */
8477       if (detect_coding_utf_8 (&coding, &detect_info))
8478         {
8479           struct coding_system *this;
8480
8481           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8482             this = coding_categories + coding_category_utf_8_sig;
8483           else
8484             this = coding_categories + coding_category_utf_8_nosig;
8485           val = Fcons (make_number (this->id), Qnil);
8486         }
8487     }
8488   else if (base_category == coding_category_utf_16_auto)
8489     {
           /* Only decide among the four UTF-16 categories.  */
8490       if (detect_coding_utf_16 (&coding, &detect_info))
8491         {
8492           struct coding_system *this;
8493
8494           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8495             this = coding_categories + coding_category_utf_16_le;
8496           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8497             this = coding_categories + coding_category_utf_16_be;
8498           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8499             this = coding_categories + coding_category_utf_16_be_nosig;
8500           else
8501             this = coding_categories + coding_category_utf_16_le_nosig;
8502           val = Fcons (make_number (this->id), Qnil);
8503         }
8504     }
8505   else
8506     {
           /* The text-format is already specified: report the given
              coding system itself.  */
8507       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8508       val = Fcons (make_number (coding.id), Qnil);
8509     }
8510
8511   /* Then, detect eol-format if necessary.  */
8512   {
         /* One result for each of the three families handled below;
            -1 means no eol format has been determined for it.  */
8513     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8514     Lisp_Object tail;
8515
         /* A vector EOL_TYPE means the eol-format is to be detected
            from the data; otherwise it is fixed by the given coding
            system.  */
8516     if (VECTORP (eol_type))
8517       {
8518         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8519           {
8520             if (null_byte_found)
8521               normal_eol = EOL_SEEN_LF;
8522             else
8523               normal_eol = detect_eol (coding.source, src_bytes,
8524                                        coding_category_raw_text);
8525           }
8526         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8527                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8528           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8529                                       coding_category_utf_16_be);
8530         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8531                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8532           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8533                                       coding_category_utf_16_le);
8534       }
8535     else
8536       {
8537         if (EQ (eol_type, Qunix))
8538           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8539         else if (EQ (eol_type, Qdos))
8540           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8541         else
8542           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8543       }
8544
         /* Replace each id in VAL by a coding-system symbol: element
            0, 1 or 2 of that coding system's eol-type vector for LF,
            CRLF or CR respectively, or else the name registered for
            the id.  */
8545     for (tail = val; CONSP (tail); tail = XCDR (tail))
8546       {
8547         enum coding_category category;
8548         int this_eol;
8549
8550         id = XINT (XCAR (tail));
8551         attrs = CODING_ID_ATTRS (id);
8552         category = XINT (CODING_ATTR_CATEGORY (attrs));
8553         eol_type = CODING_ID_EOL_TYPE (id);
8554         if (VECTORP (eol_type))
8555           {
8556             if (category == coding_category_utf_16_be
8557                 || category == coding_category_utf_16_be_nosig)
8558               this_eol = utf_16_be_eol;
8559             else if (category == coding_category_utf_16_le
8560                      || category == coding_category_utf_16_le_nosig)
8561               this_eol = utf_16_le_eol;
8562             else
8563               this_eol = normal_eol;
8564
8565             if (this_eol == EOL_SEEN_LF)
8566               XSETCAR (tail, AREF (eol_type, 0));
8567             else if (this_eol == EOL_SEEN_CRLF)
8568               XSETCAR (tail, AREF (eol_type, 1));
8569             else if (this_eol == EOL_SEEN_CR)
8570               XSETCAR (tail, AREF (eol_type, 2));
8571             else
8572               XSETCAR (tail, CODING_ID_NAME (id));
8573           }
8574         else
8575           XSETCAR (tail, CODING_ID_NAME (id));
8576       }
8577   }
8578
       /* With HIGHEST, return just the first element of VAL (or nil
          when VAL is empty); otherwise return the whole list.  */
8579   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8580 }
8581
8582
8583 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8584        2, 3, 0,
8585        doc: /* Detect coding system of the text in the region between START and END.
8586 Return a list of possible coding systems ordered by priority.
8587 The coding systems to try and their priorities follows what
8588 the function `coding-system-priority-list' (which see) returns.
8589
8590 If only ASCII characters are found (except for such ISO-2022 control
8591 characters as ESC), it returns a list of single element `undecided'
8592 or its subsidiary coding system according to a detected end-of-line
8593 format.
8594
8595 If optional argument HIGHEST is non-nil, return the coding system of
8596 highest priority.  */)
8597      (start, end, highest)
8598      Lisp_Object start, end, highest;
8599 {
8600   int from, to;
8601   int from_byte, to_byte;
8602
8603   CHECK_NUMBER_COERCE_MARKER (start);
8604   CHECK_NUMBER_COERCE_MARKER (end);
8605
8606   validate_region (&start, &end);
8607   from = XINT (start), to = XINT (end);
8608   from_byte = CHAR_TO_BYTE (from);
8609   to_byte = CHAR_TO_BYTE (to);
8610
8611   if (from < GPT && to >= GPT)
8612     move_gap_both (to, to_byte);
8613
8614   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8615                                to - from, to_byte - from_byte,
8616                                !NILP (highest),
8617                                !NILP (current_buffer
8618                                       ->enable_multibyte_characters),
8619                                Qnil);
8620 }
8621
8622 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8623        1, 2, 0,
8624        doc: /* Detect coding system of the text in STRING.
8625 Return a list of possible coding systems ordered by priority.
8626 The coding systems to try and their priorities follows what
8627 the function `coding-system-priority-list' (which see) returns.
8628
8629 If only ASCII characters are found (except for such ISO-2022 control
8630 characters as ESC), it returns a list of single element `undecided'
8631 or its subsidiary coding system according to a detected end-of-line
8632 format.
8633
8634 If optional argument HIGHEST is non-nil, return the coding system of
8635 highest priority.  */)
8636      (string, highest)
8637      Lisp_Object string, highest;
8638 {
8639   CHECK_STRING (string);
8640
8641   return detect_coding_system (SDATA (string),
8642                                SCHARS (string), SBYTES (string),
8643                                !NILP (highest), STRING_MULTIBYTE (string),
8644                                Qnil);
8645 }
8646
8647
8648 static INLINE int
8649 char_encodable_p (c, attrs)
8650      int c;
8651      Lisp_Object attrs;
8652 {
8653   Lisp_Object tail;
8654   struct charset *charset;
8655   Lisp_Object translation_table;
8656
8657   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8658   if (! NILP (translation_table))
8659     c = translate_char (translation_table, c);
8660   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8661        CONSP (tail); tail = XCDR (tail))
8662     {
8663       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8664       if (CHAR_CHARSET_P (c, charset))
8665         break;
8666     }
8667   return (! NILP (tail));
8668 }
8669
8670
8671 /* Return a list of coding systems that safely encode the text between
8672    START and END.  If EXCLUDE is non-nil, it is a list of coding
8673    systems not to check.  The returned list doesn't contain any such
8674    coding systems.  In any case, if the text contains only ASCII or is
8675    unibyte, return t.

        START may also be a string; the whole string is then examined
        and END is not referenced.  The returned list always contains
        `raw-text' and `no-conversion'.  */
8676
8677 DEFUN ("find-coding-systems-region-internal",
8678        Ffind_coding_systems_region_internal,
8679        Sfind_coding_systems_region_internal, 2, 3, 0,
8680        doc: /* Internal use only.  */)
8681      (start, end, exclude)
8682      Lisp_Object start, end, exclude;
8683 {
8684   Lisp_Object coding_attrs_list, safe_codings;
8685   EMACS_INT start_byte, end_byte;
8686   const unsigned char *p, *pbeg, *pend;
8687   int c;
8688   Lisp_Object tail, elt, work_table;
8689
8690   if (STRINGP (start))
8691     {
           /* Unibyte, or as many bytes as characters (pure ASCII):
              nothing to check.  */
8692       if (!STRING_MULTIBYTE (start)
8693           || SCHARS (start) == SBYTES (start))
8694         return Qt;
8695       start_byte = 0;
8696       end_byte = SBYTES (start);
8697     }
8698   else
8699     {
8700       CHECK_NUMBER_COERCE_MARKER (start);
8701       CHECK_NUMBER_COERCE_MARKER (end);
8702       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8703         args_out_of_range (start, end);
8704       if (NILP (current_buffer->enable_multibyte_characters))
8705         return Qt;
8706       start_byte = CHAR_TO_BYTE (XINT (start));
8707       end_byte = CHAR_TO_BYTE (XINT (end));
           /* As many bytes as characters: the region is pure ASCII.  */
8708       if (XINT (end) - XINT (start) == end_byte - start_byte)
8709         return Qt;
8710
           /* If the gap lies strictly inside the region, move it to
              whichever end of the region is nearer.  */
8711       if (XINT (start) < GPT && XINT (end) > GPT)
8712         {
8713           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8714             move_gap_both (XINT (start), start_byte);
8715           else
8716             move_gap_both (XINT (end), end_byte);
8717         }
8718     }
8719
       /* Collect the attribute vectors of the candidates: every entry
          of Vcoding_system_list that is not in EXCLUDE, is its own
          base name, and is not of type `undecided'.  The translation
          table obtained for each one is stored into its attribute
          vector.  */
8720   coding_attrs_list = Qnil;
8721   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8722     if (NILP (exclude)
8723         || NILP (Fmemq (XCAR (tail), exclude)))
8724       {
8725         Lisp_Object attrs;
8726
8727         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8728         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8729             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8730           {
8731             ASET (attrs, coding_attr_trans_tbl,
8732                   get_translation_table (attrs, 1, NULL));
8733             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8734           }
8735       }
8736
8737   if (STRINGP (start))
8738     p = pbeg = SDATA (start);
8739   else
8740     p = pbeg = BYTE_POS_ADDR (start_byte);
8741   pend = p + (end_byte - start_byte);
8742
       /* Trim the ASCII bytes at both ends of the text.  */
8743   while (p < pend && ASCII_BYTE_P (*p)) p++;
8744   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8745
       /* WORK_TABLE records the characters that have already been
          tested, so each distinct character is checked once.  */
8746   work_table = Fmake_char_table (Qnil, Qnil);
8747   while (p < pend)
8748     {
8749       if (ASCII_BYTE_P (*p))
8750         p++;
8751       else
8752         {
8753           c = STRING_CHAR_ADVANCE (p);
8754           if (!NILP (char_table_ref (work_table, c)))
8755             /* This character was already checked.  Ignore it.  */
8756             continue;
8757
8758           charset_map_loaded = 0;
               /* Drop every candidate that cannot encode C.  A
                  candidate is removed in place by copying the next
                  cell's car and cdr into its cell (TAIL is not
                  advanced, so the copied element is tested next); the
                  last cell cannot be unlinked this way, so its car is
                  set to nil instead.  */
8759           for (tail = coding_attrs_list; CONSP (tail);)
8760             {
8761               elt = XCAR (tail);
8762               if (NILP (elt))
8763                 tail = XCDR (tail);
8764               else if (char_encodable_p (c, elt))
8765                 tail = XCDR (tail);
8766               else if (CONSP (XCDR (tail)))
8767                 {
8768                   XSETCAR (tail, XCAR (XCDR (tail)));
8769                   XSETCDR (tail, XCDR (XCDR (tail)));
8770                 }
8771               else
8772                 {
8773                   XSETCAR (tail, Qnil);
8774                   tail = XCDR (tail);
8775                 }
8776             }
               /* NOTE(review): charset_map_loaded was cleared above and
                  is not assigned in this function, so it is presumably
                  set when a charset map gets loaded during the checks;
                  the text address is then fetched again, presumably
                  because the text may have been relocated -- confirm.  */
8777           if (charset_map_loaded)
8778             {
8779               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8780
8781               if (STRINGP (start))
8782                 pbeg = SDATA (start);
8783               else
8784                 pbeg = BYTE_POS_ADDR (start_byte);
8785               p = pbeg + p_offset;
8786               pend = pbeg + pend_offset;
8787             }
8788           char_table_set (work_table, c, Qt);
8789         }
8790     }
8791
       /* Start from `raw-text' and `no-conversion' and push the base
          name of every candidate whose slot was not set to nil.  */
8792   safe_codings = list2 (Qraw_text, Qno_conversion);
8793   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8794     if (! NILP (XCAR (tail)))
8795       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8796
8797   return safe_codings;
8798 }
8799
8800
8801 DEFUN ("unencodable-char-position", Funencodable_char_position,
8802        Sunencodable_char_position, 3, 5, 0,
8803        doc: /*
8804 Return position of first un-encodable character in a region.
8805 START and END specify the region and CODING-SYSTEM specifies the
8806 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8807
8808 If optional 4th argument COUNT is non-nil, it specifies at most how
8809 many un-encodable characters to search.  In this case, the value is a
8810 list of positions.
8811
8812 If optional 5th argument STRING is non-nil, it is a string to search
8813 for un-encodable characters.  In that case, START and END are indexes
8814 to the string.  */)
8815      (start, end, coding_system, count, string)
8816      Lisp_Object start, end, coding_system, count, string;
8817 {
8818   int n;
8819   struct coding_system coding;
8820   Lisp_Object attrs, charset_list, translation_table;
8821   Lisp_Object positions;
8822   int from, to;
8823   const unsigned char *p, *stop, *pend;
8824   int ascii_compatible;
8825
8826   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8827   attrs = CODING_ID_ATTRS (coding.id);
8828   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8829     return Qnil;
8830   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8831   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8832   translation_table = get_translation_table (attrs, 1, NULL);
8833
8834   if (NILP (string))
8835     {
8836       validate_region (&start, &end);
8837       from = XINT (start);
8838       to = XINT (end);
8839       if (NILP (current_buffer->enable_multibyte_characters)
8840           || (ascii_compatible
8841               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8842         return Qnil;
8843       p = CHAR_POS_ADDR (from);
8844       pend = CHAR_POS_ADDR (to);
8845       if (from < GPT && to >= GPT)
8846         stop = GPT_ADDR;
8847       else
8848         stop = pend;
8849     }
8850   else
8851     {
8852       CHECK_STRING (string);
8853       CHECK_NATNUM (start);
8854       CHECK_NATNUM (end);
8855       from = XINT (start);
8856       to = XINT (end);
8857       if (from > to
8858           || to > SCHARS (string))
8859         args_out_of_range_3 (string, start, end);
8860       if (! STRING_MULTIBYTE (string))
8861         return Qnil;
8862       p = SDATA (string) + string_char_to_byte (string, from);
8863       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8864       if (ascii_compatible && (to - from) == (pend - p))
8865         return Qnil;
8866     }
8867
8868   if (NILP (count))
8869     n = 1;
8870   else
8871     {
8872       CHECK_NATNUM (count);
8873       n = XINT (count);
8874     }
8875
8876   positions = Qnil;
8877   while (1)
8878     {
8879       int c;
8880
8881       if (ascii_compatible)
8882         while (p < stop && ASCII_BYTE_P (*p))
8883           p++, from++;
8884       if (p >= stop)
8885         {
8886           if (p >= pend)
8887             break;
8888           stop = pend;
8889           p = GAP_END_ADDR;
8890         }
8891
8892       c = STRING_CHAR_ADVANCE (p);
8893       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8894           && ! char_charset (translate_char (translation_table, c),
8895                              charset_list, NULL))
8896         {
8897           positions = Fcons (make_number (from), positions);
8898           n--;
8899           if (n == 0)
8900             break;
8901         }
8902
8903       from++;
8904     }
8905
8906   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8907 }
8908
8909
8910 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8911        Scheck_coding_systems_region, 3, 3, 0,
8912        doc: /* Check if the region is encodable by coding systems.
8913
8914 START and END are buffer positions specifying the region.
8915 CODING-SYSTEM-LIST is a list of coding systems to check.
8916
8917 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8918 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8919 whole region, POS0, POS1, ... are buffer positions where non-encodable
8920 characters are found.
8921
8922 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8923 value is nil.
8924
8925 START may be a string.  In that case, check if the string is
8926 encodable, and the value contains indices to the string instead of
8927 buffer positions.  END is ignored.
8928
8929 If the current buffer (or START if it is a string) is unibyte, the value
8930 is nil.  */)
8931      (start, end, coding_system_list)
8932      Lisp_Object start, end, coding_system_list;
8933 {
8934   Lisp_Object list;
8935   EMACS_INT start_byte, end_byte;
8936   int pos;
8937   const unsigned char *p, *pbeg, *pend;
8938   int c;
8939   Lisp_Object tail, elt, attrs;
8940
8941   if (STRINGP (start))
8942     {
8943       if (!STRING_MULTIBYTE (start)
8944           || SCHARS (start) == SBYTES (start))
8945         return Qnil;
8946       start_byte = 0;
8947       end_byte = SBYTES (start);
8948       pos = 0;
8949     }
8950   else
8951     {
8952       CHECK_NUMBER_COERCE_MARKER (start);
8953       CHECK_NUMBER_COERCE_MARKER (end);
8954       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8955         args_out_of_range (start, end);
8956       if (NILP (current_buffer->enable_multibyte_characters))
8957         return Qnil;
8958       start_byte = CHAR_TO_BYTE (XINT (start));
8959       end_byte = CHAR_TO_BYTE (XINT (end));
8960       if (XINT (end) - XINT (start) == end_byte - start_byte)
8961         return Qnil;
8962
8963       if (XINT (start) < GPT && XINT (end) > GPT)
8964         {
8965           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8966             move_gap_both (XINT (start), start_byte);
8967           else
8968             move_gap_both (XINT (end), end_byte);
8969         }
8970       pos = XINT (start);
8971     }
8972
8973   list = Qnil;
8974   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8975     {
8976       elt = XCAR (tail);
8977       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8978       ASET (attrs, coding_attr_trans_tbl,
8979             get_translation_table (attrs, 1, NULL));
8980       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8981     }
8982
8983   if (STRINGP (start))
8984     p = pbeg = SDATA (start);
8985   else
8986     p = pbeg = BYTE_POS_ADDR (start_byte);
8987   pend = p + (end_byte - start_byte);
8988
8989   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8990   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8991
8992   while (p < pend)
8993     {
8994       if (ASCII_BYTE_P (*p))
8995         p++;
8996       else
8997         {
8998           c = STRING_CHAR_ADVANCE (p);
8999
9000           charset_map_loaded = 0;
9001           for (tail = list; CONSP (tail); tail = XCDR (tail))
9002             {
9003               elt = XCDR (XCAR (tail));
9004               if (! char_encodable_p (c, XCAR (elt)))
9005                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9006             }
9007           if (charset_map_loaded)
9008             {
9009               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
9010
9011               if (STRINGP (start))
9012                 pbeg = SDATA (start);
9013               else
9014                 pbeg = BYTE_POS_ADDR (start_byte);
9015               p = pbeg + p_offset;
9016               pend = pbeg + pend_offset;
9017             }
9018         }
9019       pos++;
9020     }
9021
9022   tail = list;
9023   list = Qnil;
9024   for (; CONSP (tail); tail = XCDR (tail))
9025     {
9026       elt = XCAR (tail);
9027       if (CONSP (XCDR (XCDR (elt))))
9028         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9029                       list);
9030     }
9031
9032   return list;
9033 }
9034
9035
9036 Lisp_Object
9037 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
9038      Lisp_Object start, end, coding_system, dst_object;
9039      int encodep, norecord;
9040 {
9041   struct coding_system coding;
9042   EMACS_INT from, from_byte, to, to_byte;
9043   Lisp_Object src_object;
9044
9045   CHECK_NUMBER_COERCE_MARKER (start);
9046   CHECK_NUMBER_COERCE_MARKER (end);
9047   if (NILP (coding_system))
9048     coding_system = Qno_conversion;
9049   else
9050     CHECK_CODING_SYSTEM (coding_system);
9051   src_object = Fcurrent_buffer ();
9052   if (NILP (dst_object))
9053     dst_object = src_object;
9054   else if (! EQ (dst_object, Qt))
9055     CHECK_BUFFER (dst_object);
9056
9057   validate_region (&start, &end);
9058   from = XFASTINT (start);
9059   from_byte = CHAR_TO_BYTE (from);
9060   to = XFASTINT (end);
9061   to_byte = CHAR_TO_BYTE (to);
9062
9063   setup_coding_system (coding_system, &coding);
9064   coding.mode |= CODING_MODE_LAST_BLOCK;
9065
9066   if (encodep)
9067     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9068                           dst_object);
9069   else
9070     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9071                           dst_object);
9072   if (! norecord)
9073     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9074
9075   return (BUFFERP (dst_object)
9076           ? make_number (coding.produced_char)
9077           : coding.dst_object);
9078 }
9079
9080
9081 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9082        3, 4, "r\nzCoding system: ",
9083        doc: /* Decode the current region from the specified coding system.
9084 When called from a program, takes four arguments:
9085         START, END, CODING-SYSTEM, and DESTINATION.
9086 START and END are buffer positions.
9087
9088 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9089 If nil, the region between START and END is replaced by the decoded text.
9090 If buffer, the decoded text is inserted in that buffer after point (point
9091 does not move).
9092 In those cases, the length of the decoded text is returned.
9093 If DESTINATION is t, the decoded text is returned.
9094
9095 This function sets `last-coding-system-used' to the precise coding system
9096 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9097 not fully specified.)  */)
9098      (start, end, coding_system, destination)
9099      Lisp_Object start, end, coding_system, destination;
9100 {
9101   return code_convert_region (start, end, coding_system, destination, 0, 0);
9102 }
9103
9104 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9105        3, 4, "r\nzCoding system: ",
9106        doc: /* Encode the current region by specified coding system.
9107 When called from a program, takes four arguments:
9108         START, END, CODING-SYSTEM and DESTINATION.
9109 START and END are buffer positions.
9110
9111 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9112 If nil, the region between START and END is replace by the encoded text.
9113 If buffer, the encoded text is inserted in that buffer after point (point
9114 does not move).
9115 In those cases, the length of the encoded text is returned.
9116 If DESTINATION is t, the encoded text is returned.
9117
9118 This function sets `last-coding-system-used' to the precise coding system
9119 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9120 not fully specified.)  */)
9121   (start, end, coding_system, destination)
9122      Lisp_Object start, end, coding_system, destination;
9123 {
9124   return code_convert_region (start, end, coding_system, destination, 1, 0);
9125 }
9126
9127 Lisp_Object
9128 code_convert_string (string, coding_system, dst_object,
9129                      encodep, nocopy, norecord)
9130      Lisp_Object string, coding_system, dst_object;
9131      int encodep, nocopy, norecord;
9132 {
9133   struct coding_system coding;
9134   EMACS_INT chars, bytes;
9135
9136   CHECK_STRING (string);
9137   if (NILP (coding_system))
9138     {
9139       if (! norecord)
9140         Vlast_coding_system_used = Qno_conversion;
9141       if (NILP (dst_object))
9142         return (nocopy ? Fcopy_sequence (string) : string);
9143     }
9144
9145   if (NILP (coding_system))
9146     coding_system = Qno_conversion;
9147   else
9148     CHECK_CODING_SYSTEM (coding_system);
9149   if (NILP (dst_object))
9150     dst_object = Qt;
9151   else if (! EQ (dst_object, Qt))
9152     CHECK_BUFFER (dst_object);
9153
9154   setup_coding_system (coding_system, &coding);
9155   coding.mode |= CODING_MODE_LAST_BLOCK;
9156   chars = SCHARS (string);
9157   bytes = SBYTES (string);
9158   if (encodep)
9159     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9160   else
9161     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9162   if (! norecord)
9163     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9164
9165   return (BUFFERP (dst_object)
9166           ? make_number (coding.produced_char)
9167           : coding.dst_object);
9168 }
9169
9170
9171 /* Encode or decode STRING according to CODING_SYSTEM.
9172    Do not set Vlast_coding_system_used.
9173
9174    This function is called only from macros DECODE_FILE and
9175    ENCODE_FILE, thus we ignore character composition.  */
9176
9177 Lisp_Object
9178 code_convert_string_norecord (string, coding_system, encodep)
9179      Lisp_Object string, coding_system;
9180      int encodep;
9181 {
9182   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9183 }
9184
9185
9186 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9187        2, 4, 0,
9188        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9189
9190 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9191 if the decoding operation is trivial.
9192
9193 Optional fourth arg BUFFER non-nil means that the decoded text is
9194 inserted in that buffer after point (point does not move).  In this
9195 case, the return value is the length of the decoded text.
9196
9197 This function sets `last-coding-system-used' to the precise coding system
9198 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9199 not fully specified.)  */)
9200   (string, coding_system, nocopy, buffer)
9201      Lisp_Object string, coding_system, nocopy, buffer;
9202 {
9203   return code_convert_string (string, coding_system, buffer,
9204                               0, ! NILP (nocopy), 0);
9205 }
9206
9207 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9208        2, 4, 0,
9209        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9210
9211 Optional third arg NOCOPY non-nil means it is OK to return STRING
9212 itself if the encoding operation is trivial.
9213
9214 Optional fourth arg BUFFER non-nil means that the encoded text is
9215 inserted in that buffer after point (point does not move).  In this
9216 case, the return value is the length of the encoded text.
9217
9218 This function sets `last-coding-system-used' to the precise coding system
9219 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9220 not fully specified.)  */)
9221      (string, coding_system, nocopy, buffer)
9222      Lisp_Object string, coding_system, nocopy, buffer;
9223 {
9224   return code_convert_string (string, coding_system, buffer,
9225                               1, ! NILP (nocopy), 1);
9226 }
9227
9228 \f
9229 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9230        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9231 Return the corresponding character.  */)
9232      (code)
9233      Lisp_Object code;
9234 {
9235   Lisp_Object spec, attrs, val;
9236   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9237   int c;
9238
9239   CHECK_NATNUM (code);
9240   c = XFASTINT (code);
9241   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9242   attrs = AREF (spec, 0);
9243
9244   if (ASCII_BYTE_P (c)
9245       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9246     return code;
9247
9248   val = CODING_ATTR_CHARSET_LIST (attrs);
9249   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9250   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9251   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9252
9253   if (c <= 0x7F)
9254     charset = charset_roman;
9255   else if (c >= 0xA0 && c < 0xDF)
9256     {
9257       charset = charset_kana;
9258       c -= 0x80;
9259     }
9260   else
9261     {
9262       int s1 = c >> 8, s2 = c & 0xFF;
9263
9264       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9265           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9266         error ("Invalid code: %d", code);
9267       SJIS_TO_JIS (c);
9268       charset = charset_kanji;
9269     }
9270   c = DECODE_CHAR (charset, c);
9271   if (c < 0)
9272     error ("Invalid code: %d", code);
9273   return make_number (c);
9274 }
9275
9276
9277 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9278        doc: /* Encode a Japanese character CH to shift_jis encoding.
9279 Return the corresponding code in SJIS.  */)
9280      (ch)
9281     Lisp_Object ch;
9282 {
9283   Lisp_Object spec, attrs, charset_list;
9284   int c;
9285   struct charset *charset;
9286   unsigned code;
9287
9288   CHECK_CHARACTER (ch);
9289   c = XFASTINT (ch);
9290   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9291   attrs = AREF (spec, 0);
9292
9293   if (ASCII_CHAR_P (c)
9294       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9295     return ch;
9296
9297   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9298   charset = char_charset (c, charset_list, &code);
9299   if (code == CHARSET_INVALID_CODE (charset))
9300     error ("Can't encode by shift_jis encoding: %d", c);
9301   JIS_TO_SJIS (code);
9302
9303   return make_number (code);
9304 }
9305
9306 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9307        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9308 Return the corresponding character.  */)
9309      (code)
9310      Lisp_Object code;
9311 {
9312   Lisp_Object spec, attrs, val;
9313   struct charset *charset_roman, *charset_big5, *charset;
9314   int c;
9315
9316   CHECK_NATNUM (code);
9317   c = XFASTINT (code);
9318   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9319   attrs = AREF (spec, 0);
9320
9321   if (ASCII_BYTE_P (c)
9322       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9323     return code;
9324
9325   val = CODING_ATTR_CHARSET_LIST (attrs);
9326   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9327   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9328
9329   if (c <= 0x7F)
9330     charset = charset_roman;
9331   else
9332     {
9333       int b1 = c >> 8, b2 = c & 0x7F;
9334       if (b1 < 0xA1 || b1 > 0xFE
9335           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9336         error ("Invalid code: %d", code);
9337       charset = charset_big5;
9338     }
9339   c = DECODE_CHAR (charset, (unsigned )c);
9340   if (c < 0)
9341     error ("Invalid code: %d", code);
9342   return make_number (c);
9343 }
9344
9345 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9346        doc: /* Encode the Big5 character CH to BIG5 coding system.
9347 Return the corresponding character code in Big5.  */)
9348      (ch)
9349      Lisp_Object ch;
9350 {
9351   Lisp_Object spec, attrs, charset_list;
9352   struct charset *charset;
9353   int c;
9354   unsigned code;
9355
9356   CHECK_CHARACTER (ch);
9357   c = XFASTINT (ch);
9358   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9359   attrs = AREF (spec, 0);
9360   if (ASCII_CHAR_P (c)
9361       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9362     return ch;
9363
9364   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9365   charset = char_charset (c, charset_list, &code);
9366   if (code == CHARSET_INVALID_CODE (charset))
9367     error ("Can't encode by Big5 encoding: %d", c);
9368
9369   return make_number (code);
9370 }
9371
9372 \f
9373 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9374        Sset_terminal_coding_system_internal, 1, 2, 0,
9375        doc: /* Internal use only.  */)
9376      (coding_system, terminal)
9377      Lisp_Object coding_system;
9378      Lisp_Object terminal;
9379 {
9380   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9381   CHECK_SYMBOL (coding_system);
9382   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9383   /* We had better not send unsafe characters to terminal.  */
9384   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9385   /* Characer composition should be disabled.  */
9386   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9387   terminal_coding->src_multibyte = 1;
9388   terminal_coding->dst_multibyte = 0;
9389   return Qnil;
9390 }
9391
9392 DEFUN ("set-safe-terminal-coding-system-internal",
9393        Fset_safe_terminal_coding_system_internal,
9394        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9395        doc: /* Internal use only.  */)
9396      (coding_system)
9397      Lisp_Object coding_system;
9398 {
9399   CHECK_SYMBOL (coding_system);
9400   setup_coding_system (Fcheck_coding_system (coding_system),
9401                        &safe_terminal_coding);
9402   /* Characer composition should be disabled.  */
9403   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9404   safe_terminal_coding.src_multibyte = 1;
9405   safe_terminal_coding.dst_multibyte = 0;
9406   return Qnil;
9407 }
9408
9409 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9410        Sterminal_coding_system, 0, 1, 0,
9411        doc: /* Return coding system specified for terminal output on the given terminal.
9412 TERMINAL may be a terminal object, a frame, or nil for the selected
9413 frame's terminal device.  */)
9414      (terminal)
9415      Lisp_Object terminal;
9416 {
9417   struct coding_system *terminal_coding
9418     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9419   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9420
9421   /* For backward compatibility, return nil if it is `undecided'. */
9422   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9423 }
9424
9425 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9426        Sset_keyboard_coding_system_internal, 1, 2, 0,
9427        doc: /* Internal use only.  */)
9428      (coding_system, terminal)
9429      Lisp_Object coding_system;
9430      Lisp_Object terminal;
9431 {
9432   struct terminal *t = get_terminal (terminal, 1);
9433   CHECK_SYMBOL (coding_system);
9434   if (NILP (coding_system))
9435     coding_system = Qno_conversion;
9436   else
9437     Fcheck_coding_system (coding_system);
9438   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9439   /* Characer composition should be disabled.  */
9440   TERMINAL_KEYBOARD_CODING (t)->common_flags
9441     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9442   return Qnil;
9443 }
9444
9445 DEFUN ("keyboard-coding-system",
9446        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9447        doc: /* Return coding system specified for decoding keyboard input.  */)
9448      (terminal)
9449      Lisp_Object terminal;
9450 {
9451   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9452                          (get_terminal (terminal, 1))->id);
9453 }
9454
9455 \f
9456 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9457        Sfind_operation_coding_system,  1, MANY, 0,
9458        doc: /* Choose a coding system for an operation based on the target name.
9459 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9460 DECODING-SYSTEM is the coding system to use for decoding
9461 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9462 for encoding (in case OPERATION does encoding).
9463
9464 The first argument OPERATION specifies an I/O primitive:
9465   For file I/O, `insert-file-contents' or `write-region'.
9466   For process I/O, `call-process', `call-process-region', or `start-process'.
9467   For network I/O, `open-network-stream'.
9468
9469 The remaining arguments should be the same arguments that were passed
9470 to the primitive.  Depending on which primitive, one of those arguments
9471 is selected as the TARGET.  For example, if OPERATION does file I/O,
9472 whichever argument specifies the file name is TARGET.
9473
9474 TARGET has a meaning which depends on OPERATION:
9475   For file I/O, TARGET is a file name (except for the special case below).
9476   For process I/O, TARGET is a process name.
9477   For network I/O, TARGET is a service name or a port number.
9478
9479 This function looks up what is specified for TARGET in
9480 `file-coding-system-alist', `process-coding-system-alist',
9481 or `network-coding-system-alist' depending on OPERATION.
9482 They may specify a coding system, a cons of coding systems,
9483 or a function symbol to call.
9484 In the last case, we call the function with one argument,
9485 which is a list of all the arguments given to this function.
9486 If the function can't decide a coding system, it can return
9487 `undecided' so that the normal code-detection is performed.
9488
9489 If OPERATION is `insert-file-contents', the argument corresponding to
9490 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9491 file name to look up, and BUFFER is a buffer that contains the file's
9492 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9493 function to call for FILENAME, that function should examine the
9494 contents of BUFFER instead of reading the file.
9495
9496 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9497      (nargs, args)
9498      int nargs;
9499      Lisp_Object *args;
9500 {
9501   Lisp_Object operation, target_idx, target, val;
9502   register Lisp_Object chain;
9503
9504   if (nargs < 2)
9505     error ("Too few arguments");
9506   operation = args[0];
9507   if (!SYMBOLP (operation)
9508       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9509     error ("Invalid first argument");
9510   if (nargs < 1 + XINT (target_idx))
9511     error ("Too few arguments for operation: %s",
9512            SDATA (SYMBOL_NAME (operation)));
9513   target = args[XINT (target_idx) + 1];
9514   if (!(STRINGP (target)
9515         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9516             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9517         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9518     error ("Invalid %dth argument", XINT (target_idx) + 1);
9519   if (CONSP (target))
9520     target = XCAR (target);
9521
9522   chain = ((EQ (operation, Qinsert_file_contents)
9523             || EQ (operation, Qwrite_region))
9524            ? Vfile_coding_system_alist
9525            : (EQ (operation, Qopen_network_stream)
9526               ? Vnetwork_coding_system_alist
9527               : Vprocess_coding_system_alist));
9528   if (NILP (chain))
9529     return Qnil;
9530
9531   for (; CONSP (chain); chain = XCDR (chain))
9532     {
9533       Lisp_Object elt;
9534
9535       elt = XCAR (chain);
9536       if (CONSP (elt)
9537           && ((STRINGP (target)
9538                && STRINGP (XCAR (elt))
9539                && fast_string_match (XCAR (elt), target) >= 0)
9540               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9541         {
9542           val = XCDR (elt);
9543           /* Here, if VAL is both a valid coding system and a valid
9544              function symbol, we return VAL as a coding system.  */
9545           if (CONSP (val))
9546             return val;
9547           if (! SYMBOLP (val))
9548             return Qnil;
9549           if (! NILP (Fcoding_system_p (val)))
9550             return Fcons (val, val);
9551           if (! NILP (Ffboundp (val)))
9552             {
9553               /* We use call1 rather than safe_call1
9554                  so as to get bug reports about functions called here
9555                  which don't handle the current interface.  */
9556               val = call1 (val, Flist (nargs, args));
9557               if (CONSP (val))
9558                 return val;
9559               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9560                 return Fcons (val, val);
9561             }
9562           return Qnil;
9563         }
9564     }
9565   return Qnil;
9566 }
9567
9568 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9569        Sset_coding_system_priority, 0, MANY, 0,
9570        doc: /* Assign higher priority to the coding systems given as arguments.
9571 If multiple coding systems belong to the same category,
9572 all but the first one are ignored.
9573
9574 usage: (set-coding-system-priority &rest coding-systems)  */)
9575      (nargs, args)
9576      int nargs;
9577      Lisp_Object *args;
9578 {
9579   int i, j;
9580   int changed[coding_category_max];
9581   enum coding_category priorities[coding_category_max];
9582
9583   bzero (changed, sizeof changed);
9584
9585   for (i = j = 0; i < nargs; i++)
9586     {
9587       enum coding_category category;
9588       Lisp_Object spec, attrs;
9589
9590       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9591       attrs = AREF (spec, 0);
9592       category = XINT (CODING_ATTR_CATEGORY (attrs));
9593       if (changed[category])
9594         /* Ignore this coding system because a coding system of the
9595            same category already had a higher priority.  */
9596         continue;
9597       changed[category] = 1;
9598       priorities[j++] = category;
9599       if (coding_categories[category].id >= 0
9600           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9601         setup_coding_system (args[i], &coding_categories[category]);
9602       Fset (AREF (Vcoding_category_table, category), args[i]);
9603     }
9604
9605   /* Now we have decided top J priorities.  Reflect the order of the
9606      original priorities to the remaining priorities.  */
9607
9608   for (i = j, j = 0; i < coding_category_max; i++, j++)
9609     {
9610       while (j < coding_category_max
9611              && changed[coding_priorities[j]])
9612         j++;
9613       if (j == coding_category_max)
9614         abort ();
9615       priorities[i] = coding_priorities[j];
9616     }
9617
9618   bcopy (priorities, coding_priorities, sizeof priorities);
9619
9620   /* Update `coding-category-list'.  */
9621   Vcoding_category_list = Qnil;
9622   for (i = coding_category_max - 1; i >= 0; i--)
9623     Vcoding_category_list
9624       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9625                Vcoding_category_list);
9626
9627   return Qnil;
9628 }
9629
9630 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9631        Scoding_system_priority_list, 0, 1, 0,
9632        doc: /* Return a list of coding systems ordered by their priorities.
9633 The list contains a subset of coding systems; i.e. coding systems
9634 assigned to each coding category (see `coding-category-list').
9635
9636 HIGHESTP non-nil means just return the highest priority one.  */)
9637      (highestp)
9638      Lisp_Object highestp;
9639 {
9640   int i;
9641   Lisp_Object val;
9642
9643   for (i = 0, val = Qnil; i < coding_category_max; i++)
9644     {
9645       enum coding_category category = coding_priorities[i];
9646       int id = coding_categories[category].id;
9647       Lisp_Object attrs;
9648
9649       if (id < 0)
9650         continue;
9651       attrs = CODING_ID_ATTRS (id);
9652       if (! NILP (highestp))
9653         return CODING_ATTR_BASE_NAME (attrs);
9654       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9655     }
9656   return Fnreverse (val);
9657 }
9658
9659 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9660
9661 static Lisp_Object
9662 make_subsidiaries (base)
9663      Lisp_Object base;
9664 {
9665   Lisp_Object subsidiaries;
9666   int base_name_len = SBYTES (SYMBOL_NAME (base));
9667   char *buf = (char *) alloca (base_name_len + 6);
9668   int i;
9669
9670   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9671   subsidiaries = Fmake_vector (make_number (3), Qnil);
9672   for (i = 0; i < 3; i++)
9673     {
9674       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9675       ASET (subsidiaries, i, intern (buf));
9676     }
9677   return subsidiaries;
9678 }
9679
9680
/* Define a coding system from the positional arguments in ARGS, which
   are indexed by the coding_arg_* constants.

   The function validates the common arguments, fills a fresh
   attribute vector ATTRS, then validates the arguments specific to
   the coding type and picks the coding category.  Finally it
   registers the coding system (and, when the eol-type argument is
   nil, the three subsidiaries produced by make_subsidiaries) in
   Vcoding_system_hash_table, Vcoding_system_list and
   Vcoding_system_alist.

   Signals wrong-number-of-arguments when NARGS is smaller than the
   coding type requires, and other errors when an argument is
   invalid.  Returns nil on success.  */
DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
       Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
       doc: /* For internal use only.
usage: (define-coding-system-internal ...)  */)
     (nargs, args)
     int nargs;
     Lisp_Object *args;
{
  Lisp_Object name;
  Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
  Lisp_Object attrs;            /* Vector of attributes.  */
  Lisp_Object eol_type;
  Lisp_Object aliases;
  Lisp_Object coding_type, charset_list, safe_charsets;
  enum coding_category category;
  Lisp_Object tail, val;
  int max_charset_id = 0;
  int i;

  /* The arguments common to every coding type must all be present.  */
  if (nargs < coding_arg_max)
    goto short_args;

  attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);

  name = args[coding_arg_name];
  CHECK_SYMBOL (name);
  CODING_ATTR_BASE_NAME (attrs) = name;

  /* The mnemonic may be a string or a character.  */
  val = args[coding_arg_mnemonic];
  if (! STRINGP (val))
    CHECK_CHARACTER (val);
  CODING_ATTR_MNEMONIC (attrs) = val;

  coding_type = args[coding_arg_coding_type];
  CHECK_SYMBOL (coding_type);
  CODING_ATTR_TYPE (attrs) = coding_type;

  /* Normalize the charset list into a list of charset IDs and compute
     the largest ID in it.  */
  charset_list = args[coding_arg_charset_list];
  if (SYMBOLP (charset_list))
    {
      /* The symbols `iso-2022' and `emacs-mule' stand for the
         predefined lists, and are accepted only for the coding type
         of the same name.  Those lists are used as they are.  */
      if (EQ (charset_list, Qiso_2022))
        {
          if (! EQ (coding_type, Qiso_2022))
            error ("Invalid charset-list");
          charset_list = Viso_2022_charset_list;
        }
      else if (EQ (charset_list, Qemacs_mule))
        {
          if (! EQ (coding_type, Qemacs_mule))
            error ("Invalid charset-list");
          charset_list = Vemacs_mule_charset_list;
        }
      for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
        if (max_charset_id < XFASTINT (XCAR (tail)))
          max_charset_id = XFASTINT (XCAR (tail));
    }
  else
    {
      /* Work on a copy so that replacing each charset by its ID does
         not modify the list passed by the caller.  */
      charset_list = Fcopy_sequence (charset_list);
      for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
        {
          struct charset *charset;

          val = XCAR (tail);
          CHECK_CHARSET_GET_CHARSET (val, charset);
          /* For iso-2022 the charset needs a non-negative ISO final;
             for emacs-mule it needs a non-negative emacs-mule ID.
             Other coding types accept any charset here.  */
          if (EQ (coding_type, Qiso_2022)
              ? CHARSET_ISO_FINAL (charset) < 0
              : EQ (coding_type, Qemacs_mule)
              ? CHARSET_EMACS_MULE_ID (charset) < 0
              : 0)
            error ("Can't handle charset `%s'",
                   SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

          XSETCAR (tail, make_number (charset->id));
          if (max_charset_id < charset->id)
            max_charset_id = charset->id;
        }
    }
  CODING_ATTR_CHARSET_LIST (attrs) = charset_list;

  /* SAFE_CHARSETS is a string with one byte per charset ID up to
     MAX_CHARSET_ID: 0 for the charsets in CHARSET_LIST, 255 for all
     the others.  */
  safe_charsets = make_uninit_string (max_charset_id + 1);
  memset (SDATA (safe_charsets), 255, max_charset_id + 1);
  for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
    SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
  CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;

  /* Initial value only; several coding types below override it.  */
  CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];

  /* A translation table is a char-table, a cons, or a symbol.  */
  val = args[coding_arg_decode_translation_table];
  if (! CHAR_TABLE_P (val) && ! CONSP (val))
    CHECK_SYMBOL (val);
  CODING_ATTR_DECODE_TBL (attrs) = val;

  val = args[coding_arg_encode_translation_table];
  if (! CHAR_TABLE_P (val) && ! CONSP (val))
    CHECK_SYMBOL (val);
  CODING_ATTR_ENCODE_TBL (attrs) = val;

  val = args[coding_arg_post_read_conversion];
  CHECK_SYMBOL (val);
  CODING_ATTR_POST_READ (attrs) = val;

  val = args[coding_arg_pre_write_conversion];
  CHECK_SYMBOL (val);
  CODING_ATTR_PRE_WRITE (attrs) = val;

  /* A nil default character means the space character.  */
  val = args[coding_arg_default_char];
  if (NILP (val))
    CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
  else
    {
      CHECK_CHARACTER (val);
      CODING_ATTR_DEFAULT_CHAR (attrs) = val;
    }

  /* Stored as a strict boolean: nil or t.  */
  val = args[coding_arg_for_unibyte];
  CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;

  val = args[coding_arg_plist];
  CHECK_LIST (val);
  CODING_ATTR_PLIST (attrs) = val;

  /* From here on, handle the arguments specific to each coding type
     and decide CATEGORY.  */
  if (EQ (coding_type, Qcharset))
    {
      /* Generate a lisp vector of 256 elements.  Each element is nil,
         integer, or a list of charset IDs.

         If Nth element is nil, the byte code N is invalid in this
         coding system.

         If Nth element is a number NUM, N is the first byte of a
         charset whose ID is NUM.

         If Nth element is a list of charset IDs, N is the first byte
         of one of them.  The list is sorted by dimensions of the
         charsets.  A charset of smaller dimension comes first. */
      val = Fmake_vector (make_number (256), Qnil);

      for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
        {
          struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
          int dim = CHARSET_DIMENSION (charset);
          /* Index into code_space of the range used for the first
             byte of a DIM-dimensional charset.  */
          int idx = (dim - 1) * 4;

          if (CHARSET_ASCII_COMPATIBLE_P (charset))
            CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

          for (i = charset->code_space[idx];
               i <= charset->code_space[idx + 1]; i++)
            {
              Lisp_Object tmp, tmp2;
              int dim2;

              tmp = AREF (val, i);
              if (NILP (tmp))
                /* First charset claiming this byte.  */
                tmp = XCAR (tail);
              else if (NUMBERP (tmp))
                {
                  /* Second charset claiming this byte: make a
                     two-element list, smaller dimension first.  */
                  dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
                  if (dim < dim2)
                    tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
                  else
                    tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
                }
              else
                {
                  /* Find the first element whose dimension is larger
                     than DIM.  */
                  for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
                    {
                      dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
                      if (dim < dim2)
                        break;
                    }
                  if (NILP (tmp2))
                    tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
                  else
                    {
                      /* Insert before TMP2 in place: move TMP2's
                         element into a new following cell, then
                         overwrite TMP2's car with the new ID.  */
                      XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
                      XSETCAR (tmp2, XCAR (tail));
                    }
                }
              ASET (val, i, tmp);
            }
        }
      ASET (attrs, coding_attr_charset_valids, val);
      category = coding_category_charset;
    }
  else if (EQ (coding_type, Qccl))
    {
      Lisp_Object valids;

      if (nargs < coding_arg_ccl_max)
        goto short_args;

      /* A CCL program given as a vector is copied before being
         stored in ATTRS.  */
      val = args[coding_arg_ccl_decoder];
      CHECK_CCL_PROGRAM (val);
      if (VECTORP (val))
        val = Fcopy_sequence (val);
      ASET (attrs, coding_attr_ccl_decoder, val);

      val = args[coding_arg_ccl_encoder];
      CHECK_CCL_PROGRAM (val);
      if (VECTORP (val))
        val = Fcopy_sequence (val);
      ASET (attrs, coding_attr_ccl_encoder, val);

      /* VALIDS is a 256-byte string; byte N is 1 if code N is listed
         as valid, else 0.  Each element of the argument is either a
         single code or a cons (FROM . TO) giving an inclusive
         range.  */
      val = args[coding_arg_ccl_valids];
      valids = Fmake_string (make_number (256), make_number (0));
      for (tail = val; !NILP (tail); tail = Fcdr (tail))
        {
          int from, to;

          val = Fcar (tail);
          if (INTEGERP (val))
            {
              from = to = XINT (val);
              if (from < 0 || from > 255)
                args_out_of_range_3 (val, make_number (0), make_number (255));
            }
          else
            {
              CHECK_CONS (val);
              CHECK_NATNUM_CAR (val);
              CHECK_NATNUM_CDR (val);
              from = XINT (XCAR (val));
              if (from > 255)
                args_out_of_range_3 (XCAR (val),
                                     make_number (0), make_number (255));
              to = XINT (XCDR (val));
              if (to < from || to > 255)
                args_out_of_range_3 (XCDR (val),
                                     XCAR (val), make_number (255));
            }
          for (i = from; i <= to; i++)
            SSET (valids, i, 1);
        }
      ASET (attrs, coding_attr_ccl_valids, valids);

      category = coding_category_ccl;
    }
  else if (EQ (coding_type, Qutf_16))
    {
      Lisp_Object bom, endian;

      CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;

      if (nargs < coding_arg_utf16_max)
        goto short_args;

      /* BOM is nil, t, or a cons of two coding systems.  */
      bom = args[coding_arg_utf16_bom];
      if (! NILP (bom) && ! EQ (bom, Qt))
        {
          CHECK_CONS (bom);
          val = XCAR (bom);
          CHECK_CODING_SYSTEM (val);
          val = XCDR (bom);
          CHECK_CODING_SYSTEM (val);
        }
      ASET (attrs, coding_attr_utf_bom, bom);

      /* A nil endian defaults to `big'.  */
      endian = args[coding_arg_utf16_endian];
      CHECK_SYMBOL (endian);
      if (NILP (endian))
        endian = Qbig;
      else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
        error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
      ASET (attrs, coding_attr_utf_16_endian, endian);

      /* A cons BOM selects the auto category; otherwise the category
         is chosen from the endian and from whether BOM is nil.  */
      category = (CONSP (bom)
                  ? coding_category_utf_16_auto
                  : NILP (bom)
                  ? (EQ (endian, Qbig)
                     ? coding_category_utf_16_be_nosig
                     : coding_category_utf_16_le_nosig)
                  : (EQ (endian, Qbig)
                     ? coding_category_utf_16_be
                     : coding_category_utf_16_le));
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      Lisp_Object initial, reg_usage, request, flags;
      /* Note: this I shadows the function-level I inside this
         branch.  */
      int i;

      if (nargs < coding_arg_iso2022_max)
        goto short_args;

      /* INITIAL is a copy of the argument vector in which each of the
         first four elements is replaced by a charset ID, or by -1
         where the element was nil.  */
      initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
      CHECK_VECTOR (initial);
      for (i = 0; i < 4; i++)
        {
          val = Faref (initial, make_number (i));
          if (! NILP (val))
            {
              struct charset *charset;

              CHECK_CHARSET_GET_CHARSET (val, charset);
              ASET (initial, i, make_number (CHARSET_ID (charset)));
              if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
                CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
            }
          else
            ASET (initial, i, make_number (-1));
        }

      reg_usage = args[coding_arg_iso2022_reg_usage];
      CHECK_CONS (reg_usage);
      CHECK_NUMBER_CAR (reg_usage);
      CHECK_NUMBER_CDR (reg_usage);

      /* Each element of REQUEST is (CHARSET . REG) with REG in 0..3;
         the car of each element is replaced by the charset ID.  */
      request = Fcopy_sequence (args[coding_arg_iso2022_request]);
      for (tail = request; ! NILP (tail); tail = Fcdr (tail))
        {
          int id;
          Lisp_Object tmp;

          val = Fcar (tail);
          CHECK_CONS (val);
          tmp = XCAR (val);
          CHECK_CHARSET_GET_ID (tmp, id);
          CHECK_NATNUM_CDR (val);
          if (XINT (XCDR (val)) >= 4)
            error ("Invalid graphic register number: %d", XINT (XCDR (val)));
          XSETCAR (val, make_number (id));
        }

      /* I keeps the flags exactly as given.  The FLAGS value stored
         in ATTRS additionally has CODING_ISO_FLAG_FULL_SUPPORT set
         when the charset-list argument was the symbol `iso-2022'.  */
      flags = args[coding_arg_iso2022_flags];
      CHECK_NATNUM (flags);
      i = XINT (flags);
      if (EQ (args[coding_arg_charset_list], Qiso_2022))
        flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);

      ASET (attrs, coding_attr_iso_initial, initial);
      ASET (attrs, coding_attr_iso_usage, reg_usage);
      ASET (attrs, coding_attr_iso_request, request);
      ASET (attrs, coding_attr_iso_flags, flags);
      setup_iso_safe_charsets (attrs);

      /* Choose the category from the flags, from whether the
         charset-list argument was `iso-2022', and (for 8-bit codes)
         from the charset initially designated to register 1.  */
      if (i & CODING_ISO_FLAG_SEVEN_BITS)
        category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
                          | CODING_ISO_FLAG_SINGLE_SHIFT))
                    ? coding_category_iso_7_else
                    : EQ (args[coding_arg_charset_list], Qiso_2022)
                    ? coding_category_iso_7
                    : coding_category_iso_7_tight);
      else
        {
          int id = XINT (AREF (initial, 1));

          category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
                       || EQ (args[coding_arg_charset_list], Qiso_2022)
                       || id < 0)
                      ? coding_category_iso_8_else
                      : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
                      ? coding_category_iso_8_1
                      : coding_category_iso_8_2);
        }
      /* Only the iso-8-1 and iso-8-2 categories keep whatever
         ASCII-compatibility was determined above.  */
      if (category != coding_category_iso_8_1
          && category != coding_category_iso_8_2)
        CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      if (EQ (args[coding_arg_charset_list], Qemacs_mule))
        ASET (attrs, coding_attr_emacs_mule_full, Qt);
      CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
      category = coding_category_emacs_mule;
    }
  else if (EQ (coding_type, Qshift_jis))
    {

      struct charset *charset;

      /* The charset list must hold charsets of dimension 1, 1, 2 and
         optionally one more of dimension 2, in that order.
         CHARSET_LIST is advanced locally while checking; the full
         list was already stored in ATTRS above.  */
      if (XINT (Flength (charset_list)) != 3
          && XINT (Flength (charset_list)) != 4)
        error ("There should be three or four charsets");

      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
      if (CHARSET_ASCII_COMPATIBLE_P (charset))
        CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 2)
        error ("Dimension of charset %s is not two",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      charset_list = XCDR (charset_list);
      if (! NILP (charset_list))
        {
          charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
          if (CHARSET_DIMENSION (charset) != 2)
            error ("Dimension of charset %s is not two",
                   SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
        }

      category = coding_category_sjis;
      Vsjis_coding_system = name;
    }
  else if (EQ (coding_type, Qbig5))
    {
      struct charset *charset;

      /* The charset list must hold exactly two charsets, of dimension
         1 and 2 in that order.  */
      if (XINT (Flength (charset_list)) != 2)
        error ("There should be just two charsets");

      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
      if (CHARSET_ASCII_COMPATIBLE_P (charset))
        CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 2)
        error ("Dimension of charset %s is not two",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      category = coding_category_big5;
      Vbig5_coding_system = name;
    }
  else if (EQ (coding_type, Qraw_text))
    {
      category = coding_category_raw_text;
      CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
    }
  else if (EQ (coding_type, Qutf_8))
    {
      Lisp_Object bom;

      CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

      if (nargs < coding_arg_utf8_max)
        goto short_args;

      /* BOM is nil, t, or a cons of two coding systems.  */
      bom = args[coding_arg_utf8_bom];
      if (! NILP (bom) && ! EQ (bom, Qt))
        {
          CHECK_CONS (bom);
          val = XCAR (bom);
          CHECK_CODING_SYSTEM (val);
          val = XCDR (bom);
          CHECK_CODING_SYSTEM (val);
        }
      ASET (attrs, coding_attr_utf_bom, bom);

      category = (CONSP (bom) ? coding_category_utf_8_auto
                  : NILP (bom) ? coding_category_utf_8_nosig
                  : coding_category_utf_8_sig);
    }
  else if (EQ (coding_type, Qundecided))
    category = coding_category_undecided;
  else
    error ("Invalid coding system type: %s",
           SDATA (SYMBOL_NAME (coding_type)));

  /* Record the category in ATTRS, and push the category symbol and
     the final ASCII-compatibility value onto the front of the
     property list.  */
  CODING_ATTR_CATEGORY (attrs) = make_number (category);
  CODING_ATTR_PLIST (attrs)
    = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
                                CODING_ATTR_PLIST (attrs)));
  CODING_ATTR_PLIST (attrs)
    = Fcons (QCascii_compatible_p,
             Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
                    CODING_ATTR_PLIST (attrs)));

  /* The eol-type argument must be nil, `unix', `dos', or `mac'.  */
  eol_type = args[coding_arg_eol_type];
  if (! NILP (eol_type)
      && ! EQ (eol_type, Qunix)
      && ! EQ (eol_type, Qdos)
      && ! EQ (eol_type, Qmac))
    error ("Invalid eol-type");

  aliases = Fcons (name, Qnil);

  if (NILP (eol_type))
    {
      /* With a nil eol-type, EOL_TYPE becomes the vector of the three
         subsidiary names, and each subsidiary is registered with a
         spec that shares ATTRS and has a fixed eol-type.  */
      eol_type = make_subsidiaries (name);
      for (i = 0; i < 3; i++)
        {
          Lisp_Object this_spec, this_name, this_aliases, this_eol_type;

          this_name = AREF (eol_type, i);
          this_aliases = Fcons (this_name, Qnil);
          this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
          /* All three slots start as ATTRS; slots 1 and 2 are then
             overwritten.  */
          this_spec = Fmake_vector (make_number (3), attrs);
          ASET (this_spec, 1, this_aliases);
          ASET (this_spec, 2, this_eol_type);
          Fputhash (this_name, this_spec, Vcoding_system_hash_table);
          Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
          /* Add the name to the alist only if it is not there yet.  */
          val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
          if (NILP (val))
            Vcoding_system_alist
              = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
                       Vcoding_system_alist);
        }
    }

  /* Register the coding system itself in the same way.  */
  spec_vec = Fmake_vector (make_number (3), attrs);
  ASET (spec_vec, 1, aliases);
  ASET (spec_vec, 2, eol_type);

  Fputhash (name, spec_vec, Vcoding_system_hash_table);
  Vcoding_system_list = Fcons (name, Vcoding_system_list);
  val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
  if (NILP (val))
    Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
                                  Vcoding_system_alist);

  {
    int id = coding_categories[category].id;

    /* Set up the per-category coding system from NAME if the category
       has none yet (negative id), or if the one it has carries this
       same name.  */
    if (id < 0 || EQ (name, CODING_ID_NAME (id)))
      setup_coding_system (name, &coding_categories[category]);
  }

  return Qnil;

 short_args:
  /* Reached when NARGS is too small for the coding type.  */
  return Fsignal (Qwrong_number_of_arguments,
                  Fcons (intern ("define-coding-system-internal"),
                         make_number (nargs)));
}
10211
10212
10213 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10214        3, 3, 0,
10215        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10216   (coding_system, prop, val)
10217      Lisp_Object coding_system, prop, val;
10218 {
10219   Lisp_Object spec, attrs;
10220
10221   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10222   attrs = AREF (spec, 0);
10223   if (EQ (prop, QCmnemonic))
10224     {
10225       if (! STRINGP (val))
10226         CHECK_CHARACTER (val);
10227       CODING_ATTR_MNEMONIC (attrs) = val;
10228     }
10229   else if (EQ (prop, QCdefault_char))
10230     {
10231       if (NILP (val))
10232         val = make_number (' ');
10233       else
10234         CHECK_CHARACTER (val);
10235       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10236     }
10237   else if (EQ (prop, QCdecode_translation_table))
10238     {
10239       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10240         CHECK_SYMBOL (val);
10241       CODING_ATTR_DECODE_TBL (attrs) = val;
10242     }
10243   else if (EQ (prop, QCencode_translation_table))
10244     {
10245       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10246         CHECK_SYMBOL (val);
10247       CODING_ATTR_ENCODE_TBL (attrs) = val;
10248     }
10249   else if (EQ (prop, QCpost_read_conversion))
10250     {
10251       CHECK_SYMBOL (val);
10252       CODING_ATTR_POST_READ (attrs) = val;
10253     }
10254   else if (EQ (prop, QCpre_write_conversion))
10255     {
10256       CHECK_SYMBOL (val);
10257       CODING_ATTR_PRE_WRITE (attrs) = val;
10258     }
10259   else if (EQ (prop, QCascii_compatible_p))
10260     {
10261       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10262     }
10263
10264   CODING_ATTR_PLIST (attrs)
10265     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10266   return val;
10267 }
10268
10269
10270 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10271        Sdefine_coding_system_alias, 2, 2, 0,
10272        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10273      (alias, coding_system)
10274      Lisp_Object alias, coding_system;
10275 {
10276   Lisp_Object spec, aliases, eol_type, val;
10277
10278   CHECK_SYMBOL (alias);
10279   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10280   aliases = AREF (spec, 1);
10281   /* ALIASES should be a list of length more than zero, and the first
10282      element is a base coding system.  Append ALIAS at the tail of the
10283      list.  */
10284   while (!NILP (XCDR (aliases)))
10285     aliases = XCDR (aliases);
10286   XSETCDR (aliases, Fcons (alias, Qnil));
10287
10288   eol_type = AREF (spec, 2);
10289   if (VECTORP (eol_type))
10290     {
10291       Lisp_Object subsidiaries;
10292       int i;
10293
10294       subsidiaries = make_subsidiaries (alias);
10295       for (i = 0; i < 3; i++)
10296         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10297                                      AREF (eol_type, i));
10298     }
10299
10300   Fputhash (alias, spec, Vcoding_system_hash_table);
10301   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10302   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10303   if (NILP (val))
10304     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10305                                   Vcoding_system_alist);
10306
10307   return Qnil;
10308 }
10309
10310 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10311        1, 1, 0,
10312        doc: /* Return the base of CODING-SYSTEM.
10313 Any alias or subsidiary coding system is not a base coding system.  */)
10314   (coding_system)
10315      Lisp_Object coding_system;
10316 {
10317   Lisp_Object spec, attrs;
10318
10319   if (NILP (coding_system))
10320     return (Qno_conversion);
10321   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10322   attrs = AREF (spec, 0);
10323   return CODING_ATTR_BASE_NAME (attrs);
10324 }
10325
10326 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10327        1, 1, 0,
10328        doc: "Return the property list of CODING-SYSTEM.")
10329      (coding_system)
10330      Lisp_Object coding_system;
10331 {
10332   Lisp_Object spec, attrs;
10333
10334   if (NILP (coding_system))
10335     coding_system = Qno_conversion;
10336   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10337   attrs = AREF (spec, 0);
10338   return CODING_ATTR_PLIST (attrs);
10339 }
10340
10341
10342 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10343        1, 1, 0,
10344        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10345      (coding_system)
10346      Lisp_Object coding_system;
10347 {
10348   Lisp_Object spec;
10349
10350   if (NILP (coding_system))
10351     coding_system = Qno_conversion;
10352   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10353   return AREF (spec, 1);
10354 }
10355
10356 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10357        Scoding_system_eol_type, 1, 1, 0,
10358        doc: /* Return eol-type of CODING-SYSTEM.
10359 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10360
10361 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10362 and CR respectively.
10363
10364 A vector value indicates that a format of end-of-line should be
10365 detected automatically.  Nth element of the vector is the subsidiary
10366 coding system whose eol-type is N.  */)
10367      (coding_system)
10368      Lisp_Object coding_system;
10369 {
10370   Lisp_Object spec, eol_type;
10371   int n;
10372
10373   if (NILP (coding_system))
10374     coding_system = Qno_conversion;
10375   if (! CODING_SYSTEM_P (coding_system))
10376     return Qnil;
10377   spec = CODING_SYSTEM_SPEC (coding_system);
10378   eol_type = AREF (spec, 2);
10379   if (VECTORP (eol_type))
10380     return Fcopy_sequence (eol_type);
10381   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10382   return make_number (n);
10383 }
10384
10385 #endif /* emacs */
10386
10387 \f
/*** 12. Postamble ***/
10389
10390 void
10391 init_coding_once ()
10392 {
10393   int i;
10394
10395   for (i = 0; i < coding_category_max; i++)
10396     {
10397       coding_categories[i].id = -1;
10398       coding_priorities[i] = i;
10399     }
10400
10401   /* ISO2022 specific initialize routine.  */
10402   for (i = 0; i < 0x20; i++)
10403     iso_code_class[i] = ISO_control_0;
10404   for (i = 0x21; i < 0x7F; i++)
10405     iso_code_class[i] = ISO_graphic_plane_0;
10406   for (i = 0x80; i < 0xA0; i++)
10407     iso_code_class[i] = ISO_control_1;
10408   for (i = 0xA1; i < 0xFF; i++)
10409     iso_code_class[i] = ISO_graphic_plane_1;
10410   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10411   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10412   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10413   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10414   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10415   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10416   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10417   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10418   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10419
10420   for (i = 0; i < 256; i++)
10421     {
10422       emacs_mule_bytes[i] = 1;
10423     }
10424   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10425   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10426   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10427   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10428 }
10429
10430 #ifdef emacs
10431
10432 void
10433 syms_of_coding ()
10434 {
10435   staticpro (&Vcoding_system_hash_table);
10436   {
10437     Lisp_Object args[2];
10438     args[0] = QCtest;
10439     args[1] = Qeq;
10440     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10441   }
10442
10443   staticpro (&Vsjis_coding_system);
10444   Vsjis_coding_system = Qnil;
10445
10446   staticpro (&Vbig5_coding_system);
10447   Vbig5_coding_system = Qnil;
10448
10449   staticpro (&Vcode_conversion_reused_workbuf);
10450   Vcode_conversion_reused_workbuf = Qnil;
10451
10452   staticpro (&Vcode_conversion_workbuf_name);
10453   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10454
10455   reused_workbuf_in_use = 0;
10456
10457   DEFSYM (Qcharset, "charset");
10458   DEFSYM (Qtarget_idx, "target-idx");
10459   DEFSYM (Qcoding_system_history, "coding-system-history");
10460   Fset (Qcoding_system_history, Qnil);
10461
10462   /* Target FILENAME is the first argument.  */
10463   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10464   /* Target FILENAME is the third argument.  */
10465   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10466
10467   DEFSYM (Qcall_process, "call-process");
10468   /* Target PROGRAM is the first argument.  */
10469   Fput (Qcall_process, Qtarget_idx, make_number (0));
10470
10471   DEFSYM (Qcall_process_region, "call-process-region");
10472   /* Target PROGRAM is the third argument.  */
10473   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10474
10475   DEFSYM (Qstart_process, "start-process");
10476   /* Target PROGRAM is the third argument.  */
10477   Fput (Qstart_process, Qtarget_idx, make_number (2));
10478
10479   DEFSYM (Qopen_network_stream, "open-network-stream");
10480   /* Target SERVICE is the fourth argument.  */
10481   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10482
10483   DEFSYM (Qcoding_system, "coding-system");
10484   DEFSYM (Qcoding_aliases, "coding-aliases");
10485
10486   DEFSYM (Qeol_type, "eol-type");
10487   DEFSYM (Qunix, "unix");
10488   DEFSYM (Qdos, "dos");
10489
10490   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10491   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10492   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10493   DEFSYM (Qdefault_char, "default-char");
10494   DEFSYM (Qundecided, "undecided");
10495   DEFSYM (Qno_conversion, "no-conversion");
10496   DEFSYM (Qraw_text, "raw-text");
10497
10498   DEFSYM (Qiso_2022, "iso-2022");
10499
10500   DEFSYM (Qutf_8, "utf-8");
10501   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10502
10503   DEFSYM (Qutf_16, "utf-16");
10504   DEFSYM (Qbig, "big");
10505   DEFSYM (Qlittle, "little");
10506
10507   DEFSYM (Qshift_jis, "shift-jis");
10508   DEFSYM (Qbig5, "big5");
10509
10510   DEFSYM (Qcoding_system_p, "coding-system-p");
10511
10512   DEFSYM (Qcoding_system_error, "coding-system-error");
10513   Fput (Qcoding_system_error, Qerror_conditions,
10514         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10515   Fput (Qcoding_system_error, Qerror_message,
10516         make_pure_c_string ("Invalid coding system"));
10517
10518   /* Intern this now in case it isn't already done.
10519      Setting this variable twice is harmless.
10520      But don't staticpro it here--that is done in alloc.c.  */
10521   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10522
10523   DEFSYM (Qtranslation_table, "translation-table");
10524   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10525   DEFSYM (Qtranslation_table_id, "translation-table-id");
10526   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10527   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10528
10529   DEFSYM (Qvalid_codes, "valid-codes");
10530
10531   DEFSYM (Qemacs_mule, "emacs-mule");
10532
10533   DEFSYM (QCcategory, ":category");
10534   DEFSYM (QCmnemonic, ":mnemonic");
10535   DEFSYM (QCdefault_char, ":default-char");
10536   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10537   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10538   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10539   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10540   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10541
10542   Vcoding_category_table
10543     = Fmake_vector (make_number (coding_category_max), Qnil);
10544   staticpro (&Vcoding_category_table);
10545   /* The following are targets of code detection.  */
10546   ASET (Vcoding_category_table, coding_category_iso_7,
10547         intern_c_string ("coding-category-iso-7"));
10548   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10549         intern_c_string ("coding-category-iso-7-tight"));
10550   ASET (Vcoding_category_table, coding_category_iso_8_1,
10551         intern_c_string ("coding-category-iso-8-1"));
10552   ASET (Vcoding_category_table, coding_category_iso_8_2,
10553         intern_c_string ("coding-category-iso-8-2"));
10554   ASET (Vcoding_category_table, coding_category_iso_7_else,
10555         intern_c_string ("coding-category-iso-7-else"));
10556   ASET (Vcoding_category_table, coding_category_iso_8_else,
10557         intern_c_string ("coding-category-iso-8-else"));
10558   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10559         intern_c_string ("coding-category-utf-8-auto"));
10560   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10561         intern_c_string ("coding-category-utf-8"));
10562   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10563         intern_c_string ("coding-category-utf-8-sig"));
10564   ASET (Vcoding_category_table, coding_category_utf_16_be,
10565         intern_c_string ("coding-category-utf-16-be"));
10566   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10567         intern_c_string ("coding-category-utf-16-auto"));
10568   ASET (Vcoding_category_table, coding_category_utf_16_le,
10569         intern_c_string ("coding-category-utf-16-le"));
10570   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10571         intern_c_string ("coding-category-utf-16-be-nosig"));
10572   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10573         intern_c_string ("coding-category-utf-16-le-nosig"));
10574   ASET (Vcoding_category_table, coding_category_charset,
10575         intern_c_string ("coding-category-charset"));
10576   ASET (Vcoding_category_table, coding_category_sjis,
10577         intern_c_string ("coding-category-sjis"));
10578   ASET (Vcoding_category_table, coding_category_big5,
10579         intern_c_string ("coding-category-big5"));
10580   ASET (Vcoding_category_table, coding_category_ccl,
10581         intern_c_string ("coding-category-ccl"));
10582   ASET (Vcoding_category_table, coding_category_emacs_mule,
10583         intern_c_string ("coding-category-emacs-mule"));
10584   /* The following are NOT targets of code detection.  */
10585   ASET (Vcoding_category_table, coding_category_raw_text,
10586         intern_c_string ("coding-category-raw-text"));
10587   ASET (Vcoding_category_table, coding_category_undecided,
10588         intern_c_string ("coding-category-undecided"));
10589
10590   DEFSYM (Qinsufficient_source, "insufficient-source");
10591   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10592   DEFSYM (Qinvalid_source, "invalid-source");
10593   DEFSYM (Qinterrupted, "interrupted");
10594   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10595   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10596
10597   defsubr (&Scoding_system_p);
10598   defsubr (&Sread_coding_system);
10599   defsubr (&Sread_non_nil_coding_system);
10600   defsubr (&Scheck_coding_system);
10601   defsubr (&Sdetect_coding_region);
10602   defsubr (&Sdetect_coding_string);
10603   defsubr (&Sfind_coding_systems_region_internal);
10604   defsubr (&Sunencodable_char_position);
10605   defsubr (&Scheck_coding_systems_region);
10606   defsubr (&Sdecode_coding_region);
10607   defsubr (&Sencode_coding_region);
10608   defsubr (&Sdecode_coding_string);
10609   defsubr (&Sencode_coding_string);
10610   defsubr (&Sdecode_sjis_char);
10611   defsubr (&Sencode_sjis_char);
10612   defsubr (&Sdecode_big5_char);
10613   defsubr (&Sencode_big5_char);
10614   defsubr (&Sset_terminal_coding_system_internal);
10615   defsubr (&Sset_safe_terminal_coding_system_internal);
10616   defsubr (&Sterminal_coding_system);
10617   defsubr (&Sset_keyboard_coding_system_internal);
10618   defsubr (&Skeyboard_coding_system);
10619   defsubr (&Sfind_operation_coding_system);
10620   defsubr (&Sset_coding_system_priority);
10621   defsubr (&Sdefine_coding_system_internal);
10622   defsubr (&Sdefine_coding_system_alias);
10623   defsubr (&Scoding_system_put);
10624   defsubr (&Scoding_system_base);
10625   defsubr (&Scoding_system_plist);
10626   defsubr (&Scoding_system_aliases);
10627   defsubr (&Scoding_system_eol_type);
10628   defsubr (&Scoding_system_priority_list);
10629
10630   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10631                doc: /* List of coding systems.
10632
10633 Do not alter the value of this variable manually.  This variable should be
10634 updated by the functions `define-coding-system' and
10635 `define-coding-system-alias'.  */);
10636   Vcoding_system_list = Qnil;
10637
10638   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10639                doc: /* Alist of coding system names.
10640 Each element is a one-element list containing a coding system name.
10641 This variable is given to `completing-read' as COLLECTION argument.
10642
10643 Do not alter the value of this variable manually.  This variable should be
10644 updated by the functions `make-coding-system' and
10645 `define-coding-system-alias'.  */);
10646   Vcoding_system_alist = Qnil;
10647
10648   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10649                doc: /* List of coding-categories (symbols) ordered by priority.
10650
10651 On detecting a coding system, Emacs tries code detection algorithms
10652 associated with each coding-category one by one in this order.  When
10653 one algorithm agrees with a byte sequence of source text, the coding
10654 system bound to the corresponding coding-category is selected.
10655
10656 Don't modify this variable directly, but use `set-coding-priority'.  */);
10657   {
10658     int i;
10659
10660     Vcoding_category_list = Qnil;
10661     for (i = coding_category_max - 1; i >= 0; i--)
10662       Vcoding_category_list
10663         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10664                  Vcoding_category_list);
10665   }
10666
10667   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10668                doc: /* Specify the coding system for read operations.
10669 It is useful to bind this variable with `let', but do not set it globally.
10670 If the value is a coding system, it is used for decoding on read operation.
10671 If not, an appropriate element is used from one of the coding system alists.
10672 There are three such tables: `file-coding-system-alist',
10673 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10674   Vcoding_system_for_read = Qnil;
10675
10676   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10677                doc: /* Specify the coding system for write operations.
10678 Programs bind this variable with `let', but you should not set it globally.
10679 If the value is a coding system, it is used for encoding of output,
10680 when writing it to a file and when sending it to a file or subprocess.
10681
10682 If this does not specify a coding system, an appropriate element
10683 is used from one of the coding system alists.
10684 There are three such tables: `file-coding-system-alist',
10685 `process-coding-system-alist', and `network-coding-system-alist'.
10686 For output to files, if the above procedure does not specify a coding system,
10687 the value of `buffer-file-coding-system' is used.  */);
10688   Vcoding_system_for_write = Qnil;
10689
10690   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10691                doc: /*
10692 Coding system used in the latest file or process I/O.  */);
10693   Vlast_coding_system_used = Qnil;
10694
10695   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10696                doc: /*
10697 Error status of the last code conversion.
10698
10699 When an error was detected in the last code conversion, this variable
10700 is set to one of the following symbols.
10701   `insufficient-source'
10702   `inconsistent-eol'
10703   `invalid-source'
10704   `interrupted'
10705   `insufficient-memory'
10706 When no error was detected, the value doesn't change.  So, to check
10707 the error status of a code conversion by this variable, you must
10708 explicitly set this variable to nil before performing code
10709 conversion.  */);
10710   Vlast_code_conversion_error = Qnil;
10711
10712   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10713                doc: /*
10714 *Non-nil means always inhibit code conversion of end-of-line format.
10715 See info node `Coding Systems' and info node `Text and Binary' concerning
10716 such conversion.  */);
10717   inhibit_eol_conversion = 0;
10718
10719   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10720                doc: /*
10721 Non-nil means process buffer inherits coding system of process output.
10722 Bind it to t if the process output is to be treated as if it were a file
10723 read from some filesystem.  */);
10724   inherit_process_coding_system = 0;
10725
10726   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10727                doc: /*
10728 Alist to decide a coding system to use for a file I/O operation.
10729 The format is ((PATTERN . VAL) ...),
10730 where PATTERN is a regular expression matching a file name,
10731 VAL is a coding system, a cons of coding systems, or a function symbol.
10732 If VAL is a coding system, it is used for both decoding and encoding
10733 the file contents.
10734 If VAL is a cons of coding systems, the car part is used for decoding,
10735 and the cdr part is used for encoding.
10736 If VAL is a function symbol, the function must return a coding system
10737 or a cons of coding systems which are used as above.  The function is
10738 called with an argument that is a list of the arguments with which
10739 `find-operation-coding-system' was called.  If the function can't decide
10740 a coding system, it can return `undecided' so that the normal
10741 code-detection is performed.
10742
10743 See also the function `find-operation-coding-system'
10744 and the variable `auto-coding-alist'.  */);
10745   Vfile_coding_system_alist = Qnil;
10746
10747   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10748                doc: /*
10749 Alist to decide a coding system to use for a process I/O operation.
10750 The format is ((PATTERN . VAL) ...),
10751 where PATTERN is a regular expression matching a program name,
10752 VAL is a coding system, a cons of coding systems, or a function symbol.
10753 If VAL is a coding system, it is used for both decoding what is received
10754 from the program and encoding what is sent to the program.
10755 If VAL is a cons of coding systems, the car part is used for decoding,
10756 and the cdr part is used for encoding.
10757 If VAL is a function symbol, the function must return a coding system
10758 or a cons of coding systems which are used as above.
10759
10760 See also the function `find-operation-coding-system'.  */);
10761   Vprocess_coding_system_alist = Qnil;
10762
10763   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10764                doc: /*
10765 Alist to decide a coding system to use for a network I/O operation.
10766 The format is ((PATTERN . VAL) ...),
10767 where PATTERN is a regular expression matching a network service name
10768 or is a port number to connect to,
10769 VAL is a coding system, a cons of coding systems, or a function symbol.
10770 If VAL is a coding system, it is used for both decoding what is received
10771 from the network stream and encoding what is sent to the network stream.
10772 If VAL is a cons of coding systems, the car part is used for decoding,
10773 and the cdr part is used for encoding.
10774 If VAL is a function symbol, the function must return a coding system
10775 or a cons of coding systems which are used as above.
10776
10777 See also the function `find-operation-coding-system'.  */);
10778   Vnetwork_coding_system_alist = Qnil;
10779
10780   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10781                doc: /* Coding system to use with system messages.
10782 Also used for decoding keyboard input on X Window system.  */);
10783   Vlocale_coding_system = Qnil;
10784
10785   /* The eol mnemonics are reset in startup.el system-dependently.  */
10786   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10787                doc: /*
10788 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10789   eol_mnemonic_unix = make_pure_c_string (":");
10790
10791   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10792                doc: /*
10793 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10794   eol_mnemonic_dos = make_pure_c_string ("\\");
10795
10796   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10797                doc: /*
10798 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10799   eol_mnemonic_mac = make_pure_c_string ("/");
10800
10801   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10802                doc: /*
10803 *String displayed in mode line when end-of-line format is not yet determined.  */);
10804   eol_mnemonic_undecided = make_pure_c_string (":");
10805
10806   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10807                doc: /*
10808 *Non-nil enables character translation while encoding and decoding.  */);
10809   Venable_character_translation = Qt;
10810
10811   DEFVAR_LISP ("standard-translation-table-for-decode",
10812                &Vstandard_translation_table_for_decode,
10813                doc: /* Table for translating characters while decoding.  */);
10814   Vstandard_translation_table_for_decode = Qnil;
10815
10816   DEFVAR_LISP ("standard-translation-table-for-encode",
10817                &Vstandard_translation_table_for_encode,
10818                doc: /* Table for translating characters while encoding.  */);
10819   Vstandard_translation_table_for_encode = Qnil;
10820
10821   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10822                doc: /* Alist of charsets vs revision numbers.
10823 While encoding, if a charset (car part of an element) is found,
10824 designate it with the escape sequence identifying revision (cdr part
10825 of the element).  */);
10826   Vcharset_revision_table = Qnil;
10827
10828   DEFVAR_LISP ("default-process-coding-system",
10829                &Vdefault_process_coding_system,
10830                doc: /* Cons of coding systems used for process I/O by default.
10831 The car part is used for decoding a process output,
10832 the cdr part is used for encoding a text to be sent to a process.  */);
10833   Vdefault_process_coding_system = Qnil;
10834
10835   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10836                doc: /*
10837 Table of extra Latin codes in the range 128..159 (inclusive).
10838 This is a vector of length 256.
10839 If Nth element is non-nil, the existence of code N in a file
10840 \(or output of subprocess) doesn't prevent it from being detected as
10841 a coding system of ISO 2022 variant which has a flag
10842 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10843 or reading output of a subprocess.
10844 Only 128th through 159th elements have a meaning.  */);
10845   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10846
10847   DEFVAR_LISP ("select-safe-coding-system-function",
10848                &Vselect_safe_coding_system_function,
10849                doc: /*
10850 Function to call to select safe coding system for encoding a text.
10851
10852 If set, this function is called to force a user to select a proper
10853 coding system which can encode the text in the case that a default
10854 coding system used in each operation can't encode the text.  The
10855 function should take care that the buffer is not modified while
10856 the coding system is being selected.
10857
10858 The default value is `select-safe-coding-system' (which see).  */);
10859   Vselect_safe_coding_system_function = Qnil;
10860
10861   DEFVAR_BOOL ("coding-system-require-warning",
10862                &coding_system_require_warning,
10863                doc: /* Internal use only.
10864 If non-nil, on writing a file, `select-safe-coding-system-function' is
10865 called even if `coding-system-for-write' is non-nil.  The command
10866 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10867   coding_system_require_warning = 0;
10868
10869
10870   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10871                &inhibit_iso_escape_detection,
10872                doc: /*
10873 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10874
10875 When Emacs reads text, it tries to detect how the text is encoded.
10876 This code detection is sensitive to escape sequences.  If Emacs sees
10877 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10878 of the ISO2022 encodings, and decodes text by the corresponding coding
10879 system (e.g. `iso-2022-7bit').
10880
10881 However, there may be a case that you want to read escape sequences in
10882 a file as is.  In such a case, you can set this variable to non-nil.
10883 Then the code detection will ignore any escape sequences, and no text is
10884 detected as encoded in some ISO-2022 encoding.  The result is that all
10885 escape sequences become visible in a buffer.
10886
10887 The default value is nil, and it is strongly recommended not to change
10888 it.  That is because many Emacs Lisp source files that contain
10889 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10890 in Emacs's distribution, and they won't be decoded correctly on
10891 reading if you suppress escape sequence detection.
10892
10893 The other way to read escape sequences in a file without decoding is
10894 to explicitly specify some coding system that doesn't use ISO-2022
10895 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10896   inhibit_iso_escape_detection = 0;
10897
10898   DEFVAR_BOOL ("inhibit-null-byte-detection",
10899                &inhibit_null_byte_detection,
10900                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10901 By default, Emacs treats it as binary data, and does not attempt to
10902 decode it.  The effect is as if you specified `no-conversion' for
10903 reading that text.
10904
10905 Set this to non-nil when a regular text happens to include null bytes.
10906 Examples are Index nodes of Info files and null-byte delimited output
10907 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10908 decode text as usual.  */);
10909   inhibit_null_byte_detection = 0;
10910
10911   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10912                doc: /* Char table for translating self-inserting characters.
10913 This is applied to the result of input methods, not their input.
10914 See also `keyboard-translate-table'.
10915
10916 Use of this variable for character code unification was rendered
10917 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10918 internal character representation.  */);
10919     Vtranslation_table_for_input = Qnil;
10920
10921   {
10922     Lisp_Object args[coding_arg_max];
10923     Lisp_Object plist[16];
10924     int i;
10925
10926     for (i = 0; i < coding_arg_max; i++)
10927       args[i] = Qnil;
10928
10929     plist[0] = intern_c_string (":name");
10930     plist[1] = args[coding_arg_name] = Qno_conversion;
10931     plist[2] = intern_c_string (":mnemonic");
10932     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10933     plist[4] = intern_c_string (":coding-type");
10934     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10935     plist[6] = intern_c_string (":ascii-compatible-p");
10936     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10937     plist[8] = intern_c_string (":default-char");
10938     plist[9] = args[coding_arg_default_char] = make_number (0);
10939     plist[10] = intern_c_string (":for-unibyte");
10940     plist[11] = args[coding_arg_for_unibyte] = Qt;
10941     plist[12] = intern_c_string (":docstring");
10942     plist[13] = make_pure_c_string ("Do no conversion.\n\
10943 \n\
10944 When you visit a file with this coding, the file is read into a\n\
10945 unibyte buffer as is, thus each byte of a file is treated as a\n\
10946 character.");
10947     plist[14] = intern_c_string (":eol-type");
10948     plist[15] = args[coding_arg_eol_type] = Qunix;
10949     args[coding_arg_plist] = Flist (16, plist);
10950     Fdefine_coding_system_internal (coding_arg_max, args);
10951
10952     plist[1] = args[coding_arg_name] = Qundecided;
10953     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10954     plist[5] = args[coding_arg_coding_type] = Qundecided;
10955     /* This is already set.
10956        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10957     plist[8] = intern_c_string (":charset-list");
10958     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10959     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10960     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10961     plist[15] = args[coding_arg_eol_type] = Qnil;
10962     args[coding_arg_plist] = Flist (16, plist);
10963     Fdefine_coding_system_internal (coding_arg_max, args);
10964   }
10965
10966   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10967
10968   {
10969     int i;
10970
10971     for (i = 0; i < coding_category_max; i++)
10972       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10973   }
10974 #if defined (MSDOS) || defined (WINDOWSNT)
10975   system_eol_type = Qdos;
10976 #else
10977   system_eol_type = Qunix;
10978 #endif
10979   staticpro (&system_eol_type);
10980 }
10981
10982 char *
10983 emacs_strerror (error_number)
10984      int error_number;
10985 {
10986   char *str;
10987
10988   synchronize_system_messages_locale ();
10989   str = strerror (error_number);
10990
10991   if (! NILP (Vlocale_coding_system))
10992     {
10993       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10994                                                       Vlocale_coding_system,
10995                                                       0);
10996       str = (char *) SDATA (dec);
10997     }
10998
10999   return str;
11000 }
11001
11002 #endif /* emacs */
11003
11004 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
11005    (do not change this comment) */