src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009, 2010
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  60   the C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking up
  63   Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we set up a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
 161   const unsigned char *src = coding->source, *src_base;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;
 167
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the source is exhausted,
 171          jump to no_more_source:.  ONE_MORE_BYTE consults SRC_BASE.  */
 172       src_base = src;
 173       ONE_MORE_BYTE (c);
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
 176       if (__C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source was exhausted without finding an invalid byte.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
 195   CODING->charbuf_size.
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers where each loop iteration started; the loop
 212      exits on insufficient source or when CHARBUF has no room left.  */
 213   const unsigned char *src_base;
 214   /* A buffer to produce decoded characters.  */
 215   int *charbuf = coding->charbuf + coding->charbuf_used;
 216   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 217   int multibytep = coding->src_multibyte;
 218   int consumed_chars = 0, c;    /* Both are used by ONE_MORE_BYTE.  */
 219
 220   while (1)
 221     {
 222       src_base = src;
 223       if (charbuf >= charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it. */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && (coding->mode & CODING_MODE_LAST_BLOCK))
 233     /* If the source ends by partial bytes to construct a character,
 234        treat them as eight-bit raw data.  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  This template
 238      equates them; for a multibyte source the two counts differ.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
 257   DST_BYTES zero means that the source area and the destination area
 258   overlap, which means that we can produce encoded text until it
 259   reaches the head of the not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
 267   int multibytep = coding->dst_multibyte;
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274
 275   /* Stop when CHARBUF is used up or DST may lack room for one more step.  */
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST.  */ }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292 #include <setjmp.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302 #include "frame.h"
 303 #include "termhooks.h"
 304
 305 Lisp_Object Vcoding_system_hash_table;
 306
 307 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 308 Lisp_Object Qunix, Qdos;
 309 extern Lisp_Object Qmac;        /* frame.c */
 310 Lisp_Object Qbuffer_file_coding_system;
 311 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 315 Lisp_Object Qbig, Qlittle;
 316 Lisp_Object Qcoding_system_history;
 317 Lisp_Object Qvalid_codes;
 318 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 319 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 320 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 321 Lisp_Object QCascii_compatible_p;
 322
 323 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 324 Lisp_Object Qcall_process, Qcall_process_region;
 325 Lisp_Object Qstart_process, Qopen_network_stream;
 326 Lisp_Object Qtarget_idx;
 327
 328 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 329 Lisp_Object Qinterrupted, Qinsufficient_memory;
 330
 331 extern Lisp_Object Qcompletion_ignore_case;
 332
 333 /* If a symbol has this property, evaluate the value to define the
 334    symbol as a coding system.  */
 335 static Lisp_Object Qcoding_system_define_form;
 336
 337 int coding_system_require_warning;
 338
 339 Lisp_Object Vselect_safe_coding_system_function;
 340
 341 /* Mnemonic string for each format of end-of-line.  */
 342 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 343 /* Mnemonic string to indicate format of end-of-line is not yet
 344    decided.  */
 345 Lisp_Object eol_mnemonic_undecided;
 346
 347 /* Format of end-of-line decided by system.  This is Qunix on
 348    Unix and Mac, Qdos on DOS/Windows.
 349    This has an effect only for external encoding (i.e. for output to
 350    file and process), not for in-buffer or Lisp string encoding.  */
 351 static Lisp_Object system_eol_type;
 352
 353 #ifdef emacs
 354
 355 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 356
 357 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 358
 359 /* Coding system emacs-mule and raw-text are for converting only
 360    end-of-line format.  */
 361 Lisp_Object Qemacs_mule, Qraw_text;
 362 Lisp_Object Qutf_8_emacs;
 363
 364 /* Coding-systems are handed between Emacs Lisp programs and C internal
 365    routines by the following three variables.  */
 366 /* Coding-system for reading files and receiving data from process.  */
 367 Lisp_Object Vcoding_system_for_read;
 368 /* Coding-system for writing files and sending data to process.  */
 369 Lisp_Object Vcoding_system_for_write;
 370 /* Coding-system actually used in the latest I/O.  */
 371 Lisp_Object Vlast_coding_system_used;
 372 /* Set to non-nil when an error is detected during code conversion.  */
 373 Lisp_Object Vlast_code_conversion_error;
 374 /* A vector of length 256 which contains information about special
 375    Latin codes (especially for dealing with Microsoft codes).  */
 376 Lisp_Object Vlatin_extra_code_table;
 377
 378 /* Flag to inhibit code conversion of end-of-line format.  */
 379 int inhibit_eol_conversion;
 380
 381 /* Flag to inhibit ISO2022 escape sequence detection.  */
 382 int inhibit_iso_escape_detection;
 383
 384 /* Flag to inhibit detection of binary files through null bytes.  */
 385 int inhibit_null_byte_detection;
 386
 387 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 388 int inherit_process_coding_system;
 389
 390 /* Coding system to be used to encode text for terminal display when
 391    terminal coding system is nil.  */
 392 struct coding_system safe_terminal_coding;
 393
 394 Lisp_Object Vfile_coding_system_alist;
 395 Lisp_Object Vprocess_coding_system_alist;
 396 Lisp_Object Vnetwork_coding_system_alist;
 397
 398 Lisp_Object Vlocale_coding_system;
 399
 400 #endif /* emacs */
 401
 402 /* Flag to tell if we look up translation table on character code
 403    conversion.  */
 404 Lisp_Object Venable_character_translation;
 405 /* Standard translation table to look up on decoding (reading).  */
 406 Lisp_Object Vstandard_translation_table_for_decode;
 407 /* Standard translation table to look up on encoding (writing).  */
 408 Lisp_Object Vstandard_translation_table_for_encode;
 409
 410 Lisp_Object Qtranslation_table;
 411 Lisp_Object Qtranslation_table_id;
 412 Lisp_Object Qtranslation_table_for_decode;
 413 Lisp_Object Qtranslation_table_for_encode;
 414
 415 /* Alist of charsets vs revision number.  */
 416 static Lisp_Object Vcharset_revision_table;
 417
 418 /* Default coding systems used for process I/O.  */
 419 Lisp_Object Vdefault_process_coding_system;
 420
 421 /* Char table for translating Quail and self-inserting input.  */
 422 Lisp_Object Vtranslation_table_for_input;
 423
 424 /* Two special coding systems.  */
 425 Lisp_Object Vsjis_coding_system;
 426 Lisp_Object Vbig5_coding_system;
 427
 428 /* ISO2022 section */
 429 /* Integer at index REG of CODING's `coding_attr_iso_initial' attribute.  */
 430 #define CODING_ISO_INITIAL(coding, reg)                 \
 431   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 432                      coding_attr_iso_initial),          \
 433                reg)))
 434
 435 /* (coding)->safe_charsets[charset_id]; -1 if 255 or id > max_charset_id.  */
 436 #define CODING_ISO_REQUEST(coding, charset_id)          \
 437   (((charset_id) <= (coding)->max_charset_id            \
 438     ? ((coding)->safe_charsets[charset_id] != 255       \
 439        ? (coding)->safe_charsets[charset_id]            \
 440        : -1)                                            \
 441     : -1))
 442
 443 /* Accessors for the state kept in CODING->spec.iso_2022.  */
 444 #define CODING_ISO_FLAGS(coding)        \
 445   ((coding)->spec.iso_2022.flags)
 446 #define CODING_ISO_DESIGNATION(coding, reg)     \
 447   ((coding)->spec.iso_2022.current_designation[reg])
 448 #define CODING_ISO_INVOCATION(coding, plane)    \
 449   ((coding)->spec.iso_2022.current_invocation[plane])
 450 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 451   ((coding)->spec.iso_2022.single_shifting)
 452 #define CODING_ISO_BOL(coding)  \
 453   ((coding)->spec.iso_2022.bol)
 454 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 455   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 456 #define CODING_ISO_CMP_STATUS(coding)   \
 457   (&(coding)->spec.iso_2022.cmp_status)
 458 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 459   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 460 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 461   ((coding)->spec.iso_2022.embedded_utf_8)
 462
 463 /* Control characters of ISO2022.  */
 464                         /* code */      /* function */
 465 #define ISO_CODE_LF     0x0A            /* line-feed */
 466 #define ISO_CODE_CR     0x0D            /* carriage-return */
 467 #define ISO_CODE_SO     0x0E            /* shift-out */
 468 #define ISO_CODE_SI     0x0F            /* shift-in */
 469 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 470 #define ISO_CODE_ESC    0x1B            /* escape */
 471 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 472 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 473 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 474
 475 /* Every 1-byte code of ISO2022 is classified into one of the
 476    following classes.  */
 477 enum iso_code_class_type
 478   {
 479     ISO_control_0,              /* Control codes in the range
 480                                    0x00..0x1F and 0x7F, except for the
 481                                    following 4 codes.  */
 482     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 483     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 484     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 485     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 486     ISO_control_1,              /* Control codes in the range
 487                                    0x80..0x9F, except for the
 488                                    following 3 codes.  */
 489     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 490     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 491     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 492     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 493     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 494     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 495     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 496   };
 497
 498 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
 499     `iso-flags' attribute of an iso2022 coding system.  */
 500
 501 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 502    instead of the correct short-form sequence (e.g. ESC $ A).  */
 503 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 504
 505 /* If set, reset graphic planes and registers at end-of-line to the
 506    initial state.  */
 507 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 508
 509 /* If set, reset graphic planes and registers before any control
 510    characters to the initial state.  */
 511 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 512
 513 /* If set, encode by 7-bit environment.  */
 514 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 515
 516 /* If set, use locking-shift function.  */
 517 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 518
 519 /* If set, use single-shift function.  Overwrite
 520    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 521 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 522
 523 /* If set, use designation escape sequence.  */
 524 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 525
 526 /* If set, produce revision number sequence.  */
 527 #define CODING_ISO_FLAG_REVISION        0x0080
 528
 529 /* If set, produce ISO6429's direction specifying sequence.  */
 530 #define CODING_ISO_FLAG_DIRECTION       0x0100
 531
 532 /* If set, assume designation states are reset at beginning of line on
 533    output.  */
 534 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 535
 536 /* If set, designation sequence should be placed at beginning of line
 537    on output.  */
 538 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 539
 540 /* If set, do not encode unsafe characters on output.  */
 541 #define CODING_ISO_FLAG_SAFE            0x0800
 542
 543 /* If set, extra latin codes (128..159) are accepted as a valid code
 544    on input.  */
 545 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 546
 547 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 548
 549 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 550
 551 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 552
 553 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 554
 555 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000 /* 0x20000..0x80000 are not defined in this list.  */
 556
 557 /* A character to be produced on output if encoding of the original
 558    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 559 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 560
 561 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 562 #define CODING_UTF_8_BOM(coding)        \
 563   ((coding)->spec.utf_8_bom)
 564
 565 /* UTF-16 section: accessors for the fields of CODING->spec.utf_16.  */
 566 #define CODING_UTF_16_BOM(coding)       \
 567   ((coding)->spec.utf_16.bom)
 568
 569 #define CODING_UTF_16_ENDIAN(coding)    \
 570   ((coding)->spec.utf_16.endian)
 571
 572 #define CODING_UTF_16_SURROGATE(coding) \
 573   ((coding)->spec.utf_16.surrogate)
 574
 575
 576 /* CCL section: each macro reads an attribute looked up via CODING->id.  */
 577 #define CODING_CCL_DECODER(coding)      \
 578   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 579 #define CODING_CCL_ENCODER(coding)      \
 580   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 581 #define CODING_CCL_VALIDS(coding)                                          \
 582   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 583
 584 /* Index into `coding_categories'; also the CATEGORY_MASK_XXX bit number.  */
 585
 586 enum coding_category
 587   {
 588     coding_category_iso_7,
 589     coding_category_iso_7_tight,
 590     coding_category_iso_8_1,
 591     coding_category_iso_8_2,
 592     coding_category_iso_7_else,
 593     coding_category_iso_8_else,
 594     coding_category_utf_8_auto,
 595     coding_category_utf_8_nosig,
 596     coding_category_utf_8_sig,
 597     coding_category_utf_16_auto,
 598     coding_category_utf_16_be,
 599     coding_category_utf_16_le,
 600     coding_category_utf_16_be_nosig,
 601     coding_category_utf_16_le_nosig,
 602     coding_category_charset,
 603     coding_category_sjis,
 604     coding_category_big5,
 605     coding_category_ccl,
 606     coding_category_emacs_mule,
 607     /* All above are targets of code detection.  */
 608     coding_category_raw_text,
 609     coding_category_undecided,
 610     coding_category_max         /* Count of the above; used as an array size.  */
 611   };
 612
 613 /* Definitions of flag bits used in detect_coding_XXXX.  */
 614 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 615 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 616 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 617 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 618 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 619 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 620 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 621 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 622 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 623 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 624 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 625 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 626 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 627 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 628 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 629 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 630 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 631 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 632 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 633 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 634
 635 /* Returned if detect_coding_mask () finds nothing other than ASCII
 636    characters.  It is every mask above except CATEGORY_MASK_RAW_TEXT.  */
 637 #define CATEGORY_MASK_ANY               \
 638   (CATEGORY_MASK_ISO_7                  \
 639    | CATEGORY_MASK_ISO_7_TIGHT          \
 640    | CATEGORY_MASK_ISO_8_1              \
 641    | CATEGORY_MASK_ISO_8_2              \
 642    | CATEGORY_MASK_ISO_7_ELSE           \
 643    | CATEGORY_MASK_ISO_8_ELSE           \
 644    | CATEGORY_MASK_UTF_8_AUTO           \
 645    | CATEGORY_MASK_UTF_8_NOSIG          \
 646    | CATEGORY_MASK_UTF_8_SIG            \
 647    | CATEGORY_MASK_UTF_16_AUTO          \
 648    | CATEGORY_MASK_UTF_16_BE            \
 649    | CATEGORY_MASK_UTF_16_LE            \
 650    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 651    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 652    | CATEGORY_MASK_CHARSET              \
 653    | CATEGORY_MASK_SJIS                 \
 654    | CATEGORY_MASK_BIG5                 \
 655    | CATEGORY_MASK_CCL                  \
 656    | CATEGORY_MASK_EMACS_MULE)
 657
 658 /* Unions of related category masks.  */
 659 #define CATEGORY_MASK_ISO_7BIT \
 660   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 661
 662 #define CATEGORY_MASK_ISO_8BIT \
 663   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 664
 665 #define CATEGORY_MASK_ISO_ELSE \
 666   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 667
 668 #define CATEGORY_MASK_ISO_ESCAPE        \
 669   (CATEGORY_MASK_ISO_7                  \
 670    | CATEGORY_MASK_ISO_7_TIGHT          \
 671    | CATEGORY_MASK_ISO_7_ELSE           \
 672    | CATEGORY_MASK_ISO_8_ELSE)
 673
 674 #define CATEGORY_MASK_ISO       \
 675   (  CATEGORY_MASK_ISO_7BIT     \
 676      | CATEGORY_MASK_ISO_8BIT   \
 677      | CATEGORY_MASK_ISO_ELSE)
 678
 679 #define CATEGORY_MASK_UTF_16            \
 680   (CATEGORY_MASK_UTF_16_AUTO            \
 681    | CATEGORY_MASK_UTF_16_BE            \
 682    | CATEGORY_MASK_UTF_16_LE            \
 683    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 684    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 685
 686 #define CATEGORY_MASK_UTF_8     \
 687   (CATEGORY_MASK_UTF_8_AUTO     \
 688    | CATEGORY_MASK_UTF_8_NOSIG  \
 689    | CATEGORY_MASK_UTF_8_SIG)
 690
 691 /* List of symbols `coding-category-xxx' ordered by priority.  This
 692    variable is exposed to Emacs Lisp.  */
 693 static Lisp_Object Vcoding_category_list;
 694
 695 /* Table of coding categories (Lisp symbols).  This variable is for
 696    internal use only.  */
 697 static Lisp_Object Vcoding_category_table;
 698
 699 /* Table of coding-categories ordered by priority.  */
 700 static enum coding_category coding_priorities[coding_category_max];
 701
 702 /* Nth element is a coding context for the coding system bound to the
 703    Nth coding category.  */
 704 static struct coding_system coding_categories[coding_category_max];
 705
 706 /*** Commonly used macros and functions ***/
 707 /* Fallback definitions; note that each argument may be evaluated twice.  */
 708 #ifndef min
 709 #define min(a, b) ((a) < (b) ? (a) : (b))
 710 #endif
 711 #ifndef max
 712 #define max(a, b) ((a) > (b) ? (a) : (b))
 713 #endif
 714 /* Set ATTRS to CODING's attributes and CHARSET_LIST to their charset list.  */
 715 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 716   do {                                                  \
 717     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 718     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 719   } while (0)
 720
 721
 722 /* Safely get one byte from the source text pointed to by SRC, which
 723    ends at SRC_END, set C to it, and increment CONSUMED_CHARS.  If SRC
 724    has reached SRC_END, jump to `no_more_source' (after recording
 725    CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC).  If multibytep
 726    is nonzero and a multibyte character is found at SRC, set C to minus
 727    its code and record CODING_RESULT_INVALID_SRC.  The caller provides:
 728         coding, src, src_base, src_end, multibytep, consumed_chars */
 729
 730 #define ONE_MORE_BYTE(c)                                \
 731   do {                                                  \
 732     if (src == src_end)                                 \
 733       {                                                 \
 734         if (src_base < src)                             \
 735           record_conversion_result                      \
 736             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 737         goto no_more_source;                            \
 738       }                                                 \
 739     c = *src++;                                         \
 740     if (multibytep && (c & 0x80))                       \
 741       {                                                 \
 742         if ((c & 0xFE) == 0xC0) /* 0xC0 or 0xC1 lead */ \
 743           c = ((c & 1) << 6) | *src++;                  \
 744         else                                            \
 745           {                                             \
 746             src--;                                      \
 747             c = - string_char (src, &src, NULL);        \
 748             record_conversion_result                    \
 749               (coding, CODING_RESULT_INVALID_SRC);      \
 750           }                                             \
 751       }                                                 \
 752     consumed_chars++;                                   \
 753   } while (0)
 754
 755 /* Safely get two bytes from the source text pointed to by SRC, which
 756    ends at SRC_END, and set C1 and C2 to those bytes while skipping any
 757    multibyte characters that precede C1.  If there are not enough bytes
 758    in the source, it jumps to `no_more_source'.  If multibytep is
 759    nonzero and a multibyte character is found for C2, set C2 to -1
 760    (SRC is advanced past its leading byte only).  The caller should
 761    declare and set these variables appropriately in advance:
 762         src, src_end, multibytep
 763    It is intended that this macro is used in detect_coding_utf_16.  */
 764
 765 #define TWO_MORE_BYTES(c1, c2)                          \
 766   do {                                                  \
 767     do {                                                \
 768       if (src == src_end)                               \
 769         goto no_more_source;                            \
 770       c1 = *src++;                                      \
 771       if (multibytep && (c1 & 0x80))                    \
 772         {                                               \
 773           if ((c1 & 0xFE) == 0xC0)                      \
 774             c1 = ((c1 & 1) << 6) | *src++;              \
 775           else                                          \
 776             {                                           \
 777               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 778               c1 = -1;                                  \
 779             }                                           \
 780         }                                               \
 781     } while (c1 < 0);                                   \
 782     if (src == src_end)                                 \
 783       goto no_more_source;                              \
 784     c2 = *src++;                                        \
 785     if (multibytep && (c2 & 0x80))                      \
 786       {                                                 \
 787         if ((c2 & 0xFE) == 0xC0)                        \
 788           c2 = ((c2 & 1) << 6) | *src++;                \
 789         else                                            \
 790           c2 = -1;                                      \
 791       }                                                 \
 792   } while (0)
 793
 794
     /* Get one byte from the source text pointed by SRC into C and
        advance SRC, WITHOUT first checking that SRC has not reached the
        end of the source; the caller must already know that a byte is
        available.  If multibytep is nonzero and the byte has its high
        bit set: a leading byte of 0xC0 or 0xC1 is folded with the
        following byte into a single value; otherwise SRC is backed up,
        the whole multibyte character is read with string_char, C is set
        to the negative of that character code, and
        CODING_RESULT_INVALID_SRC is recorded.  CONSUMED_CHARS is
        incremented in every case.  The caller should declare and set
        these variables appropriately in advance:
             src, multibytep, consumed_chars, coding  */
 795 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 796   do {                                                  \
 797     c = *src++;                                         \
 798     if (multibytep && (c & 0x80))                       \
 799       {                                                 \
 800         if ((c & 0xFE) == 0xC0)                         \
 801           c = ((c & 1) << 6) | *src++;                  \
 802         else                                            \
 803           {                                             \
 804             src--;                                      \
 805             c = - string_char (src, &src, NULL);        \
 806             record_conversion_result                    \
 807               (coding, CODING_RESULT_INVALID_SRC);      \
 808           }                                             \
 809       }                                                 \
 810     consumed_chars++;                                   \
 811   } while (0)
 812
 813
 814 /* Store a byte C in the place pointed by DST and increment DST to the
 815    next free point, and increment PRODUCED_CHARS.  The caller should
 816    assure that C is 0..127, and declare and set the variable `dst'
 817    appropriately in advance.
 818 */
 819
 820
 821 #define EMIT_ONE_ASCII_BYTE(c)  \
 822   do {                          \
 823     produced_chars++;           \
 824     *dst++ = (c);               \
 825   } while (0)
 826
 827
 828 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 829
 830 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 831   do {                                  \
 832     produced_chars += 2;                \
 833     *dst++ = (c1), *dst++ = (c2);       \
 834   } while (0)
 835
 836
 837 /* Store a byte C in the place pointed by DST and increment DST to the
 838    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 839    nonzero, store in an appropriate multibyte from.  The caller should
 840    declare and set the variables `dst' and `multibytep' appropriately
 841    in advance.  */
 842
 843 #define EMIT_ONE_BYTE(c)                \
 844   do {                                  \
 845     produced_chars++;                   \
 846     if (multibytep)                     \
 847       {                                 \
 848         int ch = (c);                   \
 849         if (ch >= 0x80)                 \
 850           ch = BYTE8_TO_CHAR (ch);      \
 851         CHAR_STRING_ADVANCE (ch, dst);  \
 852       }                                 \
 853     else                                \
 854       *dst++ = (c);                     \
 855   } while (0)
 856
 857
 858 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 859
 860 #define EMIT_TWO_BYTES(c1, c2)          \
 861   do {                                  \
 862     produced_chars += 2;                \
 863     if (multibytep)                     \
 864       {                                 \
 865         int ch;                         \
 866                                         \
 867         ch = (c1);                      \
 868         if (ch >= 0x80)                 \
 869           ch = BYTE8_TO_CHAR (ch);      \
 870         CHAR_STRING_ADVANCE (ch, dst);  \
 871         ch = (c2);                      \
 872         if (ch >= 0x80)                 \
 873           ch = BYTE8_TO_CHAR (ch);      \
 874         CHAR_STRING_ADVANCE (ch, dst);  \
 875       }                                 \
 876     else                                \
 877       {                                 \
 878         *dst++ = (c1);                  \
 879         *dst++ = (c2);                  \
 880       }                                 \
 881   } while (0)
 882
 883
 884 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 885   do {                                  \
 886     EMIT_ONE_BYTE (c1);                 \
 887     EMIT_TWO_BYTES (c2, c3);            \
 888   } while (0)
 889
 890
 891 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 892   do {                                          \
 893     EMIT_TWO_BYTES (c1, c2);                    \
 894     EMIT_TWO_BYTES (c3, c4);                    \
 895   } while (0)
 896
 897
 898 /* Prototypes for static functions.  */
     /* Helper that stores a conversion result code in CODING.  */
 899 static void record_conversion_result P_ ((struct coding_system *coding,
 900                                           enum coding_result_code result));
     /* Detect/decode/encode handlers, one triple per coding-system kind:
        UTF-8, UTF-16, ISO 2022, emacs-mule, Shift-JIS, BIG5, CCL.  */
 901 static int detect_coding_utf_8 P_ ((struct coding_system *,
 902                                     struct coding_detection_info *info));
 903 static void decode_coding_utf_8 P_ ((struct coding_system *));
 904 static int encode_coding_utf_8 P_ ((struct coding_system *));
 905
 906 static int detect_coding_utf_16 P_ ((struct coding_system *,
 907                                      struct coding_detection_info *info));
 908 static void decode_coding_utf_16 P_ ((struct coding_system *));
 909 static int encode_coding_utf_16 P_ ((struct coding_system *));
 910
 911 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 912                                        struct coding_detection_info *info));
 913 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 914 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 915
 916 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 917                                          struct coding_detection_info *info));
 918 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 919 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 920
 921 static int detect_coding_sjis P_ ((struct coding_system *,
 922                                    struct coding_detection_info *info));
 923 static void decode_coding_sjis P_ ((struct coding_system *));
 924 static int encode_coding_sjis P_ ((struct coding_system *));
 925
 926 static int detect_coding_big5 P_ ((struct coding_system *,
 927                                    struct coding_detection_info *info));
 928 static void decode_coding_big5 P_ ((struct coding_system *));
 929 static int encode_coding_big5 P_ ((struct coding_system *));
 930
 931 static int detect_coding_ccl P_ ((struct coding_system *,
 932                                   struct coding_detection_info *info));
 933 static void decode_coding_ccl P_ ((struct coding_system *));
 934 static int encode_coding_ccl P_ ((struct coding_system *));
 935
     /* Raw text has decode and encode handlers only.  */
 936 static void decode_coding_raw_text P_ ((struct coding_system *));
 937 static int encode_coding_raw_text P_ ((struct coding_system *));
 938
     /* Source/destination pointer maintenance and destination growth.  */
 939 static void coding_set_source P_ ((struct coding_system *));
 940 static void coding_set_destination P_ ((struct coding_system *));
 941 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 942 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 943                                             EMACS_INT, EMACS_INT));
 944 static unsigned char *alloc_destination P_ ((struct coding_system *,
 945                                              EMACS_INT, unsigned char *));
     /* Remaining file-local helpers.  */
 946 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 947 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 948                                                      int *, int *,
 949                                                      unsigned char *));
 950 static int detect_eol P_ ((const unsigned char *,
 951                            EMACS_INT, enum coding_category));
 952 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 953 static void decode_eol P_ ((struct coding_system *));
 954 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 955 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
 956 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 957 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 958                                         EMACS_INT));
 959 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 960 static int decode_coding P_ ((struct coding_system *));
 961 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 962                                                       struct coding_system *,
 963                                                       int *, EMACS_INT *));
 964 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 965                                                   struct coding_system *,
 966                                                   int *, EMACS_INT *));
 967 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 968 static int encode_coding P_ ((struct coding_system *));
 969 static Lisp_Object make_conversion_work_buffer P_ ((int));
 970 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 971 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 972 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 973
 974 static void
 975 record_conversion_result (struct coding_system *coding,
 976                           enum coding_result_code result)
 977 {
 978   coding->result = result;
 979   switch (result)
 980     {
 981     case CODING_RESULT_INSUFFICIENT_SRC:
 982       Vlast_code_conversion_error = Qinsufficient_source;
 983       break;
 984     case CODING_RESULT_INCONSISTENT_EOL:
 985       Vlast_code_conversion_error = Qinconsistent_eol;
 986       break;
 987     case CODING_RESULT_INVALID_SRC:
 988       Vlast_code_conversion_error = Qinvalid_source;
 989       break;
 990     case CODING_RESULT_INTERRUPT:
 991       Vlast_code_conversion_error = Qinterrupted;
 992       break;
 993     case CODING_RESULT_INSUFFICIENT_MEM:
 994       Vlast_code_conversion_error = Qinsufficient_memory;
 995       break;
 996     case CODING_RESULT_INSUFFICIENT_DST:
 997       /* Don't record this error in Vlast_code_conversion_error
 998          because it happens just temporarily and is resolved when the
 999          whole conversion is finished.  */
1000       break;
1001     case CODING_RESULT_SUCCESS:
1002       break;
1003     default:
1004       Vlast_code_conversion_error = intern ("Unknown error");
1005     }
1006 }
1007
1008 /* This wrapper macro is used to preserve validity of pointers into
1009    buffer text across calls to decode_char, which could cause
1010    relocation of buffers if it loads a charset map, because loading a
1011    charset map allocates large structures.
        It clears charset_map_loaded, decodes CODE in CHARSET into C with
        DECODE_CHAR, and, if charset_map_loaded is nonzero afterwards,
        calls coding_set_source and shifts SRC, SRC_BASE and SRC_END by
        the distance CODING->source moved.  SRC, SRC_BASE and SRC_END
        must therefore be modifiable lvalues, and CODING is evaluated
        more than once.  */
1012 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
1013   do {                                                                       \
1014     charset_map_loaded = 0;                                                  \
1015     c = DECODE_CHAR (charset, code);                                         \
1016     if (charset_map_loaded)                                                  \
1017       {                                                                      \
1018         const unsigned char *orig = coding->source;                          \
1019         EMACS_INT offset;                                                    \
1020                                                                              \
1021         coding_set_source (coding);                                          \
1022         offset = coding->source - orig;                                      \
1023         src += offset;                                                       \
1024         src_base += offset;                                                  \
1025         src_end += offset;                                                   \
1026       }                                                                      \
1027   } while (0)
1028
1029
1030 /* If there are at least BYTES length of room at dst, allocate memory
1031    for coding->destination and update dst and dst_end.  We don't have
1032    to take care of coding->source which will be relocated.  It is
1033    handled by calling coding_set_source in encode_coding.  */
1034
1035 #define ASSURE_DESTINATION(bytes)                               \
1036   do {                                                          \
1037     if (dst + (bytes) >= dst_end)                               \
1038       {                                                         \
1039         int more_bytes = charbuf_end - charbuf + (bytes);       \
1040                                                                 \
1041         dst = alloc_destination (coding, more_bytes, dst);      \
1042         dst_end = coding->destination + coding->dst_bytes;      \
1043       }                                                         \
1044   } while (0)
1045
1046
1047 /* Store multibyte form of the character C in P, and advance P to the
1048    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
1049    never calls MAYBE_UNIFY_CHAR.  */
1050
1051 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
1052   do {                                          \
1053     if ((c) <= MAX_1_BYTE_CHAR)                 \
1054       *(p)++ = (c);                             \
1055     else if ((c) <= MAX_2_BYTE_CHAR)            \
1056       *(p)++ = (0xC0 | ((c) >> 6)),             \
1057         *(p)++ = (0x80 | ((c) & 0x3F));         \
1058     else if ((c) <= MAX_3_BYTE_CHAR)            \
1059       *(p)++ = (0xE0 | ((c) >> 12)),            \
1060         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
1061         *(p)++ = (0x80 | ((c) & 0x3F));         \
1062     else if ((c) <= MAX_4_BYTE_CHAR)            \
1063       *(p)++ = (0xF0 | (c >> 18)),              \
1064         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
1065         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
1066         *(p)++ = (0x80 | (c & 0x3F));           \
1067     else if ((c) <= MAX_5_BYTE_CHAR)            \
1068       *(p)++ = 0xF8,                            \
1069         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
1070         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
1071         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
1072         *(p)++ = (0x80 | (c & 0x3F));           \
1073     else                                        \
1074       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
1075   } while (0)
1076
1077
1078 /* Return the character code of character whose multibyte form is at
1079    P, and advance P to the end of the multibyte form.  This is like
1080    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

        The length of the form is taken from the leading byte P[0] alone:
        one byte if bit 0x80 is clear, otherwise two bytes if bit 0x20 is
        clear, three if bit 0x10 is clear, four if bit 0x08 is clear, and
        five otherwise.  Trailing bytes are only masked with 0x3F; they
        are not validated.  For a two-byte form whose leading byte is
        below 0xC2, 0x3FFF80 is OR-ed into the result.  In the five-byte
        form the leading byte contributes no bits.  P is evaluated many
        times and must be a modifiable lvalue without side effects.  */
1081
1082 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
1083   (!((p)[0] & 0x80)                                             \
1084    ? *(p)++                                                     \
1085    : ! ((p)[0] & 0x20)                                          \
1086    ? ((p) += 2,                                                 \
1087       ((((p)[-2] & 0x1F) << 6)                                  \
1088        | ((p)[-1] & 0x3F)                                       \
1089        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
1090    : ! ((p)[0] & 0x10)                                          \
1091    ? ((p) += 3,                                                 \
1092       ((((p)[-3] & 0x0F) << 12)                                 \
1093        | (((p)[-2] & 0x3F) << 6)                                \
1094        | ((p)[-1] & 0x3F)))                                     \
1095    : ! ((p)[0] & 0x08)                                          \
1096    ? ((p) += 4,                                                 \
1097       ((((p)[-4] & 0xF) << 18)                                  \
1098        | (((p)[-3] & 0x3F) << 12)                               \
1099        | (((p)[-2] & 0x3F) << 6)                                \
1100        | ((p)[-1] & 0x3F)))                                     \
1101    : ((p) += 5,                                                 \
1102       ((((p)[-4] & 0x3F) << 18)                                 \
1103        | (((p)[-3] & 0x3F) << 12)                               \
1104        | (((p)[-2] & 0x3F) << 6)                                \
1105        | ((p)[-1] & 0x3F))))
1106
1107
1108 static void
1109 coding_set_source (coding)
1110      struct coding_system *coding;
1111 {
1112   if (BUFFERP (coding->src_object))
1113     {
1114       struct buffer *buf = XBUFFER (coding->src_object);
1115
1116       if (coding->src_pos < 0)
1117         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1118       else
1119         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1120     }
1121   else if (STRINGP (coding->src_object))
1122     {
1123       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1124     }
1125   else
1126     /* Otherwise, the source is C string and is never relocated
1127        automatically.  Thus we don't have to update anything.  */
1128     ;
1129 }
1130
1131 static void
1132 coding_set_destination (coding)
1133      struct coding_system *coding;
1134 {
1135   if (BUFFERP (coding->dst_object))
1136     {
1137       if (coding->src_pos < 0)
1138         {
1139           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1140           coding->dst_bytes = (GAP_END_ADDR
1141                                - (coding->src_bytes - coding->consumed)
1142                                - coding->destination);
1143         }
1144       else
1145         {
1146           /* We are sure that coding->dst_pos_byte is before the gap
1147              of the buffer. */
1148           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1149                                  + coding->dst_pos_byte - BEG_BYTE);
1150           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1151                                - coding->destination);
1152         }
1153     }
1154   else
1155     /* Otherwise, the destination is C string and is never relocated
1156        automatically.  Thus we don't have to update anything.  */
1157     ;
1158 }
1159
1160
1161 static void
1162 coding_alloc_by_realloc (coding, bytes)
1163      struct coding_system *coding;
1164      EMACS_INT bytes;
1165 {
1166   coding->destination = (unsigned char *) xrealloc (coding->destination,
1167                                                     coding->dst_bytes + bytes);
1168   coding->dst_bytes += bytes;
1169 }
1170
1171 static void
1172 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1173      struct coding_system *coding;
1174      EMACS_INT gap_head_used, bytes;
1175 {
1176   if (EQ (coding->src_object, coding->dst_object))
1177     {
1178       /* The gap may contain the produced data at the head and not-yet
1179          consumed data at the tail.  To preserve those data, we at
1180          first make the gap size to zero, then increase the gap
1181          size.  */
1182       EMACS_INT add = GAP_SIZE;
1183
1184       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1185       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1186       make_gap (bytes);
1187       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1188       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1189     }
1190   else
1191     {
1192       Lisp_Object this_buffer;
1193
1194       this_buffer = Fcurrent_buffer ();
1195       set_buffer_internal (XBUFFER (coding->dst_object));
1196       make_gap (bytes);
1197       set_buffer_internal (XBUFFER (this_buffer));
1198     }
1199 }
1200
1201
1202 static unsigned char *
1203 alloc_destination (coding, nbytes, dst)
1204      struct coding_system *coding;
1205      EMACS_INT nbytes;
1206      unsigned char *dst;
1207 {
1208   EMACS_INT offset = dst - coding->destination;
1209
1210   if (BUFFERP (coding->dst_object))
1211     {
1212       struct buffer *buf = XBUFFER (coding->dst_object);
1213
1214       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1215     }
1216   else
1217     coding_alloc_by_realloc (coding, nbytes);
1218   coding_set_destination (coding);
1219   dst = coding->destination + offset;
1220   return dst;
1221 }
1222
1223 /** Macros for annotations.  */
1224
1225 /* An annotation data is stored in the array coding->charbuf in this
1226    format:
1227      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1228    LENGTH is the number of elements in the annotation.
1229    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1230    NCHARS is the number of characters in the text annotated.
1231
1232    The format of the following elements depends on ANNOTATION_MASK.
1233
1234    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1235    follow:
1236      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1237
1238    NBYTES is the number of bytes specified in the header part of
1239    old-style emacs-mule encoding, or 0 for the other kind of
1240    composition.
1241
1242    METHOD is one of enum composition_method.
1243
1244    Optional COMPOSITION-COMPONENTS are characters and composition
1245    rules.
1246
1247    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1248    follows.
1249
1250    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1251    recover from an invalid annotation, and should be skipped by
1252    produce_annotation.  */
1253
1254 /* Maximum length of the header of annotation data, counted in
        charbuf elements.  ADD_COMPOSITION_DATA writes five header
        elements (-LENGTH, ANNOTATION_MASK, NCHARS, NBYTES, METHOD) and
        ADD_CHARSET_DATA writes four.  */
1255 #define MAX_ANNOTATION_LENGTH 5
1256
1257 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1258   do {                                                  \
1259     *(buf)++ = -(len);                                  \
1260     *(buf)++ = (mask);                                  \
1261     *(buf)++ = (nchars);                                \
1262     coding->annotated = 1;                              \
1263   } while (0);
1264
1265 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1266   do {                                                                      \
1267     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1268     *buf++ = nbytes;                                                        \
1269     *buf++ = method;                                                        \
1270   } while (0)
1271
1272
1273 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1274   do {                                                                  \
1275     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1276     *buf++ = id;                                                        \
1277   } while (0)
1278
1279 \f
1280 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1281
1282
1283
1284 \f
1285 /*** 3. UTF-8 ***/
1286
1287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1288    Check if a text is encoded in UTF-8.  If it is, return 1, else
1289    return 0.  (This describes detect_coding_utf_8, defined after the
        helper macros that follow.)  */
1290
     /* Classify the octet C by its high bits: a value below 0x80, a
        trailing octet of the form 10xxxxxx, or the leading octet of a
        2-, 3-, 4- or 5-octet sequence (110xxxxx, 1110xxxx, 11110xxx,
        111110xx respectively).  */
1291 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1292 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1293 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1294 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1295 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1296 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1297
     /* The byte order mark character, and the three octets that the
        UTF-8 detector and decoder below compare against to recognize
        it.  */
1298 #define UTF_BOM 0xFEFF
1299 #define UTF_8_BOM_1 0xEF
1300 #define UTF_8_BOM_2 0xBB
1301 #define UTF_8_BOM_3 0xBF
1302
     /* Check whether the source text of CODING looks like UTF-8 and
        record the verdict in DETECT_INFO.

        CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  The
        scan starts after the first CODING->head_ascii bytes and walks the
        text one sequence at a time; it only checks that each leading
        octet is followed by the right number of trailing octets.

        Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
        if a malformed sequence is met, or if the source ends in the
        middle of a sequence while CODING_MODE_LAST_BLOCK is set.
        Otherwise return 1 once the source is exhausted, after updating
        DETECT_INFO->found and ->rejected according to whether a BOM and
        any multi-octet sequence were seen.  */
1303 static int
1304 detect_coding_utf_8 (coding, detect_info)
1305      struct coding_system *coding;
1306      struct coding_detection_info *detect_info;
1307 {
1308   const unsigned char *src = coding->source, *src_base;
1309   const unsigned char *src_end = coding->source + coding->src_bytes;
1310   int multibytep = coding->src_multibyte;
       /* consumed_chars is not read here; it is updated by ONE_MORE_BYTE.  */
1311   int consumed_chars = 0;
1312   int bom_found = 0;
1313   int found = 0;
1314
1315   detect_info->checked |= CATEGORY_MASK_UTF_8;
1316   /* A coding system of this category is always ASCII compatible.  */
1317   src += coding->head_ascii;
1318
       /* The loop is left either by `break' (malformed input, handled
          right after the loop) or by ONE_MORE_BYTE jumping to
          `no_more_source' when the source is exhausted.  */
1319   while (1)
1320     {
1321       int c, c1, c2, c3, c4;
1322
1323       src_base = src;
1324       ONE_MORE_BYTE (c);
1325       if (c < 0 || UTF_8_1_OCTET_P (c))
1326         continue;
1327       ONE_MORE_BYTE (c1);
1328       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1329         break;
1330       if (UTF_8_2_OCTET_LEADING_P (c))
1331         {
1332           found = 1;
1333           continue;
1334         }
1335       ONE_MORE_BYTE (c2);
1336       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1337         break;
1338       if (UTF_8_3_OCTET_LEADING_P (c))
1339         {
1340           found = 1;
               /* A BOM counts only at the very start of the source; since
                  the scan starts at coding->source + head_ascii, that can
                  only match when head_ascii is zero.  */
1341           if (src_base == coding->source
1342               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1343             bom_found = 1;
1344           continue;
1345         }
1346       ONE_MORE_BYTE (c3);
1347       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1348         break;
1349       if (UTF_8_4_OCTET_LEADING_P (c))
1350         {
1351           found = 1;
1352           continue;
1353         }
1354       ONE_MORE_BYTE (c4);
1355       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1356         break;
1357       if (UTF_8_5_OCTET_LEADING_P (c))
1358         {
1359           found = 1;
1360           continue;
1361         }
1362       break;
1363     }
       /* Reached only via `break': the text is not UTF-8.  */
1364   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1365   return 0;
1366
1367  no_more_source:
       /* src_base < src means the source ended inside a sequence.  */
1368   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1369     {
1370       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1371       return 0;
1372     }
1373   if (bom_found)
1374     {
1375       /* The first character U+FEFF (UTF_BOM) doesn't necessarily mean
              a BOM, so both the with-signature and the without-signature
              categories are reported as found.  */
1376       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1377     }
1378   else
1379     {
1380       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1381       if (found)
1382         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1383     }
1384   return 1;
1385 }
1386
1387
     /* Decode UTF-8 source bytes of CODING into character codes.

        Input is read from CODING->source starting at offset
        CODING->consumed; decoded codes are appended to CODING->charbuf
        starting at index CODING->charbuf_used.  Decoding stops when the
        charbuf is full or when ONE_MORE_BYTE finds the source exhausted
        and jumps to `no_more_source'.  On exit CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are updated to the
        last position at which a whole character had been decoded.

        Malformed, overlong, surrogate and out-of-range sequences are not
        dropped: their first byte is emitted on its own (via BYTE8_TO_CHAR
        when it is not ASCII), CODING->errors is incremented, and decoding
        resumes at the following byte.  */
1388 static void
1389 decode_coding_utf_8 (coding)
1390      struct coding_system *coding;
1391 {
1392   const unsigned char *src = coding->source + coding->consumed;
1393   const unsigned char *src_end = coding->source + coding->src_bytes;
1394   const unsigned char *src_base;
1395   int *charbuf = coding->charbuf + coding->charbuf_used;
1396   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1397   int consumed_chars = 0, consumed_chars_base = 0;
1398   int multibytep = coding->src_multibyte;
1399   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1400   Lisp_Object attr, charset_list;
1401   int eol_crlf =
1402     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Holds the byte already read after a CR, or -1 if there is none.  */
1403   int byte_after_cr = -1;
1404
1405   CODING_GET_INFO (coding, attr, charset_list);
1406
       /* Unless the coding system is known to have no BOM, look for one
          at the current position and skip it if present; otherwise
          rewind SRC to where it was.
          NOTE(review): on the rewind paths only SRC is restored;
          consumed_chars keeps the increments made by ONE_MORE_BYTE, so
          the count later added to coding->consumed_char looks too high
          by the number of bytes probed -- confirm whether that is
          intended.  */
1407   if (bom != utf_without_bom)
1408     {
1409       int c1, c2, c3;
1410
1411       src_base = src;
1412       ONE_MORE_BYTE (c1);
1413       if (! UTF_8_3_OCTET_LEADING_P (c1))
1414         src = src_base;
1415       else
1416         {
1417           ONE_MORE_BYTE (c2);
1418           if (! UTF_8_EXTRA_OCTET_P (c2))
1419             src = src_base;
1420           else
1421             {
1422               ONE_MORE_BYTE (c3);
1423               if (! UTF_8_EXTRA_OCTET_P (c3))
1424                 src = src_base;
1425               else
1426                 {
1427                   if ((c1 != UTF_8_BOM_1)
1428                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1429                     src = src_base;
1430                   else
1431                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1432                 }
1433             }
1434         }
1435     }
       /* The BOM is looked for only once per coding system object.
          NOTE(review): this unconditional assignment makes the one in
          the innermost `else' above redundant.  */
1436   CODING_UTF_8_BOM (coding) = utf_without_bom;
1437
1438
1439
1440   while (1)
1441     {
1442       int c, c1, c2, c3, c4, c5;
1443
           /* Remember where this character starts, so that an invalid or
              truncated sequence can be rewound.  */
1444       src_base = src;
1445       consumed_chars_base = consumed_chars;
1446
1447       if (charbuf >= charbuf_end)
1448         {
               /* The byte pre-read after a CR has not been decoded yet;
                  step back so that it is read again next time.  */
1449           if (byte_after_cr >= 0)
1450             src_base--;
1451           break;
1452         }
1453
1454       if (byte_after_cr >= 0)
1455         c1 = byte_after_cr, byte_after_cr = -1;
1456       else
1457         ONE_MORE_BYTE (c1);
1458       if (c1 < 0)
1459         {
               /* ONE_MORE_BYTE returned the negative of a character that
                  was already in multibyte form; pass it through.  */
1460           c = - c1;
1461         }
1462       else if (UTF_8_1_OCTET_P(c1))
1463         {
               /* With DOS end-of-line conversion, read the byte following
                  a CR right away; if the source ends here, the jump to
                  no_more_source leaves the CR unconsumed.  */
1464           if (eol_crlf && c1 == '\r')
1465             ONE_MORE_BYTE (byte_after_cr);
1466           c = c1;
1467         }
1468       else
1469         {
1470           ONE_MORE_BYTE (c2);
1471           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1472             goto invalid_code;
1473           if (UTF_8_2_OCTET_LEADING_P (c1))
1474             {
1475               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1476               /* Reject overlong sequences here and below.  Encoders
1477                  producing them are incorrect, they can be misleading,
1478                  and they mess up read/write invariance.  */
1479               if (c < 128)
1480                 goto invalid_code;
1481             }
1482           else
1483             {
1484               ONE_MORE_BYTE (c3);
1485               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1486                 goto invalid_code;
1487               if (UTF_8_3_OCTET_LEADING_P (c1))
1488                 {
1489                   c = (((c1 & 0xF) << 12)
1490                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1491                   if (c < 0x800
1492                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1493                     goto invalid_code;
1494                 }
1495               else
1496                 {
1497                   ONE_MORE_BYTE (c4);
1498                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1499                     goto invalid_code;
1500                   if (UTF_8_4_OCTET_LEADING_P (c1))
1501                     {
1502                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1503                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1504                     if (c < 0x10000)
1505                       goto invalid_code;
1506                     }
1507                   else
1508                     {
1509                       ONE_MORE_BYTE (c5);
1510                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1511                         goto invalid_code;
1512                       if (UTF_8_5_OCTET_LEADING_P (c1))
1513                         {
1514                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1515                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1516                                | (c5 & 0x3F));
1517                           if ((c > MAX_CHAR) || (c < 0x200000))
1518                             goto invalid_code;
1519                         }
1520                       else
1521                         goto invalid_code;
1522                     }
1523                 }
1524             }
1525         }
1526
1527       *charbuf++ = c;
1528       continue;
1529
1530     invalid_code:
           /* Rewind to the start of the bad sequence and emit just its
              first byte; the remaining bytes are decoded afresh.  */
1531       src = src_base;
1532       consumed_chars = consumed_chars_base;
1533       ONE_MORE_BYTE (c);
1534       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1535       coding->errors++;
1536     }
1537
1538  no_more_source:
       /* Report progress up to the start of the last, possibly
          incomplete, character.  */
1539   coding->consumed_char += consumed_chars_base;
1540   coding->consumed = src_base - coding->source;
1541   coding->charbuf_used = charbuf - coding->charbuf;
1542 }
1543
1544
1545 static int
1546 encode_coding_utf_8 (coding)
1547      struct coding_system *coding;
1548 {
1549   int multibytep = coding->dst_multibyte;
1550   int *charbuf = coding->charbuf;
1551   int *charbuf_end = charbuf + coding->charbuf_used;
1552   unsigned char *dst = coding->destination + coding->produced;
1553   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1554   int produced_chars = 0;
1555   int c;
1556
1557   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1558     {
1559       ASSURE_DESTINATION (3);
1560       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1561       CODING_UTF_8_BOM (coding) = utf_without_bom;
1562     }
1563
1564   if (multibytep)
1565     {
1566       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1567
1568       while (charbuf < charbuf_end)
1569         {
1570           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1571
1572           ASSURE_DESTINATION (safe_room);
1573           c = *charbuf++;
1574           if (CHAR_BYTE8_P (c))
1575             {
1576               c = CHAR_TO_BYTE8 (c);
1577               EMIT_ONE_BYTE (c);
1578             }
1579           else
1580             {
1581               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1582               for (p = str; p < pend; p++)
1583                 EMIT_ONE_BYTE (*p);
1584             }
1585         }
1586     }
1587   else
1588     {
1589       int safe_room = MAX_MULTIBYTE_LENGTH;
1590
1591       while (charbuf < charbuf_end)
1592         {
1593           ASSURE_DESTINATION (safe_room);
1594           c = *charbuf++;
1595           if (CHAR_BYTE8_P (c))
1596             *dst++ = CHAR_TO_BYTE8 (c);
1597           else
1598             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1599           produced_chars++;
1600         }
1601     }
1602   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1603   coding->produced_char += produced_chars;
1604   coding->produced = dst - coding->destination;
1605   return 0;
1606 }
1607
1608
1609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1610    Check if a text is encoded in one of UTF-16 based coding systems.
1611    If it is, return 1, else return 0.  */
1612
/* Predicates on a UTF-16 code unit VAL.  A surrogate is recognized by
   the six bits selected by the mask 0xFC00: they equal 0xD800 for a
   high (leading) surrogate and 0xDC00 for a low (trailing) one.  */

#define UTF_16_HIGH_SURROGATE_P(val) \
  ((((val) & 0xFC00) ^ 0xD800) == 0)

#define UTF_16_LOW_SURROGATE_P(val) \
  ((((val) & 0xFC00) ^ 0xDC00) == 0)

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */

#define UTF_16_INVALID_P(val)		\
  (((val) == 0xFFFE)			\
   || ((val) == 0xFFFF)			\
   || ((((val) & 0xFC00) ^ 0xDC00) == 0))
1623
1624
1625 static int
1626 detect_coding_utf_16 (coding, detect_info)
1627      struct coding_system *coding;
1628      struct coding_detection_info *detect_info;
1629 {
1630   const unsigned char *src = coding->source, *src_base = src;
1631   const unsigned char *src_end = coding->source + coding->src_bytes;
1632   int multibytep = coding->src_multibyte;
1633   int consumed_chars = 0;
1634   int c1, c2;
1635
1636   detect_info->checked |= CATEGORY_MASK_UTF_16;
1637   if (coding->mode & CODING_MODE_LAST_BLOCK
1638       && (coding->src_chars & 1))
1639     {
1640       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1641       return 0;
1642     }
1643
1644   TWO_MORE_BYTES (c1, c2);
1645   if ((c1 == 0xFF) && (c2 == 0xFE))
1646     {
1647       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1648                              | CATEGORY_MASK_UTF_16_AUTO);
1649       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1650                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1651                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1652     }
1653   else if ((c1 == 0xFE) && (c2 == 0xFF))
1654     {
1655       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1656                              | CATEGORY_MASK_UTF_16_AUTO);
1657       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1658                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1659                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1660     }
1661   else if (c2 < 0)
1662     {
1663       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1664       return 0;
1665     }
1666   else
1667     {
1668       /* We check the dispersion of Eth and Oth bytes where E is even and
1669          O is odd.  If both are high, we assume binary data.*/
1670       unsigned char e[256], o[256];
1671       unsigned e_num = 1, o_num = 1;
1672
1673       memset (e, 0, 256);
1674       memset (o, 0, 256);
1675       e[c1] = 1;
1676       o[c2] = 1;
1677
1678       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1679                                 |CATEGORY_MASK_UTF_16_BE
1680                                 | CATEGORY_MASK_UTF_16_LE);
1681
1682       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1683              != CATEGORY_MASK_UTF_16)
1684         {
1685           TWO_MORE_BYTES (c1, c2);
1686           if (c2 < 0)
1687             break;
1688           if (! e[c1])
1689             {
1690               e[c1] = 1;
1691               e_num++;
1692               if (e_num >= 128)
1693                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1694             }
1695           if (! o[c2])
1696             {
1697               o[c2] = 1;
1698               o_num++;
1699               if (o_num >= 128)
1700                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1701             }
1702         }
1703       return 0;
1704     }
1705
1706  no_more_source:
1707   return 1;
1708 }
1709
1710 static void
1711 decode_coding_utf_16 (coding)
1712      struct coding_system *coding;
1713 {
1714   const unsigned char *src = coding->source + coding->consumed;
1715   const unsigned char *src_end = coding->source + coding->src_bytes;
1716   const unsigned char *src_base;
1717   int *charbuf = coding->charbuf + coding->charbuf_used;
1718   /* We may produces at most 3 chars in one loop.  */
1719   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1720   int consumed_chars = 0, consumed_chars_base = 0;
1721   int multibytep = coding->src_multibyte;
1722   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1723   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1724   int surrogate = CODING_UTF_16_SURROGATE (coding);
1725   Lisp_Object attr, charset_list;
1726   int eol_crlf =
1727     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1728   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1729
1730   CODING_GET_INFO (coding, attr, charset_list);
1731
1732   if (bom == utf_with_bom)
1733     {
1734       int c, c1, c2;
1735
1736       src_base = src;
1737       ONE_MORE_BYTE (c1);
1738       ONE_MORE_BYTE (c2);
1739       c = (c1 << 8) | c2;
1740
1741       if (endian == utf_16_big_endian
1742           ? c != 0xFEFF : c != 0xFFFE)
1743         {
1744           /* The first two bytes are not BOM.  Treat them as bytes
1745              for a normal character.  */
1746           src = src_base;
1747           coding->errors++;
1748         }
1749       CODING_UTF_16_BOM (coding) = utf_without_bom;
1750     }
1751   else if (bom == utf_detect_bom)
1752     {
1753       /* We have already tried to detect BOM and failed in
1754          detect_coding.  */
1755       CODING_UTF_16_BOM (coding) = utf_without_bom;
1756     }
1757
1758   while (1)
1759     {
1760       int c, c1, c2;
1761
1762       src_base = src;
1763       consumed_chars_base = consumed_chars;
1764
1765       if (charbuf >= charbuf_end)
1766         {
1767           if (byte_after_cr1 >= 0)
1768             src_base -= 2;
1769           break;
1770         }
1771
1772       if (byte_after_cr1 >= 0)
1773         c1 = byte_after_cr1, byte_after_cr1 = -1;
1774       else
1775         ONE_MORE_BYTE (c1);
1776       if (c1 < 0)
1777         {
1778           *charbuf++ = -c1;
1779           continue;
1780         }
1781       if (byte_after_cr2 >= 0)
1782         c2 = byte_after_cr2, byte_after_cr2 = -1;
1783       else
1784         ONE_MORE_BYTE (c2);
1785       if (c2 < 0)
1786         {
1787           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1788           *charbuf++ = -c2;
1789           continue;
1790         }
1791       c = (endian == utf_16_big_endian
1792            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1793
1794       if (surrogate)
1795         {
1796           if (! UTF_16_LOW_SURROGATE_P (c))
1797             {
1798               if (endian == utf_16_big_endian)
1799                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1800               else
1801                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1802               *charbuf++ = c1;
1803               *charbuf++ = c2;
1804               coding->errors++;
1805               if (UTF_16_HIGH_SURROGATE_P (c))
1806                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1807               else
1808                 *charbuf++ = c;
1809             }
1810           else
1811             {
1812               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1813               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1814               *charbuf++ = 0x10000 + c;
1815             }
1816         }
1817       else
1818         {
1819           if (UTF_16_HIGH_SURROGATE_P (c))
1820             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1821           else
1822             {
1823               if (eol_crlf && c == '\r')
1824                 {
1825                   ONE_MORE_BYTE (byte_after_cr1);
1826                   ONE_MORE_BYTE (byte_after_cr2);
1827                 }
1828               *charbuf++ = c;
1829             }
1830         }
1831     }
1832
1833  no_more_source:
1834   coding->consumed_char += consumed_chars_base;
1835   coding->consumed = src_base - coding->source;
1836   coding->charbuf_used = charbuf - coding->charbuf;
1837 }
1838
1839 static int
1840 encode_coding_utf_16 (coding)
1841      struct coding_system *coding;
1842 {
1843   int multibytep = coding->dst_multibyte;
1844   int *charbuf = coding->charbuf;
1845   int *charbuf_end = charbuf + coding->charbuf_used;
1846   unsigned char *dst = coding->destination + coding->produced;
1847   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1848   int safe_room = 8;
1849   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1850   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1851   int produced_chars = 0;
1852   Lisp_Object attrs, charset_list;
1853   int c;
1854
1855   CODING_GET_INFO (coding, attrs, charset_list);
1856
1857   if (bom != utf_without_bom)
1858     {
1859       ASSURE_DESTINATION (safe_room);
1860       if (big_endian)
1861         EMIT_TWO_BYTES (0xFE, 0xFF);
1862       else
1863         EMIT_TWO_BYTES (0xFF, 0xFE);
1864       CODING_UTF_16_BOM (coding) = utf_without_bom;
1865     }
1866
1867   while (charbuf < charbuf_end)
1868     {
1869       ASSURE_DESTINATION (safe_room);
1870       c = *charbuf++;
1871       if (c > MAX_UNICODE_CHAR)
1872         c = coding->default_char;
1873
1874       if (c < 0x10000)
1875         {
1876           if (big_endian)
1877             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1878           else
1879             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1880         }
1881       else
1882         {
1883           int c1, c2;
1884
1885           c -= 0x10000;
1886           c1 = (c >> 10) + 0xD800;
1887           c2 = (c & 0x3FF) + 0xDC00;
1888           if (big_endian)
1889             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1890           else
1891             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1892         }
1893     }
1894   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1895   coding->produced = dst - coding->destination;
1896   coding->produced_char += produced_chars;
1897   return 0;
1898 }
1899
1900 \f
1901 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1902
1903 /* Emacs' internal format for representation of multiple character
1904    sets is a kind of multi-byte encoding, i.e. characters are
1905    represented by variable-length sequences of one-byte codes.
1906
1907    ASCII characters and control characters (e.g. `tab', `newline') are
1908    represented by one-byte sequences which are their ASCII codes, in
1909    the range 0x00 through 0x7F.
1910
1911    8-bit characters of the range 0x80..0x9F are represented by
1912    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1913    code + 0x20).
1914
1915    8-bit characters of the range 0xA0..0xFF are represented by
1916    one-byte sequences which are their 8-bit code.
1917
1918    The other characters are represented by a sequence of `base
1919    leading-code', optional `extended leading-code', and one or two
1920    `position-code's.  The length of the sequence is determined by the
1921    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1922    whereas extended leading-code and position-code take the range 0xA0
1923    through 0xFF.  See `charset.h' for more details about leading-code
1924    and position-code.
1925
1926    --- CODE RANGE of Emacs' internal format ---
1927    character set        range
1928    -------------        -----
1929    ascii                0x00..0x7F
1930    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1931    eight-bit-graphic    0xA0..0xFF
1932    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1933    ---------------------------------------------
1934
1935    As this is the internal character representation, the format is
1936    usually not used externally (i.e. in a file or in a data sent to a
1937    process).  But, it is possible to have a text externally in this
1938    format (i.e. by encoding by the coding system `emacs-mule').
1939
1940    In that case, a sequence of one-byte codes has a slightly different
1941    form.
1942
1943    At first, all characters in eight-bit-control are represented by
1944    one-byte sequences which are their 8-bit code.
1945
1946    Next, character composition data are represented by the byte
1947    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1948    where,
1949         METHOD is 0xF2 plus one of composition method (enum
1950         composition_method),
1951
1952         BYTES is 0xA0 plus a byte length of this composition data,
1953
1954         CHARS is 0xA0 plus a number of characters composed by this
1955         data,
1956
1957         COMPONENTs are characters in multibyte form or composition
1958         rules encoded as two bytes of ASCII codes.
1959
1960    In addition, for backward compatibility, the following formats are
1961    also recognized as composition data on decoding.
1962
1963    0x80 MSEQ ...
1964    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1965
1966    Here,
1967         MSEQ is a multibyte form, but in one of these special formats:
1968           ASCII: 0xA0 ASCII_CODE+0x80,
1969           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1970         RULE is a one byte code of the range 0xA0..0xF0 that
1971         represents a composition rule.
1972   */
1973
/* Table indexed by a byte value.  The code below reads entry C as the
   total length in bytes (1 to 4) of the emacs-mule sequence whose
   leading byte is C.  NOTE(review): the table is only defined here;
   presumably it is filled in elsewhere -- confirm.  */
1974 char emacs_mule_bytes[256];
1975
1976
1977 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1978    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1979    else return 0.

        The scan starts after the first CODING->head_ascii bytes of the
        source.  The category is rejected, and 0 returned, when:
          - ESC, SI or SO is met;
          - at most 4 bytes were read after a 0x80 composition starter
            before a byte below 0xA0 ended the run;
          - a leading byte is not followed by as many bytes >= 0xA0 as
            emacs_mule_bytes calls for.
        When control arrives at `no_more_source', the category is also
        rejected if the current unit was left unfinished in the last
        block; otherwise what was found is recorded in
        DETECT_INFO->found and 1 is returned.  NOTE(review): that label
        is presumably the target of a jump inside ONE_MORE_BYTE taken
        when the source runs out -- confirm.  */
1980
1981 static int
1982 detect_coding_emacs_mule (coding, detect_info)
1983      struct coding_system *coding;
1984      struct coding_detection_info *detect_info;
1985 {
1986   const unsigned char *src = coding->source, *src_base;
1987   const unsigned char *src_end = coding->source + coding->src_bytes;
1988   int multibytep = coding->src_multibyte;
1989   int consumed_chars = 0;
1990   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once a complete composition
          run or a complete sequence of a byte >= 0x80 has been seen.  */
1991   int found = 0;
1992
1993   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1994   /* A coding system of this category is always ASCII compatible.  */
1995   src += coding->head_ascii;
1996
1997   while (1)
1998     {
           /* Remember where this unit starts; the test at
              `no_more_source' compares it with SRC.  */
1999       src_base = src;
2000       ONE_MORE_BYTE (c);
           /* NOTE(review): a negative C presumably stands for a
              character met in a multibyte source -- confirm.  It is
              skipped.  */
2001       if (c < 0)
2002         continue;
2003       if (c == 0x80)
2004         {
2005           /* Perhaps the start of composite character.  We simply skip
2006              it because analyzing it is too heavy for detecting.  But,
2007              at least, we check that the composite character
2008              consists of more than 4 bytes.  */
               /* This declaration shadows the function-level SRC_BASE,
                  so the byte count below is local to one composition
                  run and the outer checkpoint stays untouched.  */
2009           const unsigned char *src_base;
2010
2011         repeat:
2012           src_base = src;
               /* Skip bytes until one below 0xA0 (or a negative value)
                  is read; that byte stays in C.  */
2013           do
2014             {
2015               ONE_MORE_BYTE (c);
2016             }
2017           while (c >= 0xA0);
2018
               /* The count includes the byte that ended the run.  */
2019           if (src - src_base <= 4)
2020             break;
2021           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition follows immediately.  */
2022           if (c == 0x80)
2023             goto repeat;
2024         }
2025
2026       if (c < 0x80)
2027         {
               /* These control codes make the text not emacs-mule.  */
2028           if (c < 0x20
2029               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2030             break;
2031         }
2032       else
2033         {
               /* C leads a sequence; check that the trailing bytes
                  called for by the table are all >= 0xA0.  */
2034           int more_bytes = emacs_mule_bytes[c] - 1;
2035
2036           while (more_bytes > 0)
2037             {
2038               ONE_MORE_BYTE (c);
2039               if (c < 0xA0)
2040                 {
2041                   src--;        /* Unread the last byte.  */
2042                   break;
2043                 }
2044               more_bytes--;
2045             }
               /* Nonzero here means the sequence was cut short.  */
2046           if (more_bytes != 0)
2047             break;
2048           found = CATEGORY_MASK_EMACS_MULE;
2049         }
2050     }
       /* Every `break' out of the loop above is a rejection.  */
2051   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2052   return 0;
2053
2054  no_more_source:
       /* SRC_BASE < SRC means bytes of the current unit had already
          been read when the source ran out.  */
2055   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2056     {
2057       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2058       return 0;
2059     }
2060   detect_info->found |= found;
2061   return 1;
2062 }
2063
2064
2065 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2066    character.  If CMP_STATUS indicates that we must expect MSEQ or
2067    RULE described above, decode it and return the negative value of
2068    the decoded character or rule.  If an invalid byte is found, return
2069    -1.  If SRC is too short, return -2.

        On success, store in *NBYTES the number of bytes read from SRC
        and in *NCHARS the value of the local counter consumed_chars.
        When a character was decoded (not a RULE byte) and ID is
        non-null, also store the id of its charset in *ID.  Nothing is
        stored when -1 or -2 is returned.

        NOTE(review): the -2 return is presumably reached through a jump
        to `no_more_source' inside ONE_MORE_BYTE -- confirm.  */
2070
2071 int
2072 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
2073      struct coding_system *coding;
2074      const unsigned char *src;
2075      int *nbytes, *nchars, *id;
2076      struct composition_status *cmp_status;
2077 {
2078   const unsigned char *src_end = coding->source + coding->src_bytes;
2079   const unsigned char *src_base = src;
2080   int multibytep = coding->src_multibyte;
2081   struct charset *charset;
2082   unsigned code;
2083   int c;
2084   int consumed_chars = 0;
       /* Set when the sequence was an old-form MSEQ; the result is
          then returned negated.  */
2085   int mseq_found = 0;
2086
2087   ONE_MORE_BYTE (c);
2088   if (c < 0)
2089     {
           /* NOTE(review): a negative value presumably stands for a
              character met in a multibyte source -- confirm.  Its
              negation is used as the character.  */
2090       c = -c;
2091       charset = emacs_mule_charset[0];
2092     }
2093   else
2094     {
2095       if (c >= 0xA0)
2096         {
               /* A byte >= 0xA0 may start a unit only inside an
                  old-form composition.  */
2097           if (cmp_status->state != COMPOSING_NO
2098               && cmp_status->old_form)
2099             {
2100               if (cmp_status->state == COMPOSING_CHAR)
2101                 {
                       /* MSEQ: map the first byte back to the byte
                          that leads an ordinary sequence.  */
2102                   if (c == 0xA0)
2103                     {
                           /* ASCII is written as 0xA0 ASCII_CODE+0x80.  */
2104                       ONE_MORE_BYTE (c);
2105                       c -= 0x80;
2106                       if (c < 0)
2107                         goto invalid_code;
2108                     }
2109                   else
                         /* Others are written as LEADING_CODE+0x20.  */
2110                     c -= 0x20;
2111                   mseq_found = 1;
2112                 }
2113               else
2114                 {
                       /* Any other composing state: hand the byte back
                          negated (a RULE byte) without decoding.  */
2115                   *nbytes = src - src_base;
2116                   *nchars = consumed_chars;
2117                   return -c;
2118                 }
2119             }
2120           else
2121             goto invalid_code;
2122         }
2123
           /* Dispatch on the total length of the sequence led by C.  */
2124       switch (emacs_mule_bytes[c])
2125         {
2126         case 2:
               /* Leading code, then one position code.  */
2127           if (! (charset = emacs_mule_charset[c]))
2128             goto invalid_code;
2129           ONE_MORE_BYTE (c);
2130           if (c < 0xA0)
2131             goto invalid_code;
2132           code = c & 0x7F;
2133           break;
2134
2135         case 3:
2136           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2137               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2138             {
                   /* The second byte selects the charset; one position
                      code follows.  */
2139               ONE_MORE_BYTE (c);
2140               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
2141                 goto invalid_code;
2142               ONE_MORE_BYTE (c);
2143               if (c < 0xA0)
2144                 goto invalid_code;
2145               code = c & 0x7F;
2146             }
2147           else
2148             {
                   /* Leading code, then two position codes.  */
2149               if (! (charset = emacs_mule_charset[c]))
2150                 goto invalid_code;
2151               ONE_MORE_BYTE (c);
2152               if (c < 0xA0)
2153                 goto invalid_code;
2154               code = (c & 0x7F) << 8;
2155               ONE_MORE_BYTE (c);
2156               if (c < 0xA0)
2157                 goto invalid_code;
2158               code |= c & 0x7F;
2159             }
2160           break;
2161
2162         case 4:
               /* The second byte selects the charset; two position
                  codes follow.  NOTE(review): the second byte is only
                  tested for being non-negative here, while the
                  3-byte private case above requires >= 0xA0 --
                  confirm that this is intended.  */
2163           ONE_MORE_BYTE (c);
2164           if (c < 0 || ! (charset = emacs_mule_charset[c]))
2165             goto invalid_code;
2166           ONE_MORE_BYTE (c);
2167           if (c < 0xA0)
2168             goto invalid_code;
2169           code = (c & 0x7F) << 8;
2170           ONE_MORE_BYTE (c);
2171           if (c < 0xA0)
2172             goto invalid_code;
2173           code |= c & 0x7F;
2174           break;
2175
2176         case 1:
               /* A byte that stands for itself.  */
2177           code = c;
2178           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2179                                      ? charset_ascii : charset_eight_bit);
2180           break;
2181
2182         default:
               /* Any table value other than 1..4 aborts.  */
2183           abort ();
2184         }
2185       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, code, c);
2186       if (c < 0)
2187         goto invalid_code;
2188     }
2189   *nbytes = src - src_base;
2190   *nchars = consumed_chars;
2191   if (id)
2192     *id = charset->id;
2193   return (mseq_found ? -c : c);
2194
2195  no_more_source:
2196   return -2;
2197
2198  invalid_code:
2199   return -1;
2200 }
2201
2202
2203 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2204
2205 /* Handle these composition sequences ('|': the end of header elements,
2206    BYTES and CHARS >= 0xA0):
2207
2208    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2209    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2210    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2211
2212    and these old forms:
2213
2214    (4) relative composition: 0x80 | MSEQ ... MSEQ
2215    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2216
2217    When the starter 0x80 and the following header elements are found,
2218    this annotation header is produced.
2219
2220         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2221
2222    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2223    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2224
2225    Then, upon reading the following elements, these codes are produced
2226    until the composition end is found:
2227
2228    (1) CHAR ... CHAR
2229    (2) ALT ... ALT CHAR ... CHAR
2230    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2231    (4) CHAR ... CHAR
2232    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2233
2234    When the composition end is found, LENGTH and NCHARS in the
2235    annotation header are updated as below:
2236
2237    (1) LENGTH: unchanged, NCHARS: unchanged
2238    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2239    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2240    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2241    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2242
2243    If an error is found while composing, the annotation header is
2244    changed to the original composition header (plus filler -1s) as
2245    below:
2246
2247    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2248    (5)          [ 0x80 0xFF -1 -1 -1 ]
2249
2250    and the sequence [ -2 DECODED-RULE ] is changed to the original
2251    byte sequence as below:
2252         o the original byte sequence is B: [ B -1 ]
2253         o the original byte sequence is B1 B2: [ B1 B2 ]
2254
2255    Most of the routines are implemented by macros because many
2256    variables and labels in the caller decode_coding_emacs_mule must be
2257    accessible, and they are usually called just once (thus they don't
2258    increase the size of the compiled object).  */
2259
2260 /* Decode a composition rule represented by C as a component of
2261    composition sequence of Emacs 20 style.  Set RULE to the decoded
2262    rule.

        C is modified in place: 0xA0 is subtracted from it, and the
        result must lie in 0..80, otherwise control jumps to the label
        `invalid_code' of the function using this macro.  The value is
        split into GREF = C / 9 and NREF = C % 9; a reference point of 4
        is replaced by 10 before the pair is given to
        COMPOSITION_ENCODE_RULE.  */
2263
2264 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2265   do {                                                  \
2266     int gref, nref;                                     \
2267                                                         \
2268     c -= 0xA0;                                          \
2269     if (c < 0 || c >= 81)                               \
2270       goto invalid_code;                                \
2271     gref = c / 9, nref = c % 9;                         \
2272     if (gref == 4) gref = 10;                           \
2273     if (nref == 4) nref = 10;                           \
2274     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2275   } while (0)
2276
2277
2278 /* Decode a composition rule represented by C and the following byte
2279    at SRC as a component of composition sequence of Emacs 21 style.
2280    Set RULE to the decoded rule.

        GREF is C - 0x20.  The next byte is then read into C with
        ONE_MORE_BYTE, and NREF is that byte - 0x20.  Each of the two
        must lie in 0..80, otherwise control jumps to the label
        `invalid_code' of the function using this macro.  On return C
        holds the second byte.  */
2281
2282 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2283   do {                                                  \
2284     int gref, nref;                                     \
2285                                                         \
2286     gref = c - 0x20;                                    \
2287     if (gref < 0 || gref >= 81)                         \
2288       goto invalid_code;                                \
2289     ONE_MORE_BYTE (c);                                  \
2290     nref = c - 0x20;                                    \
2291     if (nref < 0 || nref >= 81)                         \
2292       goto invalid_code;                                \
2293     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2294   } while (0)
2295
2296
2297 /* Start of Emacs 21 style format.  On entry C already holds the
2298    METHOD byte (0xF2 plus a composition method).  The next two bytes
2299    at SRC are BYTES and CHARS, each offset by 0xA0, where BYTES is the
2300    byte length of this composition information, CHARS is the number of
        characters composed by this composition.

        Control jumps to the label `invalid_code' of the function using
        this macro when the BYTES byte is negative, when BYTES - 0xA0 is
        less than 3 (or differs from 4 for a relative composition), or
        when CHARS - 0xA0 is not in 1..MAX_COMPOSITION_COMPONENTS - 1.

        Otherwise CMP_STATUS is set up for a new-form composition: the
        state becomes COMPOSING_CHAR for a relative composition and
        COMPOSING_COMPONENT_CHAR for the other methods, the number of
        components is set to BYTES - 0xA0 - 4, and the annotation header
        is added to CHARBUF with ADD_COMPOSITION_DATA.  The local
        charbuf_base is not used within this macro.  */
2301
2302 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2303   do {                                                                  \
2304     enum composition_method method = c - 0xF2;                          \
2305     int *charbuf_base = charbuf;                                        \
2306     int nbytes, nchars;                                                 \
2307                                                                         \
2308     ONE_MORE_BYTE (c);                                                  \
2309     if (c < 0)                                                          \
2310       goto invalid_code;                                                \
2311     nbytes = c - 0xA0;                                                  \
2312     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2313       goto invalid_code;                                                \
2314     ONE_MORE_BYTE (c);                                                  \
2315     nchars = c - 0xA0;                                                  \
2316     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2317       goto invalid_code;                                                \
2318     cmp_status->old_form = 0;                                           \
2319     cmp_status->method = method;                                        \
2320     if (method == COMPOSITION_RELATIVE)                                 \
2321       cmp_status->state = COMPOSING_CHAR;                               \
2322     else                                                                \
2323       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2324     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2325     cmp_status->nchars = nchars;                                        \
2326     cmp_status->ncomps = nbytes - 4;                                    \
2327     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2328   } while (0)
2329
2330
2331 /* Start of Emacs 20 style format for relative composition.

        Mark CMP_STATUS as an old-form composition of method
        COMPOSITION_RELATIVE in state COMPOSING_CHAR, with the character
        and component counts cleared, and add to CHARBUF an annotation
        header whose NCHARS and NBYTES are both 0.  No byte is read from
        SRC here.  */
2332
2333 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2334   do {                                                          \
2335     cmp_status->old_form = 1;                                   \
2336     cmp_status->method = COMPOSITION_RELATIVE;                  \
2337     cmp_status->state = COMPOSING_CHAR;                         \
2338     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2339     cmp_status->nchars = cmp_status->ncomps = 0;                \
2340     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2341   } while (0)
2342
2343
2344 /* Start of Emacs 20 style format for rule-base composition.

        Same as DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that
        the method recorded in CMP_STATUS, and in the annotation header
        added to CHARBUF, is COMPOSITION_WITH_RULE.  */
2345
2346 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2347   do {                                                          \
2348     cmp_status->old_form = 1;                                   \
2349     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2350     cmp_status->state = COMPOSING_CHAR;                         \
2351     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2352     cmp_status->nchars = cmp_status->ncomps = 0;                \
2353     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2354   } while (0)
2355
2356
/* Read the byte that follows a composition starter into C and begin
   the kind of composition it announces:

     0xF2 + a method from COMPOSITION_RELATIVE through
       COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header;
     0xA0..0xBF: Emacs 20 style relative composition.  The byte is
       itself the first component, so SRC is moved back to re-read it;
     0xFF: Emacs 20 style rule-base composition.

   A negative value or any other byte makes control jump to the label
   `invalid_code' of the function using this macro.

   NOTE(review): only SRC is moved back in the 0xA0..0xBF case; if
   ONE_MORE_BYTE also advances a character counter, that counter is
   not restored -- confirm.  */
2357 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2358   do {                                                  \
2359     const unsigned char *current_src = src;             \
2360                                                         \
2361     ONE_MORE_BYTE (c);                                  \
2362     if (c < 0)                                          \
2363       goto invalid_code;                                \
2364     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2365         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2366       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2367     else if (c < 0xA0)                                  \
2368       goto invalid_code;                                \
2369     else if (c < 0xC0)                                  \
2370       {                                                 \
2371         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2372         /* Re-read C as a composition component.  */    \
2373         src = current_src;                              \
2374       }                                                 \
2375     else if (c == 0xFF)                                 \
2376       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2377     else                                                \
2378       goto invalid_code;                                \
2379   } while (0)
2380
/* Mark the composition being decoded as complete.

   The composition annotation starts `cmp_status->length' elements
   before `charbuf'.  For an old-form composition the number of
   characters seen is stored in its slot 2.  For a new-form
   composition whose method is greater than COMPOSITION_RELATIVE,
   slot 0 is set to slot 2 minus the annotation length.  In every
   case the state returns to COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    int head_idx = - cmp_status->length;                                \
                                                                        \
    if (! cmp_status->old_form)                                         \
      {                                                                 \
        if (cmp_status->method > COMPOSITION_RELATIVE)                  \
          charbuf[head_idx]                                             \
            = charbuf[head_idx + 2] - cmp_status->length;               \
      }                                                                 \
    else                                                                \
      charbuf[head_idx + 2] = cmp_status->nchars;                       \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
2391
2392
2393 static int
2394 emacs_mule_finish_composition (charbuf, cmp_status)
2395      int *charbuf;
2396      struct composition_status *cmp_status;
2397 {
2398   int idx = - cmp_status->length;
2399   int new_chars;
2400
2401   if (cmp_status->old_form && cmp_status->nchars > 0)
2402     {
2403       charbuf[idx + 2] = cmp_status->nchars;
2404       new_chars = 0;
2405       if (cmp_status->method == COMPOSITION_WITH_RULE
2406           && cmp_status->state == COMPOSING_CHAR)
2407         {
2408           /* The last rule was invalid.  */
2409           int rule = charbuf[-1] + 0xA0;
2410
2411           charbuf[-2] = BYTE8_TO_CHAR (rule);
2412           charbuf[-1] = -1;
2413           new_chars = 1;
2414         }
2415     }
2416   else
2417     {
2418       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2419
2420       if (cmp_status->method == COMPOSITION_WITH_RULE)
2421         {
2422           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2423           charbuf[idx++] = -3;
2424           charbuf[idx++] = 0;
2425           new_chars = 1;
2426         }
2427       else
2428         {
2429           int nchars = charbuf[idx + 1] + 0xA0;
2430           int nbytes = charbuf[idx + 2] + 0xA0;
2431
2432           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2433           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2434           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2435           charbuf[idx++] = -1;
2436           new_chars = 4;
2437         }
2438     }
2439   cmp_status->state = COMPOSING_NO;
2440   return new_chars;
2441 }
2442
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and advance `char_offset' by the
   count that function returns.  Uses `cmp_status', `charbuf' and
   `char_offset' from the expanding scope.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state == COMPOSING_NO)                                \
      break;                                                              \
    char_offset += emacs_mule_finish_composition (charbuf, cmp_status);   \
  } while (0)
2448
2449 /* Decode the emacs-mule encoded bytes of CODING->source, starting
        at offset CODING->consumed, into the integer buffer
        CODING->charbuf.

        Besides characters, charset annotations (ADD_CHARSET_DATA) and
        composition annotations are written to the buffer.  A
        composition that is still open when decoding stops is finished
        here if CODING_MODE_LAST_BLOCK is set; otherwise its partial
        annotation is saved in the composition status' `carryover'
        array and put back at the start of the next call.  Input that
        cannot be decoded is emitted one byte at a time and counted in
        CODING->errors.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used have been updated.  */
2450 static void
2451 decode_coding_emacs_mule (coding)
2452      struct coding_system *coding;
2453 {
2454   const unsigned char *src = coding->source + coding->consumed;
2455   const unsigned char *src_end = coding->source + coding->src_bytes;
2456   const unsigned char *src_base;
2457   int *charbuf = coding->charbuf + coding->charbuf_used;
2458   /* We may produce two annotations (charset and composition) in one
2459      loop and one more charset annotation at the end.  */
2460   int *charbuf_end
2461     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2462   int consumed_chars = 0, consumed_chars_base;
2463   int multibytep = coding->src_multibyte;
2464   Lisp_Object attrs, charset_list;
2465   int char_offset = coding->produced_char;
2466   int last_offset = char_offset;
2467   int last_id = charset_ascii;
2468   int eol_crlf =
2469     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2470   int byte_after_cr = -1;
2471   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2472
2473   CODING_GET_INFO (coding, attrs, charset_list);
2474   /* A composition was left open by the previous call: copy its
          saved annotation from the carryover area back into CHARBUF.  */
2475   if (cmp_status->state != COMPOSING_NO)
2476     {
2477       int i;
2478
2479       for (i = 0; i < cmp_status->length; i++)
2480         *charbuf++ = cmp_status->carryover[i];
2481       coding->annotated = 1;
2482     }
2483
2484   while (1)
2485     {
2486       int c, id;
2487       /* Remember where this item starts; `retry', `invalid_code'
              and the final bookkeeping all go back to this point.  */
2488       src_base = src;
2489       consumed_chars_base = consumed_chars;
2490       /* Stop when the output buffer is nearly full.  A byte that
              was read ahead after a CR is given back by stepping
              SRC_BASE back by one.  */
2491       if (charbuf >= charbuf_end)
2492         {
2493           if (byte_after_cr >= 0)
2494             src_base--;
2495           break;
2496         }
2497       /* Use the byte read ahead after a CR, if there is one,
              before reading a new byte.  */
2498       if (byte_after_cr >= 0)
2499         c = byte_after_cr, byte_after_cr = -1;
2500       else
2501         ONE_MORE_BYTE (c);
2502       /* A negative C or the 0x80 composition lead byte closes any
              composition in progress.  A negative C is stored negated
              as one character (presumably a raw byte taken from a
              multibyte source -- confirm in ONE_MORE_BYTE); 0x80
              starts a new composition.  */
2503       if (c < 0 || c == 0x80)
2504         {
2505           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2506           if (c < 0)
2507             {
2508               *charbuf++ = -c;
2509               char_offset++;
2510             }
2511           else
2512             DECODE_EMACS_MULE_COMPOSITION_START ();
2513           continue;
2514         }
2515       /* ASCII byte.  With DOS end-of-line conversion the byte after
              a CR is read ahead into BYTE_AFTER_CR.  An ASCII byte
              closes an old-form composition; in a new-form one it is
              counted against NCOMPS when the state is at least
              COMPOSING_COMPONENT_CHAR.  */
2516       if (c < 0x80)
2517         {
2518           if (eol_crlf && c == '\r')
2519             ONE_MORE_BYTE (byte_after_cr);
2520           id = charset_ascii;
2521           if (cmp_status->state != COMPOSING_NO)
2522             {
2523               if (cmp_status->old_form)
2524                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2525               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2526                 cmp_status->ncomps--;
2527             }
2528         }
2529       else
2530         {
2531           int nchars, nbytes;
2532           /* emacs_mule_char can load a charset map from a file, which
2533              allocates a large structure and might cause buffer text
2534              to be relocated as result.  Thus, we need to remember the
2535              original pointer to buffer text, and fixup all related
2536              pointers after the call.  */
2537           const unsigned char *orig = coding->source;
2538           EMACS_INT offset;
2539
2540           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2541                                cmp_status);
2542           offset = coding->source - orig;
2543           if (offset)
2544             {
2545               src += offset;
2546               src_base += offset;
2547               src_end += offset;
2548             }
2549           if (c < 0)
2550             {
2551               if (c == -1)
2552                 goto invalid_code;
2553               if (c == -2)
2554                 break;
2555             }
2556           src = src_base + nbytes;
2557           consumed_chars = consumed_chars_base + nchars;
2558           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2559             cmp_status->ncomps -= nchars;
2560         }
2561
2562       /* Now if C >= 0, we found a normally encoded character, if C <
2563          0, we found an old-style composition component character or
2564          rule.  */
2565
2566       if (cmp_status->state == COMPOSING_NO)
2567         {
2568           if (last_id != id)
2569             {
2570               if (last_id != charset_ascii)
2571                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2572                                   last_id);
2573               last_id = id;
2574               last_offset = char_offset;
2575             }
2576           *charbuf++ = c;
2577           char_offset++;
2578         }
2579       else if (cmp_status->state == COMPOSING_CHAR)
2580         {
2581           if (cmp_status->old_form)
2582             {
2583               if (c >= 0)
2584                 {
2585                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2586                   *charbuf++ = c;
2587                   char_offset++;
2588                 }
2589               else
2590                 {
2591                   *charbuf++ = -c;
2592                   cmp_status->nchars++;
2593                   cmp_status->length++;
2594                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2595                     EMACS_MULE_COMPOSITION_END ();
2596                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2597                     cmp_status->state = COMPOSING_RULE;
2598                 }
2599             }
2600           else
2601             {
2602               *charbuf++ = c;
2603               cmp_status->length++;
2604               cmp_status->nchars--;
2605               if (cmp_status->nchars == 0)
2606                 EMACS_MULE_COMPOSITION_END ();
2607             }
2608         }
2609       else if (cmp_status->state == COMPOSING_RULE)
2610         {
2611           int rule;
2612
2613           if (c >= 0)
2614             {
2615               EMACS_MULE_COMPOSITION_END ();
2616               *charbuf++ = c;
2617               char_offset++;
2618             }
2619           else
2620             {
2621               c = -c;
2622               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2623               if (rule < 0)
2624                 goto invalid_code;
2625               *charbuf++ = -2;
2626               *charbuf++ = rule;
2627               cmp_status->length += 2;
2628               cmp_status->state = COMPOSING_CHAR;
2629             }
2630         }
2631       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2632         {
2633           *charbuf++ = c;
2634           cmp_status->length++;
2635           if (cmp_status->ncomps == 0)
2636             cmp_status->state = COMPOSING_CHAR;
2637           else if (cmp_status->ncomps > 0)
2638             {
2639               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2640                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2641             }
2642           else
2643             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2644         }
2645       else                      /* COMPOSING_COMPONENT_RULE */
2646         {
2647           int rule;
2648
2649           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2650           if (rule < 0)
2651             goto invalid_code;
2652           *charbuf++ = -2;
2653           *charbuf++ = rule;
2654           cmp_status->length += 2;
2655           cmp_status->ncomps--;
2656           if (cmp_status->ncomps > 0)
2657             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2658           else
2659             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2660         }
2661       continue;
2662       /* Start over at the beginning of the current item.  No
              `goto retry' appears in this function's own text;
              presumably a macro defined elsewhere jumps here.  */
2663     retry:
2664       src = src_base;
2665       consumed_chars = consumed_chars_base;
2666       continue;
2667       /* Give up on the current item: close any composition, then
              emit only its first byte (ASCII as is, anything else as a
              BYTE8 character) and count one error.  Decoding resumes
              at the byte after it.  */
2668     invalid_code:
2669       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2670       src = src_base;
2671       consumed_chars = consumed_chars_base;
2672       ONE_MORE_BYTE (c);
2673       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2674       char_offset++;
2675       coding->errors++;
2676     }
2677  /* Reached when the loop is left with `break'; presumably also the
         target of a jump inside ONE_MORE_BYTE when the source is
         exhausted -- TODO confirm.  */
2678  no_more_source:
2679   if (cmp_status->state != COMPOSING_NO)
2680     {
2681       if (coding->mode & CODING_MODE_LAST_BLOCK)
2682         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2683       else
2684         {
2685           int i;
2686
2687           charbuf -= cmp_status->length;
2688           for (i = 0; i < cmp_status->length; i++)
2689             cmp_status->carryover[i] = charbuf[i];
2690         }
2691     }
2692   if (last_id != charset_ascii)
2693     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2694   coding->consumed_char += consumed_chars_base;
2695   coding->consumed = src_base - coding->source;
2696   coding->charbuf_used = charbuf - coding->charbuf;
2697 }
2698
2699
/* Store in CODES (an array of at least two bytes) the leading code(s)
   that introduce a character of the charset whose emacs-mule id is ID.

   An ID below 0xA0 is its own single leading code; CODES[1] is then
   set to 0, which callers test to know that there is no second code.
   Any other ID is stored in CODES[1] and preceded in CODES[0] by a
   prefix byte chosen by range: 0x9A for 0xA0..0xDF, 0x9B for
   0xE0..0xEF, 0x9C for 0xF0..0xF4 and 0x9D for 0xF5 and above.

   ID is evaluated several times, so it must be free of side effects.
   Both arguments are parenthesized in the expansion, and the
   expansion has no trailing semicolon: the caller supplies the `;',
   which lets the macro serve as the body of an unbraced `if' that is
   followed by `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2713
2714 /* Encode the characters and annotations held in CODING->charbuf
        into emacs-mule format, appending the bytes to
        CODING->destination.

        A negative buffer element starts an annotation.  A charset
        annotation selects a preferred charset, which is ignored unless
        it is a member of the charset list; a composition annotation is
        skipped.  ASCII characters and raw-byte (BYTE8) characters are
        emitted as single bytes.  Any other character is emitted as the
        leading code(s) of its charset followed by one code byte (for a
        charset of dimension 1) or two code bytes, each with the high
        bit set.  A character found in no charset of the list is
        replaced by CODING->default_char.

        Returns 0 on the path visible here, after recording
        CODING_RESULT_SUCCESS and updating CODING->produced and
        CODING->produced_char.

        NOTE(review): the charset looked up for default_char is used
        without a NULL check -- confirm that default_char always
        belongs to a charset of the list.  */
2715 static int
2716 encode_coding_emacs_mule (coding)
2717      struct coding_system *coding;
2718 {
2719   int multibytep = coding->dst_multibyte;
2720   int *charbuf = coding->charbuf;
2721   int *charbuf_end = charbuf + coding->charbuf_used;
2722   unsigned char *dst = coding->destination + coding->produced;
2723   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2724   int safe_room = 8;
2725   int produced_chars = 0;
2726   Lisp_Object attrs, charset_list;
2727   int c;
2728   int preferred_charset_id = -1;
2729   /* Always encode against Vemacs_mule_charset_list; store it in the
          attributes if a different list is found there.  */
2730   CODING_GET_INFO (coding, attrs, charset_list);
2731   if (! EQ (charset_list, Vemacs_mule_charset_list))
2732     {
2733       CODING_ATTR_CHARSET_LIST (attrs)
2734         = charset_list = Vemacs_mule_charset_list;
2735     }
2736   /* ASSURE_DESTINATION (defined elsewhere) is given SAFE_ROOM before
          every buffer element is processed.  */
2737   while (charbuf < charbuf_end)
2738     {
2739       ASSURE_DESTINATION (safe_room);
2740       c = *charbuf++;
2741
2742       if (c < 0)
2743         {
2744           /* Handle an annotation.  -C is the number of elements the
                  annotation occupies, counting the one just read, so
                  the `charbuf += -c - 1' below skips the rest of it.
                  NOTE(review): CHARBUF[3] is read relative to the
                  pointer that has already been advanced past the
                  length element; for an annotation of four elements
                  that is the element after the annotation.  Confirm
                  against ADD_CHARSET_DATA whether the charset id is at
                  CHARBUF[2] here.  */
2745           switch (*charbuf)
2746             {
2747             case CODING_ANNOTATE_COMPOSITION_MASK:
2748               /* Not yet implemented.  */
2749               break;
2750             case CODING_ANNOTATE_CHARSET_MASK:
2751               preferred_charset_id = charbuf[3];
2752               if (preferred_charset_id >= 0
2753                   && NILP (Fmemq (make_number (preferred_charset_id),
2754                                   charset_list)))
2755                 preferred_charset_id = -1;
2756               break;
2757             default:
2758               abort ();
2759             }
2760           charbuf += -c - 1;
2761           continue;
2762         }
2763       /* C is a character: emit its bytes.  */
2764       if (ASCII_CHAR_P (c))
2765         EMIT_ONE_ASCII_BYTE (c);
2766       else if (CHAR_BYTE8_P (c))
2767         {
2768           c = CHAR_TO_BYTE8 (c);
2769           EMIT_ONE_BYTE (c);
2770         }
2771       else
2772         {
2773           struct charset *charset;
2774           unsigned code;
2775           int dimension;
2776           int emacs_mule_id;
2777           unsigned char leading_codes[2];
2778
2779           if (preferred_charset_id >= 0)
2780             {
2781               charset = CHARSET_FROM_ID (preferred_charset_id);
2782               if (CHAR_CHARSET_P (c, charset))
2783                 code = ENCODE_CHAR (charset, c);
2784               else
2785                 charset = char_charset (c, charset_list, &code);
2786             }
2787           else
2788             charset = char_charset (c, charset_list, &code);
2789           if (! charset)
2790             {
2791               c = coding->default_char;
2792               if (ASCII_CHAR_P (c))
2793                 {
2794                   EMIT_ONE_ASCII_BYTE (c);
2795                   continue;
2796                 }
2797               charset = char_charset (c, charset_list, &code);
2798             }
2799           dimension = CHARSET_DIMENSION (charset);
2800           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2801           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2802           EMIT_ONE_BYTE (leading_codes[0]);
2803           if (leading_codes[1])
2804             EMIT_ONE_BYTE (leading_codes[1]);
2805           if (dimension == 1)
2806             EMIT_ONE_BYTE (code | 0x80);
2807           else
2808             {
2809               code |= 0x8080;
2810               EMIT_ONE_BYTE (code >> 8);
2811               EMIT_ONE_BYTE (code & 0xFF);
2812             }
2813         }
2814     }
2815   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2816   coding->produced_char += produced_chars;
2817   coding->produced = dst - coding->destination;
2818   return 0;
2819 }
2820
2821 \f
2822 /*** 7. ISO2022 handlers ***/
2823
2824 /* The following note describes the coding system ISO2022 briefly.
2825    Since the intention of this note is to help understand the
2826    functions in this file, some parts are NOT ACCURATE or are OVERLY
2827    SIMPLIFIED.  For thorough understanding, please refer to the
2828    original document of ISO2022.  This is equivalent to the standard
2829    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2830
2831    ISO2022 provides many mechanisms to encode several character sets
2832    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2833    is encoded using bytes less than 128.  This may make the encoded
2834    text a little bit longer, but the text passes more easily through
2835    several types of gateway, some of which strip off the MSB (Most
2836    Significant Bit).
2837
2838    There are two kinds of character sets: control character sets and
2839    graphic character sets.  The former contain control characters such
2840    as `newline' and `escape' to provide control functions (control
2841    functions are also provided by escape sequences).  The latter
2842    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2843    two control character sets and many graphic character sets.
2844
2845    Graphic character sets are classified into one of the following
2846    four classes, according to the number of bytes (DIMENSION) and
2847    number of characters in one dimension (CHARS) of the set:
2848    - DIMENSION1_CHARS94
2849    - DIMENSION1_CHARS96
2850    - DIMENSION2_CHARS94
2851    - DIMENSION2_CHARS96
2852
2853    In addition, each character set is assigned an identification tag,
2854    unique for each set, called the "final character" (denoted as <F>
2855    hereafter).  The <F> of each character set is decided by ECMA(*)
2856    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2857    (0x30..0x3F are for private use only).
2858
2859    Note (*): ECMA = European Computer Manufacturers Association
2860
2861    Here are examples of graphic character sets [NAME(<F>)]:
2862         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2863         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2864         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2865         o DIMENSION2_CHARS96 -- none for the moment
2866
2867    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2868         C0 [0x00..0x1F] -- control character plane 0
2869         GL [0x20..0x7F] -- graphic character plane 0
2870         C1 [0x80..0x9F] -- control character plane 1
2871         GR [0xA0..0xFF] -- graphic character plane 1
2872
2873    A control character set is directly designated and invoked to C0 or
2874    C1 by an escape sequence.  The most common case is that:
2875    - ISO646's  control character set is designated/invoked to C0, and
2876    - ISO6429's control character set is designated/invoked to C1,
2877    and usually these designations/invocations are omitted in encoded
2878    text.  In a 7-bit environment, only C0 can be used, and a control
2879    character for C1 is encoded by an appropriate escape sequence to
2880    fit into the environment.  All control characters for C1 are
2881    defined to have corresponding escape sequences.
2882
2883    A graphic character set is at first designated to one of four
2884    graphic registers (G0 through G3), then these graphic registers are
2885    invoked to GL or GR.  These designations and invocations can be
2886    done independently.  The most common case is that G0 is invoked to
2887    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2888    these invocations and designations are omitted in encoded text.
2889    In a 7-bit environment, only GL can be used.
2890
2891    When a graphic character set of CHARS94 is invoked to GL, codes
2892    0x20 and 0x7F of the GL area work as control characters SPACE and
2893    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2894    be used.
2895
2896    There are two ways of invocation: locking-shift and single-shift.
2897    With locking-shift, the invocation lasts until the next different
2898    invocation, whereas with single-shift, the invocation affects the
2899    following character only and doesn't affect the locking-shift
2900    state.  Invocations are done by the following control characters or
2901    escape sequences:
2902
2903    ----------------------------------------------------------------------
2904    abbrev  function                  cntrl escape seq   description
2905    ----------------------------------------------------------------------
2906    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2907    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2908    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2909    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2910    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2911    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2912    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2913    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2914    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2915    ----------------------------------------------------------------------
2916    (*) These are not used by any known coding system.
2917
2918    Control characters for these functions are defined by macros
2919    ISO_CODE_XXX in `coding.h'.
2920
2921    Designations are done by the following escape sequences:
2922    ----------------------------------------------------------------------
2923    escape sequence      description
2924    ----------------------------------------------------------------------
2925    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2926    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2927    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2928    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2929    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2930    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2931    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2932    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2933    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2934    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2935    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2936    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2937    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2938    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2939    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2940    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2941    ----------------------------------------------------------------------
2942
2943    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2944    of dimension 1, chars 94, and final character <F>, etc...
2945
2946    Note (*): Although these designations are not allowed in ISO2022,
2947    Emacs accepts them on decoding, and produces them on encoding
2948    CHARS96 character sets in a coding system which is characterized as
2949    7-bit environment, non-locking-shift, and non-single-shift.
2950
2951    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2952    '(' must be omitted.  We refer to this as "short-form" hereafter.
2953
2954    Now you may notice that there are a lot of ways of encoding the
2955    same multilingual text in ISO2022.  Actually, there exist many
2956    coding systems such as Compound Text (used in X11's inter client
2957    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2958    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2959    localized platforms), and all of these are variants of ISO2022.
2960
2961    In addition to the above, Emacs handles two more kinds of escape
2962    sequences: ISO6429's direction specification and Emacs' private
2963    sequence for specifying character composition.
2964
2965    ISO6429's direction specification takes the following form:
2966         o CSI ']'      -- end of the current direction
2967         o CSI '0' ']'  -- end of the current direction
2968         o CSI '1' ']'  -- start of left-to-right text
2969         o CSI '2' ']'  -- start of right-to-left text
2970    The control character CSI (0x9B: control sequence introducer) is
2971    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2972
2973    Character composition specification takes the following form:
2974         o ESC '0' -- start relative composition
2975         o ESC '1' -- end composition
2976         o ESC '2' -- start rule-base composition (*)
2977         o ESC '3' -- start relative composition with alternate chars  (**)
2978         o ESC '4' -- start rule-base composition with alternate chars  (**)
2979   Since these are not standard escape sequences of any ISO standard,
2980   the use of them with these meanings is restricted to Emacs only.
2981
2982   (*) This form is used only in Emacs 20.7 and older versions,
2983   but newer versions can safely decode it.
2984   (**) This form is used only in Emacs 21.1 and newer versions,
2985   and older versions can't decode it.
2986
2987   Here's a list of example usages of these composition escape
2988   sequences (categorized by `enum composition_method').
2989
2990   COMPOSITION_RELATIVE:
2991         ESC 0 CHAR [ CHAR ] ESC 1
2992   COMPOSITION_WITH_RULE:
2993         ESC 2 CHAR [ RULE CHAR ] ESC 1
2994   COMPOSITION_WITH_ALTCHARS:
2995         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2996   COMPOSITION_WITH_RULE_ALTCHARS:
2997         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2998 /* Table of 256 `enum iso_code_class_type' entries -- presumably one
        per byte value (TODO confirm).  No initializer is given here.  */
2999 enum iso_code_class_type iso_code_class[256];
3000
/* Return nonzero if the charset ID is safe for CODING, i.e. ID lies
   in 0..CODING->max_charset_id and its entry in CODING->safe_charsets
   is not 255 (the value setup_iso_safe_charsets fills the table with
   before assigning registers).

   A negative ID is never safe; it is rejected before the table is
   indexed, so an invalid charset id cannot cause a read in front of
   the table.  ID is evaluated more than once.  */

#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
3004
3005
/* Nonzero when CODING_ISO_INITIAL, applied with second argument 1 to
   the coding system stored for CATEGORY in `coding_categories',
   is not negative.  (The argument 1 presumably names graphic
   register G1 -- confirm against CODING_ISO_INITIAL.)  */
#define SHIFT_OUT_OK(category)                                          \
  (! (CODING_ISO_INITIAL (&coding_categories[category], 1) < 0))
3008
3009 static void
3010 setup_iso_safe_charsets (attrs)
3011      Lisp_Object attrs;
3012 {
3013   Lisp_Object charset_list, safe_charsets;
3014   Lisp_Object request;
3015   Lisp_Object reg_usage;
3016   Lisp_Object tail;
3017   int reg94, reg96;
3018   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
3019   int max_charset_id;
3020
3021   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3022   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
3023       && ! EQ (charset_list, Viso_2022_charset_list))
3024     {
3025       CODING_ATTR_CHARSET_LIST (attrs)
3026         = charset_list = Viso_2022_charset_list;
3027       ASET (attrs, coding_attr_safe_charsets, Qnil);
3028     }
3029
3030   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3031     return;
3032
3033   max_charset_id = 0;
3034   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3035     {
3036       int id = XINT (XCAR (tail));
3037       if (max_charset_id < id)
3038         max_charset_id = id;
3039     }
3040
3041   safe_charsets = make_uninit_string (max_charset_id + 1);
3042   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3043   request = AREF (attrs, coding_attr_iso_request);
3044   reg_usage = AREF (attrs, coding_attr_iso_usage);
3045   reg94 = XINT (XCAR (reg_usage));
3046   reg96 = XINT (XCDR (reg_usage));
3047
3048   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3049     {
3050       Lisp_Object id;
3051       Lisp_Object reg;
3052       struct charset *charset;
3053
3054       id = XCAR (tail);
3055       charset = CHARSET_FROM_ID (XINT (id));
3056       reg = Fcdr (Fassq (id, request));
3057       if (! NILP (reg))
3058         SSET (safe_charsets, XINT (id), XINT (reg));
3059       else if (charset->iso_chars_96)
3060         {
3061           if (reg96 < 4)
3062             SSET (safe_charsets, XINT (id), reg96);
3063         }
3064       else
3065         {
3066           if (reg94 < 4)
3067             SSET (safe_charsets, XINT (id), reg94);
3068         }
3069     }
3070   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3071 }
3072
3073
3074 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3075    Check if a text is encoded in one of ISO-2022 based coding systems.
3076    If it is, return 1, else return 0.  */
3077
3078 static int
3079 detect_coding_iso_2022 (coding, detect_info)
3080      struct coding_system *coding;
3081      struct coding_detection_info *detect_info;
3082 {
3083   const unsigned char *src = coding->source, *src_base = src;
3084   const unsigned char *src_end = coding->source + coding->src_bytes;
3085   int multibytep = coding->src_multibyte;
3086   int single_shifting = 0;
3087   int id;
3088   int c, c1;
3089   int consumed_chars = 0;
3090   int i;
3091   int rejected = 0;
3092   int found = 0;
3093   int composition_count = -1;
3094
3095   detect_info->checked |= CATEGORY_MASK_ISO;
3096
3097   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3098     {
3099       struct coding_system *this = &(coding_categories[i]);
3100       Lisp_Object attrs, val;
3101
3102       if (this->id < 0)
3103         continue;
3104       attrs = CODING_ID_ATTRS (this->id);
3105       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3106           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3107         setup_iso_safe_charsets (attrs);
3108       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3109       this->max_charset_id = SCHARS (val) - 1;
3110       this->safe_charsets = SDATA (val);
3111     }
3112
3113   /* A coding system of this category is always ASCII compatible.  */
3114   src += coding->head_ascii;
3115
3116   while (rejected != CATEGORY_MASK_ISO)
3117     {
3118       src_base = src;
3119       ONE_MORE_BYTE (c);
3120       switch (c)
3121         {
3122         case ISO_CODE_ESC:
3123           if (inhibit_iso_escape_detection)
3124             break;
3125           single_shifting = 0;
3126           ONE_MORE_BYTE (c);
3127           if (c >= '(' && c <= '/')
3128             {
3129               /* Designation sequence for a charset of dimension 1.  */
3130               ONE_MORE_BYTE (c1);
3131               if (c1 < ' ' || c1 >= 0x80
3132                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3133                 /* Invalid designation sequence.  Just ignore.  */
3134                 break;
3135             }
3136           else if (c == '$')
3137             {
3138               /* Designation sequence for a charset of dimension 2.  */
3139               ONE_MORE_BYTE (c);
3140               if (c >= '@' && c <= 'B')
3141                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3142                 id = iso_charset_table[1][0][c];
3143               else if (c >= '(' && c <= '/')
3144                 {
3145                   ONE_MORE_BYTE (c1);
3146                   if (c1 < ' ' || c1 >= 0x80
3147                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3148                     /* Invalid designation sequence.  Just ignore.  */
3149                     break;
3150                 }
3151               else
3152                 /* Invalid designation sequence.  Just ignore it.  */
3153                 break;
3154             }
3155           else if (c == 'N' || c == 'O')
3156             {
3157               /* ESC <Fe> for SS2 or SS3.  */
3158               single_shifting = 1;
3159               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3160               break;
3161             }
3162           else if (c == '1')
3163             {
3164               /* End of composition.  */
3165               if (composition_count < 0
3166                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3167                 /* Invalid */
3168                 break;
3169               composition_count = -1;
3170               found |= CATEGORY_MASK_ISO;
3171             }
3172           else if (c >= '0' && c <= '4')
3173             {
3174               /* ESC <Fp> for start/end composition.  */
3175               composition_count = 0;
3176               break;
3177             }
3178           else
3179             {
3180               /* Invalid escape sequence.  Just ignore it.  */
3181               break;
3182             }
3183
3184           /* We found a valid designation sequence for CHARSET.  */
3185           rejected |= CATEGORY_MASK_ISO_8BIT;
3186           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3187                               id))
3188             found |= CATEGORY_MASK_ISO_7;
3189           else
3190             rejected |= CATEGORY_MASK_ISO_7;
3191           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3192                               id))
3193             found |= CATEGORY_MASK_ISO_7_TIGHT;
3194           else
3195             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3196           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3197                               id))
3198             found |= CATEGORY_MASK_ISO_7_ELSE;
3199           else
3200             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3201           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3202                               id))
3203             found |= CATEGORY_MASK_ISO_8_ELSE;
3204           else
3205             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3206           break;
3207
3208         case ISO_CODE_SO:
3209         case ISO_CODE_SI:
3210           /* Locking shift out/in.  */
3211           if (inhibit_iso_escape_detection)
3212             break;
3213           single_shifting = 0;
3214           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3215           break;
3216
3217         case ISO_CODE_CSI:
3218           /* Control sequence introducer.  */
3219           single_shifting = 0;
3220           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3221           found |= CATEGORY_MASK_ISO_8_ELSE;
3222           goto check_extra_latin;
3223
3224         case ISO_CODE_SS2:
3225         case ISO_CODE_SS3:
3226           /* Single shift.   */
3227           if (inhibit_iso_escape_detection)
3228             break;
3229           single_shifting = 0;
3230           rejected |= CATEGORY_MASK_ISO_7BIT;
3231           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3232               & CODING_ISO_FLAG_SINGLE_SHIFT)
3233             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3234           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3235               & CODING_ISO_FLAG_SINGLE_SHIFT)
3236             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3237           if (single_shifting)
3238             break;
3239           goto check_extra_latin;
3240
3241         default:
3242           if (c < 0)
3243             continue;
3244           if (c < 0x80)
3245             {
3246               if (composition_count >= 0)
3247                 composition_count++;
3248               single_shifting = 0;
3249               break;
3250             }
3251           if (c >= 0xA0)
3252             {
3253               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3254               found |= CATEGORY_MASK_ISO_8_1;
3255               /* Check the length of succeeding codes of the range
3256                  0xA0..0FF.  If the byte length is even, we include
3257                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3258                  only when we are not single shifting.  */
3259               if (! single_shifting
3260                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3261                 {
3262                   int i = 1;
3263                   while (src < src_end)
3264                     {
3265                       src_base = src;
3266                       ONE_MORE_BYTE (c);
3267                       if (c < 0xA0)
3268                         {
3269                           src = src_base;
3270                           break;
3271                         }
3272                       i++;
3273                     }
3274
3275                   if (i & 1 && src < src_end)
3276                     {
3277                       rejected |= CATEGORY_MASK_ISO_8_2;
3278                       if (composition_count >= 0)
3279                         composition_count += i;
3280                     }
3281                   else
3282                     {
3283                       found |= CATEGORY_MASK_ISO_8_2;
3284                       if (composition_count >= 0)
3285                         composition_count += i / 2;
3286                     }
3287                 }
3288               break;
3289             }
3290         check_extra_latin:
3291           single_shifting = 0;
3292           if (! VECTORP (Vlatin_extra_code_table)
3293               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3294             {
3295               rejected = CATEGORY_MASK_ISO;
3296               break;
3297             }
3298           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3299               & CODING_ISO_FLAG_LATIN_EXTRA)
3300             found |= CATEGORY_MASK_ISO_8_1;
3301           else
3302             rejected |= CATEGORY_MASK_ISO_8_1;
3303           rejected |= CATEGORY_MASK_ISO_8_2;
3304         }
3305     }
3306   detect_info->rejected |= CATEGORY_MASK_ISO;
3307   return 0;
3308
3309  no_more_source:
3310   detect_info->rejected |= rejected;
3311   detect_info->found |= (found & ~rejected);
3312   return 1;
3313 }
3314
3315
/* Record in CODING (taken from the expansion site) that the charset
   selected by DIM, CHARS_96 and the final byte FINAL is designated to
   graphic register REG.

   The designation is refused when FINAL is outside '0'..127, when
   ISO_CHARSET_TABLE has no charset for it, or when the charset fails
   SAFE_CHARSET_P.  In that case REG is marked with -2 and CHARS_96 is
   set to -1.  CHARS_96 is also set to -1 when ASCII is designated to
   a register that was marked with -2 before.  A CHARS_96 of -1 tells
   the caller that the escape sequence should be kept.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int new_id = -1, old_id;                                            \
    int usable = 0;                                                     \
                                                                        \
    if (final >= '0' && final < 128)                                    \
      {                                                                 \
        new_id = ISO_CHARSET_TABLE (dim, chars_96, final);              \
        usable = new_id >= 0 && SAFE_CHARSET_P (coding, new_id);        \
      }                                                                 \
    if (! usable)                                                       \
      {                                                                 \
        /* Remember that REG received an invalid designation.  */       \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        old_id = CODING_ISO_DESIGNATION (coding, reg);                  \
        /* Substitute the charset when the coding system flags ask      \
           for it.  */                                                  \
        if (new_id == charset_jisx0201_roman)                           \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              new_id = charset_ascii;                                   \
          }                                                             \
        else if (new_id == charset_jisx0208_1978)                       \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              new_id = charset_jisx0208;                                \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = new_id;                  \
        /* ASCII designated right after an invalid designation to the   \
           same register: this sequence has to be kept as well.  */     \
        if (old_id == -2 && new_id == charset_ascii)                    \
          chars_96 = -1;                                                \
      }                                                                 \
  } while (0)
3348
3349
/* Handle these composition sequences (ALT: alternate char):
3351
3352    (1) relative composition: ESC 0 CHAR ... ESC 1
3353    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3354    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3355    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3356
3357    When the start sequence (ESC 0/2/3/4) is found, this annotation
3358    header is produced.
3359
3360         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3361
3362    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3363    produced until the end sequence (ESC 1) is found:
3364
3365    (1) CHAR ... CHAR
3366    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3367    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3368    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3369
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3372
3373    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3374    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3375    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3376    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3377
   If an error is found while composing, the five elements of the
   annotation header are changed to:

        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3382
3383    and the sequence [ -2 DECODED-RULE ] is changed to the original
3384    byte sequence as below:
3385         o the original byte sequence is B: [ B -1 ]
3386         o the original byte sequence is B1 B2: [ B1 B2 ]
3387    and the sequence [ -1 -1 ] is changed to the original byte
3388    sequence:
3389         [ ESC '0' ]
3390 */
3391
/* Turn the composition rule that starts with the byte held in C1
   (taken from the expansion site) into its encoded form and store it
   in RULE.

   When C1 - 32 is at least 81, the rule has the two-byte form: the
   second byte is fetched with ONE_MORE_BYTE, NBYTES is set to 2, and
   0x100 is added to a non-negative result.  When C1 - 32 is in
   0..80, the rule has the one-byte form and NBYTES is set to 1.

   RULE ends up negative when C1 is below 32 (NBYTES is then left
   untouched) or when COMPOSITION_ENCODE_RULE yields a negative
   value; callers treat a negative RULE as invalid.  */

#define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule >= 81)             /* two-byte form (after ver.21) */      \
      {                                                                 \
        int second;                                                     \
                                                                        \
        ONE_MORE_BYTE (second);                                         \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second - 32);        \
        if (rule >= 0)                                                  \
          rule += 0x100;   /* tells it apart from the one-byte form */  \
        nbytes = 2;                                                     \
      }                                                                 \
    else if (rule >= 0)         /* one-byte form (before ver.21) */     \
      {                                                                 \
        int gref = (rule) / 9;                                          \
        int nref = (rule) % 9;                                          \
                                                                        \
        /* Reference point 4 of this form is stored as 10.  */          \
        gref = (gref == 4 ? 10 : gref);                                 \
        nref = (nref == 4 ? 10 : nref);                                 \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
        nbytes = 1;                                                     \
      }                                                                 \
  } while (0)
3422
/* Replace the two charbuf elements at IDX and IDX + 1 with the byte
   form that the encoded composition rule RULE stands for, and add
   the number of bytes to NEW_CHARS.  `charbuf', `idx' and `new_chars'
   are taken from the expansion site.

   A RULE below 0x100 is in the old format: one byte
   32 + GREF * 9 + NREF is stored (reference point 10 being written
   back as 4), followed by -1, and NEW_CHARS grows by one.  Any other
   RULE is in the new format: the two bytes 32 + 81 + GREF and
   32 + NREF are stored and NEW_CHARS grows by two.

   RULE is evaluated exactly once, before anything is stored.  It may
   therefore be an arbitrary expression, and it may name the very
   charbuf element that is overwritten here.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int ecr_rule = (rule);                                      \
    int gref = (ecr_rule % 0x100) / 12;                         \
    int nref = (ecr_rule % 0x100) % 12;                         \
                                                                \
    if (ecr_rule < 0x100)       /* old format */                \
      {                                                         \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                        /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3442
3443 /* Finish the current composition as invalid.  */
3444
3445 static int finish_composition P_ ((int *, struct composition_status *));
3446
3447 static int
3448 finish_composition (charbuf, cmp_status)
3449      int *charbuf;
3450      struct composition_status *cmp_status;
3451 {
3452   int idx = - cmp_status->length;
3453   int new_chars;
3454
3455   /* Recover the original ESC sequence */
3456   charbuf[idx++] = ISO_CODE_ESC;
3457   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3458                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3459                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3460                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3461                     : '4');
3462   charbuf[idx++] = -2;
3463   charbuf[idx++] = 0;
3464   charbuf[idx++] = -1;
3465   new_chars = cmp_status->nchars;
3466   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3467     for (; idx < 0; idx++)
3468       {
3469         int elt = charbuf[idx];
3470
3471         if (elt == -2)
3472           {
3473             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3474             idx++;
3475           }
3476         else if (elt == -1)
3477           {
3478             charbuf[idx++] = ISO_CODE_ESC;
3479             charbuf[idx] = '0';
3480             new_chars += 2;
3481           }
3482       }
3483   cmp_status->state = COMPOSING_NO;
3484   return new_chars;
3485 }
3486
/* Do nothing unless a composition is in progress.  Otherwise finish
   it with finish_composition and add the value it returns to
   `char_offset'.  `cmp_status', `charbuf' and `char_offset' are
   taken from the expansion site.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3493
/* Handle the escape sequence ESC C1, where C1 is '0', '2', '3' or
   '4'.  `cmp_status', `charbuf' and `coding' are taken from the
   expansion site.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   ESC 0 found while the components of an altchar or alt&rule
   composition are being read ends the component part: two -1 are
   stored and the state becomes COMPOSING_CHAR.

   In every other case a composition possibly in progress is finished
   first, then a new one is started: the method and the state are set
   from C1, an annotation header is produced with
   ADD_COMPOSITION_DATA (NCHARS and the following element being 0),
   and the length recorded in `cmp_status' restarts at
   MAX_ANNOTATION_LENGTH.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int ends_components = 0;                                               \
                                                                           \
    if (c1 == '0')                                                         \
      {                                                                    \
        if (cmp_status->state == COMPOSING_COMPONENT_CHAR)                 \
          ends_components                                                  \
            = cmp_status->method == COMPOSITION_WITH_ALTCHARS;             \
        else if (cmp_status->state == COMPOSING_COMPONENT_RULE)            \
          ends_components                                                  \
            = cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS;        \
      }                                                                    \
    if (ends_components)                                                   \
      {                                                                    \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->length += 2;                                           \
        cmp_status->state = COMPOSING_CHAR;                                \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* This must come before the method is overwritten.  */            \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if (c1 == '0')                                                     \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if (c1 == '2')                                                \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if (c1 == '3')                                                \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        if (c1 <= '2')                                                     \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3534
3535
/* Handle the composition end sequence ESC 1.  `cmp_status',
   `charbuf' and `char_offset' are taken from the expansion site, and
   the expansion site must provide the label `invalid_code'.

   The sequence is rejected when no character has been composed yet,
   or when the state being COMPOSING_CHAR coincides with the method
   being COMPOSITION_WITH_RULE (both true or both false).  A
   composition in progress is then finished and control jumps to
   `invalid_code'.

   Otherwise the annotation header, found CMP_STATUS->length elements
   before CHARBUF, is completed: its first element is decreased by
   the number of components + 2 (altchar method) or by the number of
   components * 3 (alt&rule method), and its third element receives
   the number of composed characters.  That number is also added to
   `char_offset', and the state is reset to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int at_char = cmp_status->state == COMPOSING_CHAR;                  \
    int rule_based = cmp_status->method == COMPOSITION_WITH_RULE;       \
    int *header;                                                        \
                                                                        \
    if (cmp_status->nchars == 0 || at_char == rule_based)               \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    header = charbuf - cmp_status->length;                              \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      header[0] -= cmp_status->ncomps + 2;                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      header[0] -= cmp_status->ncomps * 3;                              \
    header[2] = cmp_status->nchars;                                     \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3555
/* Append the marker -2 followed by the composition rule RULE to
   charbuf.  The length recorded in `cmp_status' grows by two and its
   state is decremented by one.  `charbuf' and `cmp_status' are taken
   from the expansion site.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->state -= 1;             \
    cmp_status->length += 2;            \
  } while (0)
3565
/* Append C, a composed character or a component character, to
   charbuf and count it in `cmp_status': the length grows by one, and
   C is counted in `nchars' when the state is COMPOSING_CHAR, in
   `ncomps' otherwise.

   The state is then incremented by one when the method is
   COMPOSITION_WITH_RULE, or when the method is
   COMPOSITION_WITH_RULE_ALTCHARS and the state is
   COMPOSING_COMPONENT_CHAR.  `charbuf' and `cmp_status' are taken
   from the expansion site.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    charbuf[0] = (c);                                                   \
    charbuf += 1;                                                       \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state += 1;                                           \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state += 1;                                           \
  } while (0)
3582
3583
3584 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3585
3586 static void
3587 decode_coding_iso_2022 (coding)
3588      struct coding_system *coding;
3589 {
3590   const unsigned char *src = coding->source + coding->consumed;
3591   const unsigned char *src_end = coding->source + coding->src_bytes;
3592   const unsigned char *src_base;
3593   int *charbuf = coding->charbuf + coding->charbuf_used;
3594   /* We may produce two annocations (charset and composition) in one
3595      loop and one more charset annocation at the end.  */
3596   int *charbuf_end
3597     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3598   int consumed_chars = 0, consumed_chars_base;
3599   int multibytep = coding->src_multibyte;
3600   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3601   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3602   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3603   int charset_id_2, charset_id_3;
3604   struct charset *charset;
3605   int c;
3606   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3607   Lisp_Object attrs, charset_list;
3608   int char_offset = coding->produced_char;
3609   int last_offset = char_offset;
3610   int last_id = charset_ascii;
3611   int eol_crlf =
3612     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3613   int byte_after_cr = -1;
3614   int i;
3615
3616   CODING_GET_INFO (coding, attrs, charset_list);
3617   setup_iso_safe_charsets (attrs);
3618   /* Charset list may have been changed.  */
3619   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3620   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3621
3622   if (cmp_status->state != COMPOSING_NO)
3623     {
3624       for (i = 0; i < cmp_status->length; i++)
3625         *charbuf++ = cmp_status->carryover[i];
3626       coding->annotated = 1;
3627     }
3628
3629   while (1)
3630     {
3631       int c1, c2, c3;
3632
3633       src_base = src;
3634       consumed_chars_base = consumed_chars;
3635
3636       if (charbuf >= charbuf_end)
3637         {
3638           if (byte_after_cr >= 0)
3639             src_base--;
3640           break;
3641         }
3642
3643       if (byte_after_cr >= 0)
3644         c1 = byte_after_cr, byte_after_cr = -1;
3645       else
3646         ONE_MORE_BYTE (c1);
3647       if (c1 < 0)
3648         goto invalid_code;
3649
3650       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3651         {
3652           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3653           char_offset++;
3654           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3655           continue;
3656         }
3657
3658       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3659         {
3660           if (c1 == ISO_CODE_ESC)
3661             {
3662               if (src + 1 >= src_end)
3663                 goto no_more_source;
3664               *charbuf++ = ISO_CODE_ESC;
3665               char_offset++;
3666               if (src[0] == '%' && src[1] == '@')
3667                 {
3668                   src += 2;
3669                   consumed_chars += 2;
3670                   char_offset += 2;
3671                   /* We are sure charbuf can contain two more chars. */
3672                   *charbuf++ = '%';
3673                   *charbuf++ = '@';
3674                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3675                 }
3676             }
3677           else
3678             {
3679               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3680               char_offset++;
3681             }
3682           continue;
3683         }
3684
3685       if ((cmp_status->state == COMPOSING_RULE
3686            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3687           && c1 != ISO_CODE_ESC)
3688         {
3689           int rule, nbytes;
3690
3691           DECODE_COMPOSITION_RULE (rule, nbytes);
3692           if (rule < 0)
3693             goto invalid_code;
3694           STORE_COMPOSITION_RULE (rule);
3695           continue;
3696         }
3697
3698       /* We produce at most one character.  */
3699       switch (iso_code_class [c1])
3700         {
3701         case ISO_0x20_or_0x7F:
3702           if (charset_id_0 < 0
3703               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3704             /* This is SPACE or DEL.  */
3705             charset = CHARSET_FROM_ID (charset_ascii);
3706           else
3707             charset = CHARSET_FROM_ID (charset_id_0);
3708           break;
3709
3710         case ISO_graphic_plane_0:
3711           if (charset_id_0 < 0)
3712             charset = CHARSET_FROM_ID (charset_ascii);
3713           else
3714             charset = CHARSET_FROM_ID (charset_id_0);
3715           break;
3716
3717         case ISO_0xA0_or_0xFF:
3718           if (charset_id_1 < 0
3719               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3720               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3721             goto invalid_code;
3722           /* This is a graphic character, we fall down ... */
3723
3724         case ISO_graphic_plane_1:
3725           if (charset_id_1 < 0)
3726             goto invalid_code;
3727           charset = CHARSET_FROM_ID (charset_id_1);
3728           break;
3729
3730         case ISO_control_0:
3731           if (eol_crlf && c1 == '\r')
3732             ONE_MORE_BYTE (byte_after_cr);
3733           MAYBE_FINISH_COMPOSITION ();
3734           charset = CHARSET_FROM_ID (charset_ascii);
3735           break;
3736
3737         case ISO_control_1:
3738           goto invalid_code;
3739
3740         case ISO_shift_out:
3741           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3742               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3743             goto invalid_code;
3744           CODING_ISO_INVOCATION (coding, 0) = 1;
3745           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3746           continue;
3747
3748         case ISO_shift_in:
3749           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3750             goto invalid_code;
3751           CODING_ISO_INVOCATION (coding, 0) = 0;
3752           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3753           continue;
3754
3755         case ISO_single_shift_2_7:
3756           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3757             goto invalid_code;
3758         case ISO_single_shift_2:
3759           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3760             goto invalid_code;
3761           /* SS2 is handled as an escape sequence of ESC 'N' */
3762           c1 = 'N';
3763           goto label_escape_sequence;
3764
3765         case ISO_single_shift_3:
3766           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3767             goto invalid_code;
3768           /* SS2 is handled as an escape sequence of ESC 'O' */
3769           c1 = 'O';
3770           goto label_escape_sequence;
3771
3772         case ISO_control_sequence_introducer:
3773           /* CSI is handled as an escape sequence of ESC '[' ...  */
3774           c1 = '[';
3775           goto label_escape_sequence;
3776
3777         case ISO_escape:
3778           ONE_MORE_BYTE (c1);
3779         label_escape_sequence:
3780           /* Escape sequences handled here are invocation,
3781              designation, direction specification, and character
3782              composition specification.  */
3783           switch (c1)
3784             {
3785             case '&':           /* revision of following character set */
3786               ONE_MORE_BYTE (c1);
3787               if (!(c1 >= '@' && c1 <= '~'))
3788                 goto invalid_code;
3789               ONE_MORE_BYTE (c1);
3790               if (c1 != ISO_CODE_ESC)
3791                 goto invalid_code;
3792               ONE_MORE_BYTE (c1);
3793               goto label_escape_sequence;
3794
3795             case '$':           /* designation of 2-byte character set */
3796               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3797                 goto invalid_code;
3798               {
3799                 int reg, chars96;
3800
3801                 ONE_MORE_BYTE (c1);
3802                 if (c1 >= '@' && c1 <= 'B')
3803                   {     /* designation of JISX0208.1978, GB2312.1980,
3804                            or JISX0208.1980 */
3805                     reg = 0, chars96 = 0;
3806                   }
3807                 else if (c1 >= 0x28 && c1 <= 0x2B)
3808                   { /* designation of DIMENSION2_CHARS94 character set */
3809                     reg = c1 - 0x28, chars96 = 0;
3810                     ONE_MORE_BYTE (c1);
3811                   }
3812                 else if (c1 >= 0x2C && c1 <= 0x2F)
3813                   { /* designation of DIMENSION2_CHARS96 character set */
3814                     reg = c1 - 0x2C, chars96 = 1;
3815                     ONE_MORE_BYTE (c1);
3816                   }
3817                 else
3818                   goto invalid_code;
3819                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3820                 /* We must update these variables now.  */
3821                 if (reg == 0)
3822                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3823                 else if (reg == 1)
3824                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3825                 if (chars96 < 0)
3826                   goto invalid_code;
3827               }
3828               continue;
3829
3830             case 'n':           /* invocation of locking-shift-2 */
3831               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3832                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3833                 goto invalid_code;
3834               CODING_ISO_INVOCATION (coding, 0) = 2;
3835               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3836               continue;
3837
3838             case 'o':           /* invocation of locking-shift-3 */
3839               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3840                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3841                 goto invalid_code;
3842               CODING_ISO_INVOCATION (coding, 0) = 3;
3843               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3844               continue;
3845
3846             case 'N':           /* invocation of single-shift-2 */
3847               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3848                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3849                 goto invalid_code;
3850               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3851               if (charset_id_2 < 0)
3852                 charset = CHARSET_FROM_ID (charset_ascii);
3853               else
3854                 charset = CHARSET_FROM_ID (charset_id_2);
3855               ONE_MORE_BYTE (c1);
3856               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3857                 goto invalid_code;
3858               break;
3859
3860             case 'O':           /* invocation of single-shift-3 */
3861               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3862                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3863                 goto invalid_code;
3864               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3865               if (charset_id_3 < 0)
3866                 charset = CHARSET_FROM_ID (charset_ascii);
3867               else
3868                 charset = CHARSET_FROM_ID (charset_id_3);
3869               ONE_MORE_BYTE (c1);
3870               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3871                 goto invalid_code;
3872               break;
3873
3874             case '0': case '2': case '3': case '4': /* start composition */
3875               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3876                 goto invalid_code;
3877               if (last_id != charset_ascii)
3878                 {
3879                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3880                   last_id = charset_ascii;
3881                   last_offset = char_offset;
3882                 }
3883               DECODE_COMPOSITION_START (c1);
3884               continue;
3885
3886             case '1':           /* end composition */
3887               if (cmp_status->state == COMPOSING_NO)
3888                 goto invalid_code;
3889               DECODE_COMPOSITION_END ();
3890               continue;
3891
3892             case '[':           /* specification of direction */
3893               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3894                 goto invalid_code;
3895               /* For the moment, nested direction is not supported.
3896                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3897                  left-to-right, and nozero means right-to-left.  */
3898               ONE_MORE_BYTE (c1);
3899               switch (c1)
3900                 {
3901                 case ']':       /* end of the current direction */
3902                   coding->mode &= ~CODING_MODE_DIRECTION;
3903
3904                 case '0':       /* end of the current direction */
3905                 case '1':       /* start of left-to-right direction */
3906                   ONE_MORE_BYTE (c1);
3907                   if (c1 == ']')
3908                     coding->mode &= ~CODING_MODE_DIRECTION;
3909                   else
3910                     goto invalid_code;
3911                   break;
3912
3913                 case '2':       /* start of right-to-left direction */
3914                   ONE_MORE_BYTE (c1);
3915                   if (c1 == ']')
3916                     coding->mode |= CODING_MODE_DIRECTION;
3917                   else
3918                     goto invalid_code;
3919                   break;
3920
3921                 default:
3922                   goto invalid_code;
3923                 }
3924               continue;
3925
3926             case '%':
3927               ONE_MORE_BYTE (c1);
3928               if (c1 == '/')
3929                 {
3930                   /* CTEXT extended segment:
3931                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3932                      We keep these bytes as is for the moment.
3933                      They may be decoded by post-read-conversion.  */
3934                   int dim, M, L;
3935                   int size;
3936
3937                   ONE_MORE_BYTE (dim);
3938                   if (dim < '0' || dim > '4')
3939                     goto invalid_code;
3940                   ONE_MORE_BYTE (M);
3941                   if (M < 128)
3942                     goto invalid_code;
3943                   ONE_MORE_BYTE (L);
3944                   if (L < 128)
3945                     goto invalid_code;
3946                   size = ((M - 128) * 128) + (L - 128);
3947                   if (charbuf + 6 > charbuf_end)
3948                     goto break_loop;
3949                   *charbuf++ = ISO_CODE_ESC;
3950                   *charbuf++ = '%';
3951                   *charbuf++ = '/';
3952                   *charbuf++ = dim;
3953                   *charbuf++ = BYTE8_TO_CHAR (M);
3954                   *charbuf++ = BYTE8_TO_CHAR (L);
3955                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3956                 }
3957               else if (c1 == 'G')
3958                 {
3959                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3960                      ESC % G --UTF-8-BYTES-- ESC % @
3961                      We keep these bytes as is for the moment.
3962                      They may be decoded by post-read-conversion.  */
3963                   if (charbuf + 3 > charbuf_end)
3964                     goto break_loop;
3965                   *charbuf++ = ISO_CODE_ESC;
3966                   *charbuf++ = '%';
3967                   *charbuf++ = 'G';
3968                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3969                 }
3970               else
3971                 goto invalid_code;
3972               continue;
3973               break;
3974
3975             default:
3976               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3977                 goto invalid_code;
3978               {
3979                 int reg, chars96;
3980
3981                 if (c1 >= 0x28 && c1 <= 0x2B)
3982                   { /* designation of DIMENSION1_CHARS94 character set */
3983                     reg = c1 - 0x28, chars96 = 0;
3984                     ONE_MORE_BYTE (c1);
3985                   }
3986                 else if (c1 >= 0x2C && c1 <= 0x2F)
3987                   { /* designation of DIMENSION1_CHARS96 character set */
3988                     reg = c1 - 0x2C, chars96 = 1;
3989                     ONE_MORE_BYTE (c1);
3990                   }
3991                 else
3992                   goto invalid_code;
3993                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3994                 /* We must update these variables now.  */
3995                 if (reg == 0)
3996                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3997                 else if (reg == 1)
3998                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3999                 if (chars96 < 0)
4000                   goto invalid_code;
4001               }
4002               continue;
4003             }
4004         }
4005
4006       if (cmp_status->state == COMPOSING_NO
4007           && charset->id != charset_ascii
4008           && last_id != charset->id)
4009         {
4010           if (last_id != charset_ascii)
4011             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4012           last_id = charset->id;
4013           last_offset = char_offset;
4014         }
4015
4016       /* Now we know CHARSET and 1st position code C1 of a character.
4017          Produce a decoded character while getting 2nd and 3rd
4018          position codes C2, C3 if necessary.  */
4019       if (CHARSET_DIMENSION (charset) > 1)
4020         {
4021           ONE_MORE_BYTE (c2);
4022           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
4023               || ((c1 & 0x80) != (c2 & 0x80)))
4024             /* C2 is not in a valid range.  */
4025             goto invalid_code;
4026           if (CHARSET_DIMENSION (charset) == 2)
4027             c1 = (c1 << 8) | c2;
4028           else
4029             {
4030               ONE_MORE_BYTE (c3);
4031               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
4032                   || ((c1 & 0x80) != (c3 & 0x80)))
4033                 /* C3 is not in a valid range.  */
4034                 goto invalid_code;
4035               c1 = (c1 << 16) | (c2 << 8) | c2;
4036             }
4037         }
4038       c1 &= 0x7F7F7F;
4039       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4040       if (c < 0)
4041         {
4042           MAYBE_FINISH_COMPOSITION ();
4043           for (; src_base < src; src_base++, char_offset++)
4044             {
4045               if (ASCII_BYTE_P (*src_base))
4046                 *charbuf++ = *src_base;
4047               else
4048                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4049             }
4050         }
4051       else if (cmp_status->state == COMPOSING_NO)
4052         {
4053           *charbuf++ = c;
4054           char_offset++;
4055         }
4056       else if ((cmp_status->state == COMPOSING_CHAR
4057                 ? cmp_status->nchars
4058                 : cmp_status->ncomps)
4059                >= MAX_COMPOSITION_COMPONENTS)
4060         {
4061           /* Too long composition.  */
4062           MAYBE_FINISH_COMPOSITION ();
4063           *charbuf++ = c;
4064           char_offset++;
4065         }
4066       else
4067         STORE_COMPOSITION_CHAR (c);
4068       continue;
4069
4070     invalid_code:
4071       MAYBE_FINISH_COMPOSITION ();
4072       src = src_base;
4073       consumed_chars = consumed_chars_base;
4074       ONE_MORE_BYTE (c);
4075       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4076       char_offset++;
4077       coding->errors++;
4078       continue;
4079
4080     break_loop:
4081       break;
4082     }
4083
4084  no_more_source:
4085   if (cmp_status->state != COMPOSING_NO)
4086     {
4087       if (coding->mode & CODING_MODE_LAST_BLOCK)
4088         MAYBE_FINISH_COMPOSITION ();
4089       else
4090         {
4091           charbuf -= cmp_status->length;
4092           for (i = 0; i < cmp_status->length; i++)
4093             cmp_status->carryover[i] = charbuf[i];
4094         }
4095     }
4096   else if (last_id != charset_ascii)
4097     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4098   coding->consumed_char += consumed_chars_base;
4099   coding->consumed = src_base - coding->source;
4100   coding->charbuf_used = charbuf - coding->charbuf;
4101 }
4102
4103
4104 /* ISO2022 encoding stuff.  */
4105
4106 /*
4107    It is not enough to say just "ISO2022" on encoding, we have to
4108    specify more details.  In Emacs, each coding system of ISO2022
4109    variant has the following specifications:
4110         1. Initial designation to G0 thru G3.
4111         2. Allows short-form designation?
4112         3. ASCII should be designated to G0 before control characters?
4113         4. ASCII should be designated to G0 at end of line?
4114         5. 7-bit environment or 8-bit environment?
4115         6. Use locking-shift?
4116         7. Use Single-shift?
4117    And the following two are only for Japanese:
4118         8. Use ASCII in place of JIS0201-1976-Roman?
4119         9. Use JISX0208-1983 in place of JISX0208-1978?
4120    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4121    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4122    details.
4123 */
4124
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST, and record the designation in CODING.
   The bytes produced are

     [ESC '&' <'@' + revision>] ESC ['$'] [<intermediate>] <final-char>

   The revision prefix is produced only when CODING has
   CODING_ISO_FLAG_REVISION set and the revision of CHARSET is not
   negative.  '$' is produced when the dimension of CHARSET is not 1.
   <intermediate> is the REG'th byte of "()*+" for a 94-character set
   and of ",-./" for a 96-character set.  It is omitted (short form)
   only when CHARSET is a 94-character set of dimension other than 1,
   REG is 0, <final-char> is '@', 'A', or 'B', and CODING does not
   have CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    /* Intermediate bytes for registers 0..3, indexed by REG.  */       \
    const char *intermediate_chars                                      \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int multi_dimension = CHARSET_DIMENSION (charset) != 1;             \
    /* Nonzero if the intermediate byte is to be left out.  */          \
    int short_form                                                      \
      = (multi_dimension                                                \
         && ! CHARSET_ISO_CHARS_96 (charset)                            \
         && ! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)   \
         && (reg) == 0                                                  \
         && final_char >= '@' && final_char <= 'B');                    \
    int revision = -1;                                                  \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      revision = CHARSET_ISO_REVISION (charset);                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (multi_dimension)                                                \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
    if (! short_form)                                                   \
      EMIT_ONE_ASCII_BYTE (intermediate_chars[reg]);                    \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4172
4173
/* The following two macros emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3): the escape sequence when
   `coding' has CODING_ISO_FLAG_SEVEN_BITS set, the one-byte control
   character otherwise.  Each also sets CODING_ISO_SINGLE_SHIFTING
   of `coding' to 1.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    /* Mark that a single shift is pending.  */                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4186
4187
/* Single-shift-3: ESC O when `coding' has CODING_ISO_FLAG_SEVEN_BITS
   set, the single byte ISO_CODE_SS3 otherwise.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    /* Mark that a single shift is pending.  */                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4196
4197
/* The following four macros emit the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3) as a
   control character or an escape sequence, and record in `coding'
   which graphic register is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    /* Register 0 is invoked to plane 0.  */            \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4207
4208
/* Shift-out: emit ISO_CODE_SO and record that graphic register 1 is
   invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4214
4215
/* Locking-shift-2: emit ESC n and record that graphic register 2 is
   invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4221
4222
/* Locking-shift-3: emit ESC o and record that graphic register 3 is
   invoked to graphic plane 0.  The final byte must be 'o'; ESC n is
   locking-shift-2, which would make a decoder invoke register 2
   instead of the register recorded here.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4228
4229
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when `coding' has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the charset whose ID is charset_jisx0201_roman.  The macro
   relies on `coding', `dst' and `produced_chars' being in scope where
   it is used.  C1 is parenthesized at every use so that an expression
   argument is masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift is pending; this character consumes it.  */   \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4272
4273
/* Encode a character of CHARSET given by the two position codes C1
   and C2, first emitting any designation or invocation that is
   needed.  CHARSET must be a modifiable lvalue: when `coding' has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is the one whose ID is
   charset_jisx0208, it is replaced by the one whose ID is
   charset_jisx0208_1978.  `coding', `dst' and `produced_chars' must
   be in scope where the macro is used.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    for (;;)                                                            \
      {                                                                 \
        int id = CHARSET_ID (charset);                                  \
                                                                        \
        if (id == charset_jisx0208                                      \
            && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)) \
          {                                                             \
            id = charset_jisx0208_1978;                                 \
            charset = CHARSET_FROM_ID (id);                             \
          }                                                             \
                                                                        \
        if (CODING_ISO_SINGLE_SHIFTING (coding))                        \
          {                                                             \
            /* A single shift is pending; this character uses it.  */   \
            if (! (CODING_ISO_FLAGS (coding)                            \
                   & CODING_ISO_FLAG_SEVEN_BITS))                       \
              EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                \
            else                                                        \
              EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);          \
            CODING_ISO_SINGLE_SHIFTING (coding) = 0;                    \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
          {                                                             \
            /* CHARSET is invoked to graphic plane 0.  */               \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
          {                                                             \
            /* CHARSET is invoked to graphic plane 1.  */               \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
            break;                                                      \
          }                                                             \
                                                                        \
        /* CHARSET is not invoked to either graphic plane yet.  Emit    \
           the designation and/or invocation for it, then go round      \
           again to produce the character itself.  */                   \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
      }                                                                 \
  } while (0)
4316
4317
/* Encode character C of CHARSET.  The code point is obtained with
   ENCODE_CHAR.  A charset of dimension 1 is handed to
   ENCODE_ISO_CHARACTER_DIMENSION1 with that code; any other charset
   is handed to ENCODE_ISO_CHARACTER_DIMENSION2 with the code shifted
   right by 8 bits and the low 8 bits of the code.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4327
4328
4329 /* Produce designation and invocation codes at a place pointed by DST
4330    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4331    Return new DST.  */
4332
4333 unsigned char *
4334 encode_invocation_designation (charset, coding, dst, p_nchars)
4335      struct charset *charset;
4336      struct coding_system *coding;
4337      unsigned char *dst;
4338      int *p_nchars;
4339 {
4340   int multibytep = coding->dst_multibyte;
4341   int produced_chars = *p_nchars;
4342   int reg;                      /* graphic register number */
4343   int id = CHARSET_ID (charset);
4344
4345   /* At first, check designations.  */
4346   for (reg = 0; reg < 4; reg++)
4347     if (id == CODING_ISO_DESIGNATION (coding, reg))
4348       break;
4349
4350   if (reg >= 4)
4351     {
4352       /* CHARSET is not yet designated to any graphic registers.  */
4353       /* At first check the requested designation.  */
4354       reg = CODING_ISO_REQUEST (coding, id);
4355       if (reg < 0)
4356         /* Since CHARSET requests no special designation, designate it
4357            to graphic register 0.  */
4358         reg = 0;
4359
4360       ENCODE_DESIGNATION (charset, reg, coding);
4361     }
4362
4363   if (CODING_ISO_INVOCATION (coding, 0) != reg
4364       && CODING_ISO_INVOCATION (coding, 1) != reg)
4365     {
4366       /* Since the graphic register REG is not invoked to any graphic
4367          planes, invoke it to graphic plane 0.  */
4368       switch (reg)
4369         {
4370         case 0:                 /* graphic register 0 */
4371           ENCODE_SHIFT_IN;
4372           break;
4373
4374         case 1:                 /* graphic register 1 */
4375           ENCODE_SHIFT_OUT;
4376           break;
4377
4378         case 2:                 /* graphic register 2 */
4379           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4380             ENCODE_SINGLE_SHIFT_2;
4381           else
4382             ENCODE_LOCKING_SHIFT_2;
4383           break;
4384
4385         case 3:                 /* graphic register 3 */
4386           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4387             ENCODE_SINGLE_SHIFT_3;
4388           else
4389             ENCODE_LOCKING_SHIFT_3;
4390           break;
4391         }
4392     }
4393
4394   *p_nchars = produced_chars;
4395   return dst;
4396 }
4397
/* The following three macros produce codes for indicating direction
   of text.  */

/* Emit a control sequence introducer: ESC [ when `coding' has
   CODING_ISO_FLAG_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  The flag is tested with `&' because CODING_ISO_FLAGS
   holds several flag bits at once; comparing with `==' would select
   the 7-bit form only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
4407
4408
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by "2]".
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; writing one after it would leave
   a stray parenthesized expression after its `while (0)'.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
4414
4415
/* Emit the sequence that switches back to left-to-right text: a
   control sequence introducer followed by "0]".
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; writing one after it would leave
   a stray parenthesized expression after its `while (0)'.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4421
4422
/* Bring the graphic planes and registers of `coding' back to their
   initial state: emit shift-in unless register 0 is the one invoked
   to graphic plane 0, then re-designate every register that has a
   non-negative initial charset different from its current
   designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reg);              \
                                                                        \
        /* Nothing to do for a register without an initial charset or   \
           one that already holds it.  */                               \
        if (initial_id < 0                                              \
            || CODING_ISO_DESIGNATION (coding, reg) == initial_id)      \
          continue;                                                     \
        charset = CHARSET_FROM_ID (initial_id);                         \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4441
4442
4443 /* Produce designation sequences of charsets in the line started from
4444    SRC to a place pointed by DST, and return updated DST.
4445
4446    If the current block ends before any end-of-line, we may fail to
4447    find all the necessary designations.  */
4448
4449 static unsigned char *
4450 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4451      struct coding_system *coding;
4452      int *charbuf, *charbuf_end;
4453      unsigned char *dst;
4454 {
4455   struct charset *charset;
4456   /* Table of charsets to be designated to each graphic register.  */
4457   int r[4];
4458   int c, found = 0, reg;
4459   int produced_chars = 0;
4460   int multibytep = coding->dst_multibyte;
4461   Lisp_Object attrs;
4462   Lisp_Object charset_list;
4463
4464   attrs = CODING_ID_ATTRS (coding->id);
4465   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4466   if (EQ (charset_list, Qiso_2022))
4467     charset_list = Viso_2022_charset_list;
4468
4469   for (reg = 0; reg < 4; reg++)
4470     r[reg] = -1;
4471
4472   while (found < 4)
4473     {
4474       int id;
4475
4476       c = *charbuf++;
4477       if (c == '\n')
4478         break;
4479       charset = char_charset (c, charset_list, NULL);
4480       id = CHARSET_ID (charset);
4481       reg = CODING_ISO_REQUEST (coding, id);
4482       if (reg >= 0 && r[reg] < 0)
4483         {
4484           found++;
4485           r[reg] = id;
4486         }
4487     }
4488
4489   if (found)
4490     {
4491       for (reg = 0; reg < 4; reg++)
4492         if (r[reg] >= 0
4493             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4494           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4495     }
4496
4497   return dst;
4498 }
4499
4500 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
     /* Encode the elements of CODING->charbuf as ISO-2022 and append the
        bytes at CODING->destination + CODING->produced.  Negative
        elements are annotations, not characters.  Once the loop is done,
        record CODING_RESULT_SUCCESS, save the designate-at-BOL state in
        CODING_ISO_BOL, update CODING->produced_char and CODING->produced,
        and return 0.

        NOTE(review): most locals here are also visible to the macros used
        below (ASSURE_DESTINATION, EMIT_*, ENCODE_*), which are defined
        elsewhere; check those definitions before renaming anything.  */
4501
4502 static int
4503 encode_coding_iso_2022 (coding)
4504      struct coding_system *coding;
4505 {
4506   int multibytep = coding->dst_multibyte;
4507   int *charbuf = coding->charbuf;
4508   int *charbuf_end = charbuf + coding->charbuf_used;
4509   unsigned char *dst = coding->destination + coding->produced;
4510   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Handed to ASSURE_DESTINATION before each element is processed.  */
4511   int safe_room = 16;
       /* Nonzero while designation sequences are still owed for the line
          that starts at the next element.  */
4512   int bol_designation
4513     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4514        && CODING_ISO_BOL (coding));
4515   int produced_chars = 0;
4516   Lisp_Object attrs, eol_type, charset_list;
4517   int ascii_compatible;
4518   int c;
       /* Charset ID taken from the latest charset annotation, or -1.  */
4519   int preferred_charset_id = -1;
4520
4521   CODING_GET_INFO (coding, attrs, charset_list);
       /* An EOL type that is still a vector is treated like Qunix.  */
4522   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4523   if (VECTORP (eol_type))
4524     eol_type = Qunix;
4525
4526   setup_iso_safe_charsets (attrs);
4527   /* Charset list may have been changed.  */
4528   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4529   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4530
       /* ASCII may bypass ENCODE_ISO_CHARACTER only if the coding system
          is marked ASCII compatible and has neither the designation nor
          the locking-shift flag set.  */
4531   ascii_compatible
4532     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4533        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4534                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4535
4536   while (charbuf < charbuf_end)
4537     {
4538       ASSURE_DESTINATION (safe_room);
4539
4540       if (bol_designation)
4541         {
4542           unsigned char *dst_prev = dst;
4543
4544           /* We have to produce designation sequences if any now.  */
4545           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4546           bol_designation = 0;
4547           /* We are sure that designation sequences are all ASCII bytes.  */
4548           produced_chars += dst - dst_prev;
4549         }
4550
4551       c = *charbuf++;
4552
4553       if (c < 0)
4554         {
4555           /* Handle an annotation.  */
               /* CHARBUF now points at the element after C; that element
                  selects the kind of annotation.  */
4556           switch (*charbuf)
4557             {
4558             case CODING_ANNOTATE_COMPOSITION_MASK:
4559               /* Not yet implemented.  */
4560               break;
4561             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Accept the annotated charset as the preferred one
                      only if it is a member of CHARSET_LIST.  */
4562               preferred_charset_id = charbuf[2];
4563               if (preferred_charset_id >= 0
4564                   && NILP (Fmemq (make_number (preferred_charset_id),
4565                                   charset_list)))
4566                 preferred_charset_id = -1;
4567               break;
4568             default:
4569               abort ();
4570             }
               /* Skip the rest of the annotation.  C itself is already
                  consumed, so -C looks like the annotation's total length
                  in elements -- TODO confirm against the producer.  */
4571           charbuf += -c - 1;
4572           continue;
4573         }
4574
4575       /* Now encode the character C.  */
4576       if (c < 0x20 || c == 0x7F)
4577         {
               /* Control character.  An end of line (newline, or CR when
                  the EOL type is Qmac) may reset the state first.  */
4578           if (c == '\n'
4579               || (c == '\r' && EQ (eol_type, Qmac)))
4580             {
4581               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4582                 ENCODE_RESET_PLANE_AND_REGISTER ();
4583               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4584                 {
4585                   int i;
4586
                       /* Put all four designations back to their initial
                          values.  */
4587                   for (i = 0; i < 4; i++)
4588                     CODING_ISO_DESIGNATION (coding, i)
4589                       = CODING_ISO_INITIAL (coding, i);
4590                 }
                   /* Re-arm designation for the line that starts next.  */
4591               bol_designation
4592                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4593             }
4594           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4595             ENCODE_RESET_PLANE_AND_REGISTER ();
               /* The control character itself is emitted unchanged.  */
4596           EMIT_ONE_ASCII_BYTE (c);
4597         }
4598       else if (ASCII_CHAR_P (c))
4599         {
4600           if (ascii_compatible)
4601             EMIT_ONE_ASCII_BYTE (c);
4602           else
4603             {
4604               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4605               ENCODE_ISO_CHARACTER (charset, c);
4606             }
4607         }
4608       else if (CHAR_BYTE8_P (c))
4609         {
               /* Convert back with CHAR_TO_BYTE8 and emit as one byte.  */
4610           c = CHAR_TO_BYTE8 (c);
4611           EMIT_ONE_BYTE (c);
4612         }
4613       else
4614         {
4615           struct charset *charset;
4616
               /* Try the preferred charset first; fall back to a search
                  of CHARSET_LIST if it does not contain C.  */
4617           if (preferred_charset_id >= 0)
4618             {
4619               charset = CHARSET_FROM_ID (preferred_charset_id);
4620               if (! CHAR_CHARSET_P (c, charset))
4621                 charset = char_charset (c, charset_list, NULL);
4622             }
4623           else
4624             charset = char_charset (c, charset_list, NULL);
4625           if (!charset)
4626             {
                   /* No charset of the list contains C: substitute either
                      the safe-encoding marker (as ASCII) or the coding
                      system's default character.  */
4627               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4628                 {
4629                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4630                   charset = CHARSET_FROM_ID (charset_ascii);
4631                 }
4632               else
4633                 {
4634                   c = coding->default_char;
                       /* NOTE(review): the result of this second lookup is
                          not checked before it is used below.  */
4635                   charset = char_charset (c, charset_list, NULL);
4636                 }
4637             }
4638           ENCODE_ISO_CHARACTER (charset, c);
4639         }
4640     }
4641
       /* At the end of the last block, emit a final reset if the coding
          system resets at end of line.  */
4642   if (coding->mode & CODING_MODE_LAST_BLOCK
4643       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4644     {
4645       ASSURE_DESTINATION (safe_room);
4646       ENCODE_RESET_PLANE_AND_REGISTER ();
4647     }
4648   record_conversion_result (coding, CODING_RESULT_SUCCESS);
       /* Carry the pending-designation state over to the next call.  */
4649   CODING_ISO_BOL (coding) = bol_designation;
4650   coding->produced_char += produced_chars;
4651   coding->produced = dst - coding->destination;
4652   return 0;
4653 }
4654
4655 \f
4656 /*** 8. SJIS and BIG5 handlers ***/
4657
4658 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4659    quite widely.  So, for the moment, Emacs supports them in the bare
4660    C code.  But, in the future, they may be supported only by CCL.  */
4661
4662 /* SJIS is a coding system encoding three character sets: ASCII, right
4663    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4664    as is.  A character of charset katakana-jisx0201 is encoded by
4665    "position-code + 0x80".  A character of charset japanese-jisx0208
4666    is encoded in two bytes, with the two position-codes divided and
4667    shifted so that they fit in the ranges below.
4668
4669    --- CODE RANGE of SJIS ---
4670    (character set)      (range)
4671    ASCII                0x00 .. 0x7F
4672    KATAKANA-JISX0201    0xA0 .. 0xDF
4673    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4674             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4675    -------------------------------
4676
4677 */
4678
4679 /* BIG5 is a coding system encoding two character sets: ASCII and
4680    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4681    character set and is encoded in two bytes.
4682
4683    --- CODE RANGE of BIG5 ---
4684    (character set)      (range)
4685    ASCII                0x00 .. 0x7F
4686    Big5 (1st byte)      0xA1 .. 0xFE
4687         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4688    --------------------------
4689
4690   */
4691
4692 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4693    Check if a text is encoded in SJIS.  If it is, return
4694    CATEGORY_MASK_SJIS, else return 0.  */
4695
4696 static int
4697 detect_coding_sjis (coding, detect_info)
4698      struct coding_system *coding;
4699      struct coding_detection_info *detect_info;
4700 {
4701   const unsigned char *src = coding->source, *src_base;
4702   const unsigned char *src_end = coding->source + coding->src_bytes;
4703   int multibytep = coding->src_multibyte;
4704   int consumed_chars = 0;
4705   int found = 0;
4706   int c;
4707   Lisp_Object attrs, charset_list;
4708   int max_first_byte_of_2_byte_code;
4709
4710   CODING_GET_INFO (coding, attrs, charset_list);
4711   max_first_byte_of_2_byte_code
4712     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4713
4714   detect_info->checked |= CATEGORY_MASK_SJIS;
4715   /* A coding system of this category is always ASCII compatible.  */
4716   src += coding->head_ascii;
4717
4718   while (1)
4719     {
4720       src_base = src;
4721       ONE_MORE_BYTE (c);
4722       if (c < 0x80)
4723         continue;
4724       if ((c >= 0x81 && c <= 0x9F)
4725           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4726         {
4727           ONE_MORE_BYTE (c);
4728           if (c < 0x40 || c == 0x7F || c > 0xFC)
4729             break;
4730           found = CATEGORY_MASK_SJIS;
4731         }
4732       else if (c >= 0xA0 && c < 0xE0)
4733         found = CATEGORY_MASK_SJIS;
4734       else
4735         break;
4736     }
4737   detect_info->rejected |= CATEGORY_MASK_SJIS;
4738   return 0;
4739
4740  no_more_source:
4741   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4742     {
4743       detect_info->rejected |= CATEGORY_MASK_SJIS;
4744       return 0;
4745     }
4746   detect_info->found |= found;
4747   return 1;
4748 }
4749
4750 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4751    Check if a text is encoded in BIG5.  If it is, return
4752    CATEGORY_MASK_BIG5, else return 0.  */
4753
4754 static int
4755 detect_coding_big5 (coding, detect_info)
4756      struct coding_system *coding;
4757      struct coding_detection_info *detect_info;
4758 {
4759   const unsigned char *src = coding->source, *src_base;
4760   const unsigned char *src_end = coding->source + coding->src_bytes;
4761   int multibytep = coding->src_multibyte;
4762   int consumed_chars = 0;
4763   int found = 0;
4764   int c;
4765
4766   detect_info->checked |= CATEGORY_MASK_BIG5;
4767   /* A coding system of this category is always ASCII compatible.  */
4768   src += coding->head_ascii;
4769
4770   while (1)
4771     {
4772       src_base = src;
4773       ONE_MORE_BYTE (c);
4774       if (c < 0x80)
4775         continue;
4776       if (c >= 0xA1)
4777         {
4778           ONE_MORE_BYTE (c);
4779           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4780             return 0;
4781           found = CATEGORY_MASK_BIG5;
4782         }
4783       else
4784         break;
4785     }
4786   detect_info->rejected |= CATEGORY_MASK_BIG5;
4787   return 0;
4788
4789  no_more_source:
4790   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4791     {
4792       detect_info->rejected |= CATEGORY_MASK_BIG5;
4793       return 0;
4794     }
4795   detect_info->found |= found;
4796   return 1;
4797 }
4798
4799 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4800    Decode SJIS bytes from the source of CODING into CODING->charbuf.
        The charset list of CODING supplies, in order, the roman, kana
        and kanji charsets, optionally followed by a second kanji
        charset.  A byte that does not start a valid code is stored by
        itself via BYTE8_TO_CHAR and counted in CODING->errors.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; the
        `no_more_source' label below is presumably where it goes when the
        source is exhausted.  */
4801
4802 static void
4803 decode_coding_sjis (coding)
4804      struct coding_system *coding;
4805 {
4806   const unsigned char *src = coding->source + coding->consumed;
4807   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* Start of the code being decoded in the current iteration.  */
4808   const unsigned char *src_base;
4809   int *charbuf = coding->charbuf + coding->charbuf_used;
4810   /* We may produce one charset annotation in one loop and one more at
4811      the end.  */
4812   int *charbuf_end
4813     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4814   int consumed_chars = 0, consumed_chars_base;
4815   int multibytep = coding->src_multibyte;
4816   struct charset *charset_roman, *charset_kanji, *charset_kana;
4817   struct charset *charset_kanji2;
4818   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts decoded characters.  LAST_ID is the charset
          of the current run of non-ASCII characters (charset_ascii when
          there is none) and LAST_OFFSET is where that run began.  */
4819   int char_offset = coding->produced_char;
4820   int last_offset = char_offset;
4821   int last_id = charset_ascii;
4822   int eol_crlf =
4823     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte read ahead after a CR, or -1 if none is pending.  */
4824   int byte_after_cr = -1;
4825
4826   CODING_GET_INFO (coding, attrs, charset_list);
4827
       /* Take the charsets off the front of the list; the fourth one is
          optional.  */
4828   val = charset_list;
4829   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4830   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4831   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4832   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4833
4834   while (1)
4835     {
4836       int c, c1;
4837       struct charset *charset;
4838
4839       src_base = src;
4840       consumed_chars_base = consumed_chars;
4841
4842       if (charbuf >= charbuf_end)
4843         {
               /* A byte read ahead after CR has not been decoded yet;
                  step back so that it is not reported as consumed.  */
4844           if (byte_after_cr >= 0)
4845             src_base--;
4846           break;
4847         }
4848
           /* Use the byte read ahead after CR, if any, before reading a
              new one.  */
4849       if (byte_after_cr >= 0)
4850         c = byte_after_cr, byte_after_cr = -1;
4851       else
4852         ONE_MORE_BYTE (c);
4853       if (c < 0)
4854         goto invalid_code;
4855       if (c < 0x80)
4856         {
               /* With DOS-style EOL, fetch the byte after a CR right
                  away; it is handled by the next iteration.  */
4857           if (eol_crlf && c == '\r')
4858             ONE_MORE_BYTE (byte_after_cr);
4859           charset = charset_roman;
4860         }
4861       else if (c == 0x80 || c == 0xA0)
4862         goto invalid_code;
4863       else if (c >= 0xA1 && c <= 0xDF)
4864         {
4865           /* SJIS -> JISX0201-Kana */
4866           c &= 0x7F;
4867           charset = charset_kana;
4868         }
4869       else if (c <= 0xEF)
4870         {
4871           /* SJIS -> JISX0208 */
               /* Lead bytes reaching here are 0x81..0x9F and 0xE0..0xEF.  */
4872           ONE_MORE_BYTE (c1);
4873           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4874             goto invalid_code;
4875           c = (c << 8) | c1;
4876           SJIS_TO_JIS (c);
4877           charset = charset_kanji;
4878         }
4879       else if (c <= 0xFC && charset_kanji2)
4880         {
4881           /* SJIS -> JISX0213-2 */
               /* Lead bytes 0xF0..0xFC, only when the charset list has a
                  fourth entry.  */
4882           ONE_MORE_BYTE (c1);
4883           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4884             goto invalid_code;
4885           c = (c << 8) | c1;
4886           SJIS_TO_JIS2 (c);
4887           charset = charset_kanji2;
4888         }
4889       else
4890         goto invalid_code;
           /* On a change to a different non-ASCII charset, close the
              previous non-ASCII run with ADD_CHARSET_DATA and start a
              new run here.  */
4891       if (charset->id != charset_ascii
4892           && last_id != charset->id)
4893         {
4894           if (last_id != charset_ascii)
4895             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4896           last_id = charset->id;
4897           last_offset = char_offset;
4898         }
4899       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4900       *charbuf++ = c;
4901       char_offset++;
4902       continue;
4903
4904     invalid_code:
           /* Rewind to the start of the offending code and consume just
              its first byte.  A negative value from ONE_MORE_BYTE is
              stored negated; anything else goes through BYTE8_TO_CHAR.  */
4905       src = src_base;
4906       consumed_chars = consumed_chars_base;
4907       ONE_MORE_BYTE (c);
4908       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4909       char_offset++;
4910       coding->errors++;
4911     }
4912
4913  no_more_source:
       /* Close a non-ASCII run that is still open, then report progress
          up to the start of the last code not completely decoded.  */
4914   if (last_id != charset_ascii)
4915     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4916   coding->consumed_char += consumed_chars_base;
4917   coding->consumed = src_base - coding->source;
4918   coding->charbuf_used = charbuf - coding->charbuf;
4919 }
4920
     /* Decode BIG5 bytes from the source of CODING into CODING->charbuf.
        The charset list of CODING supplies, in order, the roman charset
        and the Big5 charset.  A byte that does not start a valid code is
        stored by itself via BYTE8_TO_CHAR and counted in CODING->errors.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; the
        `no_more_source' label below is presumably where it goes when the
        source is exhausted.  */
4921 static void
4922 decode_coding_big5 (coding)
4923      struct coding_system *coding;
4924 {
4925   const unsigned char *src = coding->source + coding->consumed;
4926   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* Start of the code being decoded in the current iteration.  */
4927   const unsigned char *src_base;
4928   int *charbuf = coding->charbuf + coding->charbuf_used;
4929   /* We may produce one charset annotation in one loop and one more at
4930      the end.  */
4931   int *charbuf_end
4932     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4933   int consumed_chars = 0, consumed_chars_base;
4934   int multibytep = coding->src_multibyte;
4935   struct charset *charset_roman, *charset_big5;
4936   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts decoded characters.  LAST_ID is the charset
          of the current run of non-ASCII characters (charset_ascii when
          there is none) and LAST_OFFSET is where that run began.  */
4937   int char_offset = coding->produced_char;
4938   int last_offset = char_offset;
4939   int last_id = charset_ascii;
4940   int eol_crlf =
4941     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte read ahead after a CR, or -1 if none is pending.  */
4942   int byte_after_cr = -1;
4943
4944   CODING_GET_INFO (coding, attrs, charset_list);
4945   val = charset_list;
4946   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4947   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4948
4949   while (1)
4950     {
4951       int c, c1;
4952       struct charset *charset;
4953
4954       src_base = src;
4955       consumed_chars_base = consumed_chars;
4956
4957       if (charbuf >= charbuf_end)
4958         {
               /* A byte read ahead after CR has not been decoded yet;
                  step back so that it is not reported as consumed.  */
4959           if (byte_after_cr >= 0)
4960             src_base--;
4961           break;
4962         }
4963
           /* Use the byte read ahead after CR, if any, before reading a
              new one.  */
4964       if (byte_after_cr >= 0)
4965         c = byte_after_cr, byte_after_cr = -1;
4966       else
4967         ONE_MORE_BYTE (c);
4968
4969       if (c < 0)
4970         goto invalid_code;
4971       if (c < 0x80)
4972         {
               /* With DOS-style EOL, fetch the byte after a CR right
                  away; it is handled by the next iteration.  */
4973           if (eol_crlf && c == '\r')
4974             ONE_MORE_BYTE (byte_after_cr);
4975           charset = charset_roman;
4976         }
4977       else
4978         {
4979           /* BIG5 -> Big5 */
               /* The first byte must be 0xA1..0xFE, the second
                  0x40..0x7E or 0xA1..0xFE.  */
4980           if (c < 0xA1 || c > 0xFE)
4981             goto invalid_code;
4982           ONE_MORE_BYTE (c1);
4983           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4984             goto invalid_code;
4985           c = c << 8 | c1;
4986           charset = charset_big5;
4987         }
           /* On a change to a different non-ASCII charset, close the
              previous non-ASCII run with ADD_CHARSET_DATA and start a
              new run here.  */
4988       if (charset->id != charset_ascii
4989           && last_id != charset->id)
4990         {
4991           if (last_id != charset_ascii)
4992             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4993           last_id = charset->id;
4994           last_offset = char_offset;
4995         }
4996       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4997       *charbuf++ = c;
4998       char_offset++;
4999       continue;
5000
5001     invalid_code:
           /* Rewind to the start of the offending code and consume just
              its first byte.  A negative value from ONE_MORE_BYTE is
              stored negated; anything else goes through BYTE8_TO_CHAR.  */
5002       src = src_base;
5003       consumed_chars = consumed_chars_base;
5004       ONE_MORE_BYTE (c);
5005       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
5006       char_offset++;
5007       coding->errors++;
5008     }
5009
5010  no_more_source:
       /* Close a non-ASCII run that is still open, then report progress
          up to the start of the last code not completely decoded.  */
5011   if (last_id != charset_ascii)
5012     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5013   coding->consumed_char += consumed_chars_base;
5014   coding->consumed = src_base - coding->source;
5015   coding->charbuf_used = charbuf - coding->charbuf;
5016 }
5017
5018 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
5019    This function can encode charsets `ascii', `katakana-jisx0201',
5020    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
5021    are sure that all these charsets are registered as official charset
5022    (i.e. do not have extended leading-codes).  Characters of other
5023    charsets are produced without any encoding.  If SJIS_P is 1, encode
5024    SJIS text, else encode BIG5 text.  */
5025
5026 static int
5027 encode_coding_sjis (coding)
5028      struct coding_system *coding;
5029 {
5030   int multibytep = coding->dst_multibyte;
5031   int *charbuf = coding->charbuf;
5032   int *charbuf_end = charbuf + coding->charbuf_used;
5033   unsigned char *dst = coding->destination + coding->produced;
5034   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5035   int safe_room = 4;
5036   int produced_chars = 0;
5037   Lisp_Object attrs, charset_list, val;
5038   int ascii_compatible;
5039   struct charset *charset_roman, *charset_kanji, *charset_kana;
5040   struct charset *charset_kanji2;
5041   int c;
5042
5043   CODING_GET_INFO (coding, attrs, charset_list);
5044   val = charset_list;
5045   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5046   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5047   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5048   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
5049
5050   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5051
5052   while (charbuf < charbuf_end)
5053     {
5054       ASSURE_DESTINATION (safe_room);
5055       c = *charbuf++;
5056       /* Now encode the character C.  */
5057       if (ASCII_CHAR_P (c) && ascii_compatible)
5058         EMIT_ONE_ASCII_BYTE (c);
5059       else if (CHAR_BYTE8_P (c))
5060         {
5061           c = CHAR_TO_BYTE8 (c);
5062           EMIT_ONE_BYTE (c);
5063         }
5064       else
5065         {
5066           unsigned code;
5067           struct charset *charset = char_charset (c, charset_list, &code);
5068
5069           if (!charset)
5070             {
5071               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5072                 {
5073                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5074                   charset = CHARSET_FROM_ID (charset_ascii);
5075                 }
5076               else
5077                 {
5078                   c = coding->default_char;
5079                   charset = char_charset (c, charset_list, &code);
5080                 }
5081             }
5082           if (code == CHARSET_INVALID_CODE (charset))
5083             abort ();
5084           if (charset == charset_kanji)
5085             {
5086               int c1, c2;
5087               JIS_TO_SJIS (code);
5088               c1 = code >> 8, c2 = code & 0xFF;
5089               EMIT_TWO_BYTES (c1, c2);
5090             }
5091           else if (charset == charset_kana)
5092             EMIT_ONE_BYTE (code | 0x80);
5093           else if (charset_kanji2 && charset == charset_kanji2)
5094             {
5095               int c1, c2;
5096
5097               c1 = code >> 8;
5098               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5099                   || c1 == 0x28
5100                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5101                 {
5102                   JIS_TO_SJIS2 (code);
5103                   c1 = code >> 8, c2 = code & 0xFF;
5104                   EMIT_TWO_BYTES (c1, c2);
5105                 }
5106               else
5107                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5108             }
5109           else
5110             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5111         }
5112     }
5113   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5114   coding->produced_char += produced_chars;
5115   coding->produced = dst - coding->destination;
5116   return 0;
5117 }
5118
5119 static int
5120 encode_coding_big5 (coding)
5121      struct coding_system *coding;
5122 {
5123   int multibytep = coding->dst_multibyte;
5124   int *charbuf = coding->charbuf;
5125   int *charbuf_end = charbuf + coding->charbuf_used;
5126   unsigned char *dst = coding->destination + coding->produced;
5127   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5128   int safe_room = 4;
5129   int produced_chars = 0;
5130   Lisp_Object attrs, charset_list, val;
5131   int ascii_compatible;
5132   struct charset *charset_roman, *charset_big5;
5133   int c;
5134
5135   CODING_GET_INFO (coding, attrs, charset_list);
5136   val = charset_list;
5137   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5138   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5139   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5140
5141   while (charbuf < charbuf_end)
5142     {
5143       ASSURE_DESTINATION (safe_room);
5144       c = *charbuf++;
5145       /* Now encode the character C.  */
5146       if (ASCII_CHAR_P (c) && ascii_compatible)
5147         EMIT_ONE_ASCII_BYTE (c);
5148       else if (CHAR_BYTE8_P (c))
5149         {
5150           c = CHAR_TO_BYTE8 (c);
5151           EMIT_ONE_BYTE (c);
5152         }
5153       else
5154         {
5155           unsigned code;
5156           struct charset *charset = char_charset (c, charset_list, &code);
5157
5158           if (! charset)
5159             {
5160               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5161                 {
5162                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5163                   charset = CHARSET_FROM_ID (charset_ascii);
5164                 }
5165               else
5166                 {
5167                   c = coding->default_char;
5168                   charset = char_charset (c, charset_list, &code);
5169                 }
5170             }
5171           if (code == CHARSET_INVALID_CODE (charset))
5172             abort ();
5173           if (charset == charset_big5)
5174             {
5175               int c1, c2;
5176
5177               c1 = code >> 8, c2 = code & 0xFF;
5178               EMIT_TWO_BYTES (c1, c2);
5179             }
5180           else
5181             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5182         }
5183     }
5184   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5185   coding->produced_char += produced_chars;
5186   coding->produced = dst - coding->destination;
5187   return 0;
5188 }
5189
5190 \f
5191 /*** 9. CCL handlers ***/
5192
5193 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5194    Check if a text is encoded in a coding system of which
5195    encoder/decoder are written in CCL program.  If it is, return
5196    CATEGORY_MASK_CCL, else return 0.  */
5197
5198 static int
5199 detect_coding_ccl (coding, detect_info)
5200      struct coding_system *coding;
5201      struct coding_detection_info *detect_info;
5202 {
5203   const unsigned char *src = coding->source, *src_base;
5204   const unsigned char *src_end = coding->source + coding->src_bytes;
5205   int multibytep = coding->src_multibyte;
5206   int consumed_chars = 0;
5207   int found = 0;
5208   unsigned char *valids;
5209   int head_ascii = coding->head_ascii;
5210   Lisp_Object attrs;
5211
5212   detect_info->checked |= CATEGORY_MASK_CCL;
5213
5214   coding = &coding_categories[coding_category_ccl];
5215   valids = CODING_CCL_VALIDS (coding);
5216   attrs = CODING_ID_ATTRS (coding->id);
5217   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5218     src += head_ascii;
5219
5220   while (1)
5221     {
5222       int c;
5223
5224       src_base = src;
5225       ONE_MORE_BYTE (c);
5226       if (c < 0 || ! valids[c])
5227         break;
5228       if ((valids[c] > 1))
5229         found = CATEGORY_MASK_CCL;
5230     }
5231   detect_info->rejected |= CATEGORY_MASK_CCL;
5232   return 0;
5233
5234  no_more_source:
5235   detect_info->found |= found;
5236   return 1;
5237 }
5238
5239 static void
5240 decode_coding_ccl (coding)
5241      struct coding_system *coding;
5242 {
5243   const unsigned char *src = coding->source + coding->consumed;
5244   const unsigned char *src_end = coding->source + coding->src_bytes;
5245   int *charbuf = coding->charbuf + coding->charbuf_used;
5246   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5247   int consumed_chars = 0;
5248   int multibytep = coding->src_multibyte;
5249   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5250   int source_charbuf[1024];
5251   int source_byteidx[1025];
5252   Lisp_Object attrs, charset_list;
5253
5254   CODING_GET_INFO (coding, attrs, charset_list);
5255
5256   while (1)
5257     {
5258       const unsigned char *p = src;
5259       int i = 0;
5260
5261       if (multibytep)
5262         {
5263           while (i < 1024 && p < src_end)
5264             {
5265               source_byteidx[i] = p - src;
5266               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5267             }
5268           source_byteidx[i] = p - src;
5269         }
5270       else
5271         while (i < 1024 && p < src_end)
5272           source_charbuf[i++] = *p++;
5273
5274       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5275         ccl->last_block = 1;
5276       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5277                   charset_list);
5278       charbuf += ccl->produced;
5279       if (multibytep)
5280         src += source_byteidx[ccl->consumed];
5281       else
5282         src += ccl->consumed;
5283       consumed_chars += ccl->consumed;
5284       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5285         break;
5286     }
5287
5288   switch (ccl->status)
5289     {
5290     case CCL_STAT_SUSPEND_BY_SRC:
5291       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5292       break;
5293     case CCL_STAT_SUSPEND_BY_DST:
5294       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5295       break;
5296     case CCL_STAT_QUIT:
5297     case CCL_STAT_INVALID_CMD:
5298       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5299       break;
5300     default:
5301       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5302       break;
5303     }
5304   coding->consumed_char += consumed_chars;
5305   coding->consumed = src - coding->source;
5306   coding->charbuf_used = charbuf - coding->charbuf;
5307 }
5308
5309 static int
5310 encode_coding_ccl (coding)
5311      struct coding_system *coding;
5312 {
5313   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5314   int multibytep = coding->dst_multibyte;
5315   int *charbuf = coding->charbuf;
5316   int *charbuf_end = charbuf + coding->charbuf_used;
5317   unsigned char *dst = coding->destination + coding->produced;
5318   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5319   int destination_charbuf[1024];
5320   int i, produced_chars = 0;
5321   Lisp_Object attrs, charset_list;
5322
5323   CODING_GET_INFO (coding, attrs, charset_list);
5324   if (coding->consumed_char == coding->src_chars
5325       && coding->mode & CODING_MODE_LAST_BLOCK)
5326     ccl->last_block = 1;
5327
5328   while (charbuf < charbuf_end)
5329     {
5330       ccl_driver (ccl, charbuf, destination_charbuf,
5331                   charbuf_end - charbuf, 1024, charset_list);
5332       if (multibytep)
5333         {
5334           ASSURE_DESTINATION (ccl->produced * 2);
5335           for (i = 0; i < ccl->produced; i++)
5336             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5337         }
5338       else
5339         {
5340           ASSURE_DESTINATION (ccl->produced);
5341           for (i = 0; i < ccl->produced; i++)
5342             *dst++ = destination_charbuf[i] & 0xFF;
5343           produced_chars += ccl->produced;
5344         }
5345       charbuf += ccl->consumed;
5346       if (ccl->status == CCL_STAT_QUIT
5347           || ccl->status == CCL_STAT_INVALID_CMD)
5348         break;
5349     }
5350
5351   switch (ccl->status)
5352     {
5353     case CCL_STAT_SUSPEND_BY_SRC:
5354       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5355       break;
5356     case CCL_STAT_SUSPEND_BY_DST:
5357       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5358       break;
5359     case CCL_STAT_QUIT:
5360     case CCL_STAT_INVALID_CMD:
5361       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5362       break;
5363     default:
5364       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5365       break;
5366     }
5367
5368   coding->produced_char += produced_chars;
5369   coding->produced = dst - coding->destination;
5370   return 0;
5371 }
5372
5373
5374 \f
5375 /*** 10, 11. no-conversion handlers ***/
5376
5377 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5378
5379 static void
5380 decode_coding_raw_text (coding)
5381      struct coding_system *coding;
5382 {
5383   int eol_crlf =
5384     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5385
5386   coding->chars_at_source = 1;
5387   coding->consumed_char = coding->src_chars;
5388   coding->consumed = coding->src_bytes;
5389   if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5390     {
5391       coding->consumed_char--;
5392       coding->consumed--;
5393       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5394     }
5395   else
5396     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5397 }
5398
/* Encoder for raw text.  Copy the characters in CODING->charbuf
   (charbuf_used entries) to CODING->destination, starting at offset
   CODING->produced.  How each character is written depends on
   CODING->dst_multibyte and CODING->src_multibyte; see the four
   branches below.  Always record CODING_RESULT_SUCCESS, update
   CODING->produced_char and CODING->produced, and return 0.

   ASSURE_DESTINATION and the EMIT_XXX macros are defined elsewhere in
   this file and refer to the local variables of this function by
   name.  */

static int
encode_coding_raw_text (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  int produced_chars = 0;
  int c;

  if (multibytep)
    {
      /* Multibyte destination.  */
      int safe_room = MAX_MULTIBYTE_LENGTH * 2;

      if (coding->src_multibyte)
        /* Multibyte source: ASCII goes out as is, an eight-bit
           character goes out as its raw byte, and any other character
           goes out as the bytes of its multibyte form, each one
           emitted separately with EMIT_ONE_BYTE.  */
        while (charbuf < charbuf_end)
          {
            ASSURE_DESTINATION (safe_room);
            c = *charbuf++;
            if (ASCII_CHAR_P (c))
              EMIT_ONE_ASCII_BYTE (c);
            else if (CHAR_BYTE8_P (c))
              {
                c = CHAR_TO_BYTE8 (c);
                EMIT_ONE_BYTE (c);
              }
            else
              {
                unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;

                /* P1 is advanced past the multibyte form written into
                   STR; P0 then walks over those bytes.  */
                CHAR_STRING_ADVANCE (c, p1);
                while (p0 < p1)
                  {
                    EMIT_ONE_BYTE (*p0);
                    p0++;
                  }
              }
          }
      else
        /* Unibyte source: every element is emitted with
           EMIT_ONE_BYTE.  */
        while (charbuf < charbuf_end)
          {
            ASSURE_DESTINATION (safe_room);
            c = *charbuf++;
            EMIT_ONE_BYTE (c);
          }
    }
  else
    {
      /* Unibyte destination: bytes are stored through DST directly.  */
      if (coding->src_multibyte)
        {
          int safe_room = MAX_MULTIBYTE_LENGTH;

          while (charbuf < charbuf_end)
            {
              ASSURE_DESTINATION (safe_room);
              c = *charbuf++;
              if (ASCII_CHAR_P (c))
                *dst++ = c;
              else if (CHAR_BYTE8_P (c))
                *dst++ = CHAR_TO_BYTE8 (c);
              else
                CHAR_STRING_ADVANCE (c, dst);
            }
        }
      else
        {
          /* Room for the whole buffer is requested once, up front;
             the loop still stops at DST_END.  */
          ASSURE_DESTINATION (charbuf_end - charbuf);
          while (charbuf < charbuf_end && dst < dst_end)
            *dst++ = *charbuf++;
        }
      /* In the unibyte case the count of produced characters is the
         count of bytes written by this call.  */
      produced_chars = dst - (coding->destination + coding->produced);
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5478
5479 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5480    Check if a text is encoded in a charset-based coding system.  If it
5481    is, return 1, else return 0.  */
5482
5483 static int
5484 detect_coding_charset (coding, detect_info)
5485      struct coding_system *coding;
5486      struct coding_detection_info *detect_info;
5487 {
5488   const unsigned char *src = coding->source, *src_base;
5489   const unsigned char *src_end = coding->source + coding->src_bytes;
5490   int multibytep = coding->src_multibyte;
5491   int consumed_chars = 0;
5492   Lisp_Object attrs, valids, name;
5493   int found = 0;
5494   int head_ascii = coding->head_ascii;
5495   int check_latin_extra = 0;
5496
5497   detect_info->checked |= CATEGORY_MASK_CHARSET;
5498
5499   coding = &coding_categories[coding_category_charset];
5500   attrs = CODING_ID_ATTRS (coding->id);
5501   valids = AREF (attrs, coding_attr_charset_valids);
5502   name = CODING_ID_NAME (coding->id);
5503   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5504                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5505       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5506                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5507     check_latin_extra = 1;
5508
5509   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5510     src += head_ascii;
5511
5512   while (1)
5513     {
5514       int c;
5515       Lisp_Object val;
5516       struct charset *charset;
5517       int dim, idx;
5518
5519       src_base = src;
5520       ONE_MORE_BYTE (c);
5521       if (c < 0)
5522         continue;
5523       val = AREF (valids, c);
5524       if (NILP (val))
5525         break;
5526       if (c >= 0x80)
5527         {
5528           if (c < 0xA0
5529               && check_latin_extra
5530               && (!VECTORP (Vlatin_extra_code_table)
5531                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5532             break;
5533           found = CATEGORY_MASK_CHARSET;
5534         }
5535       if (INTEGERP (val))
5536         {
5537           charset = CHARSET_FROM_ID (XFASTINT (val));
5538           dim = CHARSET_DIMENSION (charset);
5539           for (idx = 1; idx < dim; idx++)
5540             {
5541               if (src == src_end)
5542                 goto too_short;
5543               ONE_MORE_BYTE (c);
5544               if (c < charset->code_space[(dim - 1 - idx) * 2]
5545                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5546                 break;
5547             }
5548           if (idx < dim)
5549             break;
5550         }
5551       else
5552         {
5553           idx = 1;
5554           for (; CONSP (val); val = XCDR (val))
5555             {
5556               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5557               dim = CHARSET_DIMENSION (charset);
5558               while (idx < dim)
5559                 {
5560                   if (src == src_end)
5561                     goto too_short;
5562                   ONE_MORE_BYTE (c);
5563                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5564                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5565                     break;
5566                   idx++;
5567                 }
5568               if (idx == dim)
5569                 {
5570                   val = Qnil;
5571                   break;
5572                 }
5573             }
5574           if (CONSP (val))
5575             break;
5576         }
5577     }
5578  too_short:
5579   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5580   return 0;
5581
5582  no_more_source:
5583   detect_info->found |= found;
5584   return 1;
5585 }
5586
/* Decoder for charset-based coding systems.  Read bytes from
   CODING->source starting at offset CODING->consumed, and append the
   decoded characters to CODING->charbuf.  The first byte of each
   code is looked up in the coding system's `valids' table, which
   yields a charset ID or a list of charset IDs; further bytes are
   read according to the charset dimension.  A code that can't be
   decoded is handled at the label invalid_code: only its first byte
   is consumed and stored, and CODING->errors is incremented.  On
   exit, CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated.

   ONE_MORE_BYTE, CODING_DECODE_CHAR and ADD_CHARSET_DATA are macros
   defined elsewhere; ONE_MORE_BYTE is expected to jump to
   no_more_source when the source is exhausted.  */

static void
decode_coding_charset (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attrs, charset_list, valids;
  int char_offset = coding->produced_char;
  /* LAST_ID is the charset of the current run of characters and
     LAST_OFFSET the character offset at which that run started.  */
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_CRLF is set, or -1 if
     there is none pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      /* Remember where this code starts so that the invalid_code path
         and the final bookkeeping can go back to it.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The pending read-ahead byte has not been decoded yet, so
             it must not be counted as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          /* A single charset: collect DIM bytes into CODE, most
             significant byte first.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* LEN and CODE carry over from the previous candidate,
                 so only the additional bytes are read here.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      /* When the charset changes to a non-ASCII one, close the
         previous non-ASCII run with a charset annotation and start a
         new run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the code, consume just one byte, and
         store it: negated if negative, as is if ASCII, otherwise as
         an eight-bit character.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Only complete codes count as consumed, hence the use of the
     *_base values.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5715
/* Encoder for charset-based coding systems.  Encode the characters
   in CODING->charbuf (charbuf_used entries) into CODING->destination,
   starting at offset CODING->produced.  Each character is looked up
   in the coding system's charset list with char_charset; the
   resulting code is emitted as one to four bytes according to the
   charset dimension.  Always record CODING_RESULT_SUCCESS, update
   CODING->produced_char and CODING->produced, and return 0.

   ASSURE_DESTINATION and the EMIT_XXX macros are defined elsewhere in
   this file and refer to the local variables of this function by
   name.  */

static int
encode_coding_charset (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  int safe_room = MAX_MULTIBYTE_LENGTH;
  int produced_chars = 0;
  Lisp_Object attrs, charset_list;
  int ascii_compatible;
  int c;

  CODING_GET_INFO (coding, attrs, charset_list);
  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      struct charset *charset;
      unsigned code;

      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      /* ASCII bypasses the charset lookup only when the coding system
         is ASCII compatible.  */
      if (ascii_compatible && ASCII_CHAR_P (c))
        EMIT_ONE_ASCII_BYTE (c);
      else if (CHAR_BYTE8_P (c))
        {
          /* An eight-bit character is written as its raw byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          charset = char_charset (c, charset_list, &code);
          if (charset)
            {
              /* Emit CODE most significant byte first.  */
              if (CHARSET_DIMENSION (charset) == 1)
                EMIT_ONE_BYTE (code);
              else if (CHARSET_DIMENSION (charset) == 2)
                EMIT_TWO_BYTES (code >> 8, code & 0xFF);
              else if (CHARSET_DIMENSION (charset) == 3)
                EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
              else
                EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
                                 (code >> 8) & 0xFF, code & 0xFF);
            }
          else
            {
              /* No charset in the list contains C: emit a substitute
                 byte, chosen by whether CODING_MODE_SAFE_ENCODING is
                 set.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
              else
                c = coding->default_char;
              EMIT_ONE_BYTE (c);
            }
        }
    }

  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5779
5780 \f
/*** 10. C library functions ***/
5782
/* Setup coding context CODING from information about CODING_SYSTEM.
   If CODING_SYSTEM is nil, `undecided' is assumed.  If
   CODING_SYSTEM is invalid, signal an error (the check is done by
   CHECK_CODING_SYSTEM_GET_ID).

   The members of CODING set here are: id, mode, head_ascii,
   common_flags, max_charset_id, safe_charsets, default_char,
   carryover_bytes, the detector/decoder/encoder function pointers,
   and the type-specific state for the iso-2022, utf-8, utf-16 and
   emacs-mule types.  */

void
setup_coding_system (coding_system, coding)
     Lisp_Object coding_system;
     struct coding_system *coding;
{
  Lisp_Object attrs;
  Lisp_Object eol_type;
  Lisp_Object coding_type;
  Lisp_Object val;

  if (NILP (coding_system))
    coding_system = Qundecided;

  CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);

  attrs = CODING_ID_ATTRS (coding->id);
  /* With inhibit_eol_conversion set, behave as if the eol type were
     unix.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);

  coding->mode = 0;
  coding->head_ascii = -1;
  /* Flags implied by the eol type alone: a vector requests decoding
     and detection, any other non-unix value requests decoding and
     encoding.  */
  if (VECTORP (eol_type))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_DETECTION_MASK);
  else if (! EQ (eol_type, Qunix))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_ENCODING_MASK);
  else
    coding->common_flags = 0;
  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
    coding->common_flags |= CODING_FOR_UNIBYTE_MASK;

  /* The safe-charsets attribute is a string; CODING keeps a pointer
     to its bytes and the index of its last element.  */
  val = CODING_ATTR_SAFE_CHARSETS (attrs);
  coding->max_charset_id = SCHARS (val) - 1;
  coding->safe_charsets = SDATA (val);
  coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
  coding->carryover_bytes = 0;

  /* The rest depends on the type of the coding system.  */
  coding_type = CODING_ATTR_TYPE (attrs);
  if (EQ (coding_type, Qundecided))
    {
      /* No detector; the raw-text decoder and encoder are used.  */
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      int i;
      int flags = XINT (AREF (attrs, coding_attr_iso_flags));

      /* Invoke graphic register 0 to plane 0.  */
      CODING_ISO_INVOCATION (coding, 0) = 0;
      /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
      CODING_ISO_INVOCATION (coding, 1)
        = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
      /* Setup the initial status of designation.  */
      for (i = 0; i < 4; i++)
        CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
      /* Not single shifting initially.  */
      CODING_ISO_SINGLE_SHIFTING (coding) = 0;
      /* Beginning of buffer should also be regarded as bol. */
      CODING_ISO_BOL (coding) = 1;
      coding->detector = detect_coding_iso_2022;
      coding->decoder = decode_coding_iso_2022;
      coding->encoder = encode_coding_iso_2022;
      if (flags & CODING_ISO_FLAG_SAFE)
        coding->mode |= CODING_MODE_SAFE_ENCODING;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
      if (flags & CODING_ISO_FLAG_COMPOSITION)
        coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
      if (flags & CODING_ISO_FLAG_DESIGNATION)
        coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
      if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
        {
          /* setup_iso_safe_charsets is given ATTRS, and the
             safe-charsets attribute is read again afterwards to
             replace the values stored above.  */
          setup_iso_safe_charsets (attrs);
          val = CODING_ATTR_SAFE_CHARSETS (attrs);
          coding->max_charset_id = SCHARS (val) - 1;
          coding->safe_charsets = SDATA (val);
        }
      CODING_ISO_FLAGS (coding) = flags;
      CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
      CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
      CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
      CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
    }
  else if (EQ (coding_type, Qcharset))
    {
      coding->detector = detect_coding_charset;
      coding->decoder = decode_coding_charset;
      coding->encoder = encode_coding_charset;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qutf_8))
    {
      /* The BOM attribute: a cons means detect, t means with BOM,
         anything else means without BOM.  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                   : EQ (val, Qt) ? utf_with_bom
                                   : utf_without_bom);
      coding->detector = detect_coding_utf_8;
      coding->decoder = decode_coding_utf_8;
      coding->encoder = encode_coding_utf_8;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qutf_16))
    {
      /* The BOM attribute is interpreted as for utf-8 above.  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                    : EQ (val, Qt) ? utf_with_bom
                                    : utf_without_bom);
      /* Any endian attribute other than `big' selects little
         endian.  */
      val = AREF (attrs, coding_attr_utf_16_endian);
      CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
                                       : utf_16_little_endian);
      CODING_UTF_16_SURROGATE (coding) = 0;
      coding->detector = detect_coding_utf_16;
      coding->decoder = decode_coding_utf_16;
      coding->encoder = encode_coding_utf_16;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qccl))
    {
      coding->detector = detect_coding_ccl;
      coding->decoder = decode_coding_ccl;
      coding->encoder = encode_coding_ccl;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      coding->detector = detect_coding_emacs_mule;
      coding->decoder = decode_coding_emacs_mule;
      coding->encoder = encode_coding_emacs_mule;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      coding->spec.emacs_mule.full_support = 1;
      if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
          && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
        {
          Lisp_Object tail, safe_charsets;
          int max_charset_id = 0;

          /* Build a fresh safe-charsets string from
             Vemacs_mule_charset_list: every byte is 255 except those
             indexed by a charset ID in the list, which are 0.  */
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            if (max_charset_id < XFASTINT (XCAR (tail)))
              max_charset_id = XFASTINT (XCAR (tail));
          safe_charsets = make_uninit_string (max_charset_id + 1);
          memset (SDATA (safe_charsets), 255, max_charset_id + 1);
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
          coding->max_charset_id = max_charset_id;
          coding->safe_charsets = SDATA (safe_charsets);
          /* NOTE(review): full_support was already set to 1
             unconditionally above, so this store has no effect;
             confirm whether the first store was meant to be 0.  */
          coding->spec.emacs_mule.full_support = 1;
        }
      coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
      coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
    }
  else if (EQ (coding_type, Qshift_jis))
    {
      coding->detector = detect_coding_sjis;
      coding->decoder = decode_coding_sjis;
      coding->encoder = encode_coding_sjis;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qbig5))
    {
      coding->detector = detect_coding_big5;
      coding->decoder = decode_coding_big5;
      coding->encoder = encode_coding_big5;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else                          /* EQ (coding_type, Qraw_text) */
    {
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      /* Unlike the other types, raw-text requests conversion only
         when the eol type is not unix.  */
      if (! EQ (eol_type, Qunix))
        {
          coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
          if (! VECTORP (eol_type))
            coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
        }

    }

  return;
}
5989
5990 /* Return a list of charsets supported by CODING.  */
5991
5992 Lisp_Object
5993 coding_charset_list (coding)
5994      struct coding_system *coding;
5995 {
5996   Lisp_Object attrs, charset_list;
5997
5998   CODING_GET_INFO (coding, attrs, charset_list);
5999   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6000     {
6001       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6002
6003       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6004         charset_list = Viso_2022_charset_list;
6005     }
6006   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6007     {
6008       charset_list = Vemacs_mule_charset_list;
6009     }
6010   return charset_list;
6011 }
6012
6013
6014 /* Return a list of charsets supported by CODING-SYSTEM.  */
6015
6016 Lisp_Object
6017 coding_system_charset_list (coding_system)
6018      Lisp_Object coding_system;
6019 {
6020   int id;
6021   Lisp_Object attrs, charset_list;
6022
6023   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
6024   attrs = CODING_ID_ATTRS (id);
6025
6026   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6027     {
6028       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6029
6030       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6031         charset_list = Viso_2022_charset_list;
6032       else
6033         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6034     }
6035   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6036     {
6037       charset_list = Vemacs_mule_charset_list;
6038     }
6039   else
6040     {
6041       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6042     }
6043   return charset_list;
6044 }
6045
6046
6047 /* Return raw-text or one of its subsidiaries that has the same
6048    eol_type as CODING-SYSTEM.  */
6049
6050 Lisp_Object
6051 raw_text_coding_system (coding_system)
6052      Lisp_Object coding_system;
6053 {
6054   Lisp_Object spec, attrs;
6055   Lisp_Object eol_type, raw_text_eol_type;
6056
6057   if (NILP (coding_system))
6058     return Qraw_text;
6059   spec = CODING_SYSTEM_SPEC (coding_system);
6060   attrs = AREF (spec, 0);
6061
6062   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6063     return coding_system;
6064
6065   eol_type = AREF (spec, 2);
6066   if (VECTORP (eol_type))
6067     return Qraw_text;
6068   spec = CODING_SYSTEM_SPEC (Qraw_text);
6069   raw_text_eol_type = AREF (spec, 2);
6070   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6071           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6072           : AREF (raw_text_eol_type, 2));
6073 }
6074
6075
6076 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6077    the subsidiary that has the same eol-spec as PARENT (if it is not
6078    nil and specifies end-of-line format) or the system's setting
6079    (system_eol_type).  */
6080
6081 Lisp_Object
6082 coding_inherit_eol_type (coding_system, parent)
6083      Lisp_Object coding_system, parent;
6084 {
6085   Lisp_Object spec, eol_type;
6086
6087   if (NILP (coding_system))
6088     coding_system = Qraw_text;
6089   spec = CODING_SYSTEM_SPEC (coding_system);
6090   eol_type = AREF (spec, 2);
6091   if (VECTORP (eol_type))
6092     {
6093       Lisp_Object parent_eol_type;
6094
6095       if (! NILP (parent))
6096         {
6097           Lisp_Object parent_spec;
6098
6099           parent_spec = CODING_SYSTEM_SPEC (parent);
6100           parent_eol_type = AREF (parent_spec, 2);
6101           if (VECTORP (parent_eol_type))
6102             parent_eol_type = system_eol_type;
6103         }
6104       else
6105         parent_eol_type = system_eol_type;
6106       if (EQ (parent_eol_type, Qunix))
6107         coding_system = AREF (eol_type, 0);
6108       else if (EQ (parent_eol_type, Qdos))
6109         coding_system = AREF (eol_type, 1);
6110       else if (EQ (parent_eol_type, Qmac))
6111         coding_system = AREF (eol_type, 2);
6112     }
6113   return coding_system;
6114 }
6115
6116
6117 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6118    decided for writing to a process.  If not, complement them, and
6119    return a new coding system.  */
6120
6121 Lisp_Object
6122 complement_process_encoding_system (coding_system)
6123      Lisp_Object coding_system;
6124 {
6125   Lisp_Object spec, attrs, coding_type, eol_type;
6126
6127   if (NILP (coding_system))
6128     coding_system = Qundecided;
6129   spec = CODING_SYSTEM_SPEC (coding_system);
6130   attrs = AREF (spec, 0);
6131   coding_type = CODING_ATTR_TYPE (attrs);
6132   eol_type = AREF (spec, 2);
6133
6134   if (EQ (coding_type, Qundecided))
6135     {
6136       /* We must decide the text-conversion part ar first.  */
6137       if (CONSP (Vdefault_process_coding_system))
6138         {
6139           coding_system = XCDR (Vdefault_process_coding_system);
6140           if (! NILP (coding_system))
6141             {
6142               spec = CODING_SYSTEM_SPEC (coding_system);
6143               attrs = AREF (spec, 0);
6144               coding_type = CODING_ATTR_TYPE (attrs);
6145               eol_type = AREF (spec, 2);
6146             }
6147         }
6148       if (EQ (coding_type, Qundecided))
6149         {
6150           coding_system = preferred_coding_system ();
6151           spec = CODING_SYSTEM_SPEC (coding_system);
6152           attrs = AREF (spec, 0);
6153           coding_type = CODING_ATTR_TYPE (attrs);
6154           eol_type = AREF (spec, 2);
6155         }
6156       if (EQ (coding_type, Qundecided))
6157         {
6158           coding_system = Qraw_text;
6159           coding_type = Qraw_text;
6160           eol_type = Qnil;
6161         }
6162     }
6163   if (NILP (eol_type) || VECTORP (eol_type))
6164     {
6165       /* We must decide the eol-conversion part.  */
6166       coding_system = coding_inherit_eol_type (coding_system, coding_system);
6167     }
6168
6169   return coding_system;
6170 }
6171
6172
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are classified into the following categories:
6178
6179    o coding-category-emacs-mule
6180
6181         The category for a coding system which has the same code range
6182         as Emacs' internal format.  Assigned the coding-system (Lisp
6183         symbol) `emacs-mule' by default.
6184
6185    o coding-category-sjis
6186
6187         The category for a coding system which has the same code range
6188         as SJIS.  Assigned the coding-system (Lisp
6189         symbol) `japanese-shift-jis' by default.
6190
6191    o coding-category-iso-7
6192
6193         The category for a coding system which has the same code range
6194         as ISO2022 of 7-bit environment.  This doesn't use any locking
6195         shift and single shift functions.  This can encode/decode all
6196         charsets.  Assigned the coding-system (Lisp symbol)
6197         `iso-2022-7bit' by default.
6198
6199    o coding-category-iso-7-tight
6200
6201         Same as coding-category-iso-7 except that this can
6202         encode/decode only the specified charsets.
6203
6204    o coding-category-iso-8-1
6205
6206         The category for a coding system which has the same code range
6207         as ISO2022 of 8-bit environment and graphic plane 1 used only
6208         for DIMENSION1 charset.  This doesn't use any locking shift
6209         and single shift functions.  Assigned the coding-system (Lisp
6210         symbol) `iso-latin-1' by default.
6211
6212    o coding-category-iso-8-2
6213
6214         The category for a coding system which has the same code range
6215         as ISO2022 of 8-bit environment and graphic plane 1 used only
6216         for DIMENSION2 charset.  This doesn't use any locking shift
6217         and single shift functions.  Assigned the coding-system (Lisp
6218         symbol) `japanese-iso-8bit' by default.
6219
6220    o coding-category-iso-7-else
6221
6222         The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
6224         single shift functions.  Assigned the coding-system (Lisp
6225         symbol) `iso-2022-7bit-lock' by default.
6226
6227    o coding-category-iso-8-else
6228
6229         The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
6231         single shift functions.  Assigned the coding-system (Lisp
6232         symbol) `iso-2022-8bit-ss2' by default.
6233
6234    o coding-category-big5
6235
6236         The category for a coding system which has the same code range
6237         as BIG5.  Assigned the coding-system (Lisp symbol)
6238         `cn-big5' by default.
6239
6240    o coding-category-utf-8
6241
6242         The category for a coding system which has the same code range
6243         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6244         symbol) `utf-8' by default.
6245
6246    o coding-category-utf-16-be
6247
6248         The category for a coding system in which a text has an
6249         Unicode signature (cf. Unicode Standard) in the order of BIG
6250         endian at the head.  Assigned the coding-system (Lisp symbol)
6251         `utf-16-be' by default.
6252
6253    o coding-category-utf-16-le
6254
6255         The category for a coding system in which a text has an
6256         Unicode signature (cf. Unicode Standard) in the order of
6257         LITTLE endian at the head.  Assigned the coding-system (Lisp
6258         symbol) `utf-16-le' by default.
6259
6260    o coding-category-ccl
6261
6262         The category for a coding system of which encoder/decoder is
6263         written in CCL programs.  The default value is nil, i.e., no
6264         coding system is assigned.
6265
6266    o coding-category-binary
6267
6268         The category for a coding system not categorized in any of the
6269         above.  Assigned the coding-system (Lisp symbol)
6270         `no-conversion' by default.
6271
6272    Each of them is a Lisp symbol whose value is an actual
6273    `coding-system' (this is also a Lisp symbol) assigned by a user.
6274    What Emacs actually does is to detect a category of coding system.
6275    Then, it uses the `coding-system' assigned to that category.  If
6276    Emacs can't narrow the candidates down to a single category, it
6277    selects the category of the highest priority.  Priorities of categories
6278    are also specified by a user in a Lisp variable `coding-category-list'.
6279
6280 */
6281
6282 #define EOL_SEEN_NONE   0
6283 #define EOL_SEEN_LF     1
6284 #define EOL_SEEN_CR     2
6285 #define EOL_SEEN_CRLF   4
6286
6287 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6288    SOURCE is encoded.  If CATEGORY is one of
6289    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6290    two-byte, else they are encoded by one-byte.
6291
6292    Return one of EOL_SEEN_XXX.  */
6293
6294 #define MAX_EOL_CHECK_COUNT 3
6295
6296 static int
6297 detect_eol (source, src_bytes, category)
6298      const unsigned char *source;
6299      EMACS_INT src_bytes;
6300      enum coding_category category;
6301 {
6302   const unsigned char *src = source, *src_end = src + src_bytes;
6303   unsigned char c;
6304   int total  = 0;
6305   int eol_seen = EOL_SEEN_NONE;
6306
6307   if ((1 << category) & CATEGORY_MASK_UTF_16)
6308     {
6309       int msb, lsb;
6310
6311       msb = category == (coding_category_utf_16_le
6312                          | coding_category_utf_16_le_nosig);
6313       lsb = 1 - msb;
6314
6315       while (src + 1 < src_end)
6316         {
6317           c = src[lsb];
6318           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6319             {
6320               int this_eol;
6321
6322               if (c == '\n')
6323                 this_eol = EOL_SEEN_LF;
6324               else if (src + 3 >= src_end
6325                        || src[msb + 2] != 0
6326                        || src[lsb + 2] != '\n')
6327                 this_eol = EOL_SEEN_CR;
6328               else
6329                 {
6330                   this_eol = EOL_SEEN_CRLF;
6331                   src += 2;
6332                 }
6333
6334               if (eol_seen == EOL_SEEN_NONE)
6335                 /* This is the first end-of-line.  */
6336                 eol_seen = this_eol;
6337               else if (eol_seen != this_eol)
6338                 {
6339                   /* The found type is different from what found before.
6340                      Allow for stray ^M characters in DOS EOL files.  */
6341                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6342                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6343                     eol_seen = EOL_SEEN_CRLF;
6344                   else
6345                     {
6346                       eol_seen = EOL_SEEN_LF;
6347                       break;
6348                     }
6349                 }
6350               if (++total == MAX_EOL_CHECK_COUNT)
6351                 break;
6352             }
6353           src += 2;
6354         }
6355     }
6356   else
6357     {
6358       while (src < src_end)
6359         {
6360           c = *src++;
6361           if (c == '\n' || c == '\r')
6362             {
6363               int this_eol;
6364
6365               if (c == '\n')
6366                 this_eol = EOL_SEEN_LF;
6367               else if (src >= src_end || *src != '\n')
6368                 this_eol = EOL_SEEN_CR;
6369               else
6370                 this_eol = EOL_SEEN_CRLF, src++;
6371
6372               if (eol_seen == EOL_SEEN_NONE)
6373                 /* This is the first end-of-line.  */
6374                 eol_seen = this_eol;
6375               else if (eol_seen != this_eol)
6376                 {
6377                   /* The found type is different from what found before.
6378                      Allow for stray ^M characters in DOS EOL files.  */
6379                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6380                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6381                     eol_seen = EOL_SEEN_CRLF;
6382                   else
6383                     {
6384                       eol_seen = EOL_SEEN_LF;
6385                       break;
6386                     }
6387                 }
6388               if (++total == MAX_EOL_CHECK_COUNT)
6389                 break;
6390             }
6391         }
6392     }
6393   return eol_seen;
6394 }
6395
6396
6397 static Lisp_Object
6398 adjust_coding_eol_type (coding, eol_seen)
6399      struct coding_system *coding;
6400      int eol_seen;
6401 {
6402   Lisp_Object eol_type;
6403
6404   eol_type = CODING_ID_EOL_TYPE (coding->id);
6405   if (eol_seen & EOL_SEEN_LF)
6406     {
6407       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6408       eol_type = Qunix;
6409     }
6410   else if (eol_seen & EOL_SEEN_CRLF)
6411     {
6412       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6413       eol_type = Qdos;
6414     }
6415   else if (eol_seen & EOL_SEEN_CR)
6416     {
6417       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6418       eol_type = Qmac;
6419     }
6420   return eol_type;
6421 }
6422
6423 /* Detect how a text specified in CODING is encoded.  If a coding
6424    system is detected, update fields of CODING by the detected coding
6425    system.  */
6426
6427 void
6428 detect_coding (coding)
6429      struct coding_system *coding;
6430 {
6431   const unsigned char *src, *src_end;
6432   int saved_mode = coding->mode;
6433
6434   coding->consumed = coding->consumed_char = 0;
6435   coding->produced = coding->produced_char = 0;
6436   coding_set_source (coding);
6437
6438   src_end = coding->source + coding->src_bytes;
6439   coding->head_ascii = 0;
6440
6441   /* If we have not yet decided the text encoding type, detect it
6442      now.  */
6443   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6444     {
6445       int c, i;
6446       struct coding_detection_info detect_info;
6447       int null_byte_found = 0, eight_bit_found = 0;
6448
6449       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6450       for (src = coding->source; src < src_end; src++)
6451         {
6452           c = *src;
6453           if (c & 0x80)
6454             {
6455               eight_bit_found = 1;
6456               if (null_byte_found)
6457                 break;
6458             }
6459           else if (c < 0x20)
6460             {
6461               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6462                   && ! inhibit_iso_escape_detection
6463                   && ! detect_info.checked)
6464                 {
6465                   if (detect_coding_iso_2022 (coding, &detect_info))
6466                     {
6467                       /* We have scanned the whole data.  */
6468                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6469                         {
6470                           /* We didn't find an 8-bit code.  We may
6471                              have found a null-byte, but it's very
6472                              rare that a binary file confirm to
6473                              ISO-2022.  */
6474                           src = src_end;
6475                           coding->head_ascii = src - coding->source;
6476                         }
6477                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6478                       break;
6479                     }
6480                 }
6481               else if (! c && !inhibit_null_byte_detection)
6482                 {
6483                   null_byte_found = 1;
6484                   if (eight_bit_found)
6485                     break;
6486                 }
6487               if (! eight_bit_found)
6488                 coding->head_ascii++;
6489             }
6490           else if (! eight_bit_found)
6491             coding->head_ascii++;
6492         }
6493
6494       if (null_byte_found || eight_bit_found
6495           || coding->head_ascii < coding->src_bytes
6496           || detect_info.found)
6497         {
6498           enum coding_category category;
6499           struct coding_system *this;
6500
6501           if (coding->head_ascii == coding->src_bytes)
6502             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6503             for (i = 0; i < coding_category_raw_text; i++)
6504               {
6505                 category = coding_priorities[i];
6506                 this = coding_categories + category;
6507                 if (detect_info.found & (1 << category))
6508                   break;
6509               }
6510           else
6511             {
6512               if (null_byte_found)
6513                 {
6514                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6515                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6516                 }
6517               for (i = 0; i < coding_category_raw_text; i++)
6518                 {
6519                   category = coding_priorities[i];
6520                   this = coding_categories + category;
6521                   if (this->id < 0)
6522                     {
6523                       /* No coding system of this category is defined.  */
6524                       detect_info.rejected |= (1 << category);
6525                     }
6526                   else if (category >= coding_category_raw_text)
6527                     continue;
6528                   else if (detect_info.checked & (1 << category))
6529                     {
6530                       if (detect_info.found & (1 << category))
6531                         break;
6532                     }
6533                   else if ((*(this->detector)) (coding, &detect_info)
6534                            && detect_info.found & (1 << category))
6535                     {
6536                       if (category == coding_category_utf_16_auto)
6537                         {
6538                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6539                             category = coding_category_utf_16_le;
6540                           else
6541                             category = coding_category_utf_16_be;
6542                         }
6543                       break;
6544                     }
6545                 }
6546             }
6547
6548           if (i < coding_category_raw_text)
6549             setup_coding_system (CODING_ID_NAME (this->id), coding);
6550           else if (null_byte_found)
6551             setup_coding_system (Qno_conversion, coding);
6552           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6553                    == CATEGORY_MASK_ANY)
6554             setup_coding_system (Qraw_text, coding);
6555           else if (detect_info.rejected)
6556             for (i = 0; i < coding_category_raw_text; i++)
6557               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6558                 {
6559                   this = coding_categories + coding_priorities[i];
6560                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6561                   break;
6562                 }
6563         }
6564     }
6565   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6566            == coding_category_utf_8_auto)
6567     {
6568       Lisp_Object coding_systems;
6569       struct coding_detection_info detect_info;
6570
6571       coding_systems
6572         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6573       detect_info.found = detect_info.rejected = 0;
6574       coding->head_ascii = 0;
6575       if (CONSP (coding_systems)
6576           && detect_coding_utf_8 (coding, &detect_info))
6577         {
6578           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6579             setup_coding_system (XCAR (coding_systems), coding);
6580           else
6581             setup_coding_system (XCDR (coding_systems), coding);
6582         }
6583     }
6584   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6585            == coding_category_utf_16_auto)
6586     {
6587       Lisp_Object coding_systems;
6588       struct coding_detection_info detect_info;
6589
6590       coding_systems
6591         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6592       detect_info.found = detect_info.rejected = 0;
6593       coding->head_ascii = 0;
6594       if (CONSP (coding_systems)
6595           && detect_coding_utf_16 (coding, &detect_info))
6596         {
6597           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6598             setup_coding_system (XCAR (coding_systems), coding);
6599           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6600             setup_coding_system (XCDR (coding_systems), coding);
6601         }
6602     }
6603   coding->mode = saved_mode;
6604 }
6605
6606
6607 static void
6608 decode_eol (coding)
6609      struct coding_system *coding;
6610 {
6611   Lisp_Object eol_type;
6612   unsigned char *p, *pbeg, *pend;
6613
6614   eol_type = CODING_ID_EOL_TYPE (coding->id);
6615   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6616     return;
6617
6618   if (NILP (coding->dst_object))
6619     pbeg = coding->destination;
6620   else
6621     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6622   pend = pbeg + coding->produced;
6623
6624   if (VECTORP (eol_type))
6625     {
6626       int eol_seen = EOL_SEEN_NONE;
6627
6628       for (p = pbeg; p < pend; p++)
6629         {
6630           if (*p == '\n')
6631             eol_seen |= EOL_SEEN_LF;
6632           else if (*p == '\r')
6633             {
6634               if (p + 1 < pend && *(p + 1) == '\n')
6635                 {
6636                   eol_seen |= EOL_SEEN_CRLF;
6637                   p++;
6638                 }
6639               else
6640                 eol_seen |= EOL_SEEN_CR;
6641             }
6642         }
6643       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6644       if ((eol_seen & EOL_SEEN_CRLF) != 0
6645           && (eol_seen & EOL_SEEN_CR) != 0
6646           && (eol_seen & EOL_SEEN_LF) == 0)
6647         eol_seen = EOL_SEEN_CRLF;
6648       else if (eol_seen != EOL_SEEN_NONE
6649           && eol_seen != EOL_SEEN_LF
6650           && eol_seen != EOL_SEEN_CRLF
6651           && eol_seen != EOL_SEEN_CR)
6652         eol_seen = EOL_SEEN_LF;
6653       if (eol_seen != EOL_SEEN_NONE)
6654         eol_type = adjust_coding_eol_type (coding, eol_seen);
6655     }
6656
6657   if (EQ (eol_type, Qmac))
6658     {
6659       for (p = pbeg; p < pend; p++)
6660         if (*p == '\r')
6661           *p = '\n';
6662     }
6663   else if (EQ (eol_type, Qdos))
6664     {
6665       int n = 0;
6666
6667       if (NILP (coding->dst_object))
6668         {
6669           /* Start deleting '\r' from the tail to minimize the memory
6670              movement.  */
6671           for (p = pend - 2; p >= pbeg; p--)
6672             if (*p == '\r')
6673               {
6674                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6675                 n++;
6676               }
6677         }
6678       else
6679         {
6680           int pos_byte = coding->dst_pos_byte;
6681           int pos = coding->dst_pos;
6682           int pos_end = pos + coding->produced_char - 1;
6683
6684           while (pos < pos_end)
6685             {
6686               p = BYTE_POS_ADDR (pos_byte);
6687               if (*p == '\r' && p[1] == '\n')
6688                 {
6689                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6690                   n++;
6691                   pos_end--;
6692                 }
6693               pos++;
6694               if (coding->dst_multibyte)
6695                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6696               else
6697                 pos_byte++;
6698             }
6699         }
6700       coding->produced -= n;
6701       coding->produced_char -= n;
6702     }
6703 }
6704
6705
6706 /* Return a translation table (or list of them) from coding system
6707    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6708    decoding (ENCODEP is zero). */
6709
6710 static Lisp_Object
6711 get_translation_table (attrs, encodep, max_lookup)
6712      Lisp_Object attrs;
6713      int encodep, *max_lookup;
6714 {
6715   Lisp_Object standard, translation_table;
6716   Lisp_Object val;
6717
6718   if (NILP (Venable_character_translation))
6719     {
6720       if (max_lookup)
6721         *max_lookup = 0;
6722       return Qnil;
6723     }
6724   if (encodep)
6725     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6726       standard = Vstandard_translation_table_for_encode;
6727   else
6728     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6729       standard = Vstandard_translation_table_for_decode;
6730   if (NILP (translation_table))
6731     translation_table = standard;
6732   else
6733     {
6734       if (SYMBOLP (translation_table))
6735         translation_table = Fget (translation_table, Qtranslation_table);
6736       else if (CONSP (translation_table))
6737         {
6738           translation_table = Fcopy_sequence (translation_table);
6739           for (val = translation_table; CONSP (val); val = XCDR (val))
6740             if (SYMBOLP (XCAR (val)))
6741               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6742         }
6743       if (CHAR_TABLE_P (standard))
6744         {
6745           if (CONSP (translation_table))
6746             translation_table = nconc2 (translation_table,
6747                                         Fcons (standard, Qnil));
6748           else
6749             translation_table = Fcons (translation_table,
6750                                        Fcons (standard, Qnil));
6751         }
6752     }
6753
6754   if (max_lookup)
6755     {
6756       *max_lookup = 1;
6757       if (CHAR_TABLE_P (translation_table)
6758           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6759         {
6760           val = XCHAR_TABLE (translation_table)->extras[1];
6761           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6762             *max_lookup = XFASTINT (val);
6763         }
6764       else if (CONSP (translation_table))
6765         {
6766           Lisp_Object tail, val;
6767
6768           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6769             if (CHAR_TABLE_P (XCAR (tail))
6770                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6771               {
6772                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6773                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6774                   *max_lookup = XFASTINT (val);
6775               }
6776         }
6777     }
6778   return translation_table;
6779 }
6780
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and set TRANS to the result.  C and TRANS must be
   lvalues, and the arguments are evaluated more than once.

   If the value found for C is a character, C is replaced by that
   character and TRANS is reset to Qnil.  If TABLE is a list, the
   char-tables in it are consulted in order, each one with the
   current (possibly already translated) value of C, until one of
   them yields a value that is neither nil nor a character; TRANS is
   then left set to that value.  If TABLE is neither a char-table nor
   a list, TRANS is set to Qnil and C is unchanged.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    (trans) = Qnil;                                             \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        (trans) = CHAR_TABLE_REF (table, c);                    \
        if (CHARACTERP (trans))                                 \
          (c) = XFASTINT (trans), (trans) = Qnil;               \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        /* Named so as not to capture a caller's variable.  */  \
        Lisp_Object lookup_tail_;                               \
                                                                \
        for (lookup_tail_ = (table); CONSP (lookup_tail_);      \
             lookup_tail_ = XCDR (lookup_tail_))                \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))               \
            {                                                   \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), c); \
              if (CHARACTERP (trans))                           \
                (c) = XFASTINT (trans), (trans) = Qnil;         \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6805
6806
6807 /* Return a translation of character(s) at BUF according to TRANS.
6808    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6809    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6810    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6811    translation is found, and Qnil if not found..
6812    If BUF is too short to lookup characters in FROM, return Qt.  */
6813
6814 static Lisp_Object
6815 get_translation (trans, buf, buf_end)
6816      Lisp_Object trans;
6817      int *buf, *buf_end;
6818 {
6819
6820   if (INTEGERP (trans))
6821     return trans;
6822   for (; CONSP (trans); trans = XCDR (trans))
6823     {
6824       Lisp_Object val = XCAR (trans);
6825       Lisp_Object from = XCAR (val);
6826       int len = ASIZE (from);
6827       int i;
6828
6829       for (i = 0; i < len; i++)
6830         {
6831           if (buf + i == buf_end)
6832             return Qt;
6833           if (XINT (AREF (from, i)) != buf[i])
6834             break;
6835         }
6836       if (i == len)
6837         return val;
6838     }
6839   return Qnil;
6840 }
6841
6842
6843 static int
6844 produce_chars (coding, translation_table, last_block)
6845      struct coding_system *coding;
6846      Lisp_Object translation_table;
6847      int last_block;
6848 {
6849   unsigned char *dst = coding->destination + coding->produced;
6850   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6851   EMACS_INT produced;
6852   EMACS_INT produced_chars = 0;
6853   int carryover = 0;
6854
6855   if (! coding->chars_at_source)
6856     {
6857       /* Source characters are in coding->charbuf.  */
6858       int *buf = coding->charbuf;
6859       int *buf_end = buf + coding->charbuf_used;
6860
6861       if (EQ (coding->src_object, coding->dst_object))
6862         {
6863           coding_set_source (coding);
6864           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6865         }
6866
6867       while (buf < buf_end)
6868         {
6869           int c = *buf, i;
6870
6871           if (c >= 0)
6872             {
6873               int from_nchars = 1, to_nchars = 1;
6874               Lisp_Object trans = Qnil;
6875
6876               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6877               if (! NILP (trans))
6878                 {
6879                   trans = get_translation (trans, buf, buf_end);
6880                   if (INTEGERP (trans))
6881                     c = XINT (trans);
6882                   else if (CONSP (trans))
6883                     {
6884                       from_nchars = ASIZE (XCAR (trans));
6885                       trans = XCDR (trans);
6886                       if (INTEGERP (trans))
6887                         c = XINT (trans);
6888                       else
6889                         {
6890                           to_nchars = ASIZE (trans);
6891                           c = XINT (AREF (trans, 0));
6892                         }
6893                     }
6894                   else if (EQ (trans, Qt) && ! last_block)
6895                     break;
6896                 }
6897
6898               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6899                 {
6900                   dst = alloc_destination (coding,
6901                                            buf_end - buf
6902                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6903                                            dst);
6904                   if (EQ (coding->src_object, coding->dst_object))
6905                     {
6906                       coding_set_source (coding);
6907                       dst_end = (((unsigned char *) coding->source)
6908                                  + coding->consumed);
6909                     }
6910                   else
6911                     dst_end = coding->destination + coding->dst_bytes;
6912                 }
6913
6914               for (i = 0; i < to_nchars; i++)
6915                 {
6916                   if (i > 0)
6917                     c = XINT (AREF (trans, i));
6918                   if (coding->dst_multibyte
6919                       || ! CHAR_BYTE8_P (c))
6920                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6921                   else
6922                     *dst++ = CHAR_TO_BYTE8 (c);
6923                 }
6924               produced_chars += to_nchars;
6925               buf += from_nchars;
6926             }
6927           else
6928             /* This is an annotation datum.  (-C) is the length.  */
6929             buf += -c;
6930         }
6931       carryover = buf_end - buf;
6932     }
6933   else
6934     {
6935       /* Source characters are at coding->source.  */
6936       const unsigned char *src = coding->source;
6937       const unsigned char *src_end = src + coding->consumed;
6938
6939       if (EQ (coding->dst_object, coding->src_object))
6940         dst_end = (unsigned char *) src;
6941       if (coding->src_multibyte != coding->dst_multibyte)
6942         {
6943           if (coding->src_multibyte)
6944             {
6945               int multibytep = 1;
6946               EMACS_INT consumed_chars = 0;
6947
6948               while (1)
6949                 {
6950                   const unsigned char *src_base = src;
6951                   int c;
6952
6953                   ONE_MORE_BYTE (c);
6954                   if (dst == dst_end)
6955                     {
6956                       if (EQ (coding->src_object, coding->dst_object))
6957                         dst_end = (unsigned char *) src;
6958                       if (dst == dst_end)
6959                         {
6960                           EMACS_INT offset = src - coding->source;
6961
6962                           dst = alloc_destination (coding, src_end - src + 1,
6963                                                    dst);
6964                           dst_end = coding->destination + coding->dst_bytes;
6965                           coding_set_source (coding);
6966                           src = coding->source + offset;
6967                           src_end = coding->source + coding->src_bytes;
6968                           if (EQ (coding->src_object, coding->dst_object))
6969                             dst_end = (unsigned char *) src;
6970                         }
6971                     }
6972                   *dst++ = c;
6973                   produced_chars++;
6974                 }
6975             no_more_source:
6976               ;
6977             }
6978           else
6979             while (src < src_end)
6980               {
6981                 int multibytep = 1;
6982                 int c = *src++;
6983
6984                 if (dst >= dst_end - 1)
6985                   {
6986                     if (EQ (coding->src_object, coding->dst_object))
6987                       dst_end = (unsigned char *) src;
6988                     if (dst >= dst_end - 1)
6989                       {
6990                         EMACS_INT offset = src - coding->source;
6991                         EMACS_INT more_bytes;
6992
6993                         if (EQ (coding->src_object, coding->dst_object))
6994                           more_bytes = ((src_end - src) / 2) + 2;
6995                         else
6996                           more_bytes = src_end - src + 2;
6997                         dst = alloc_destination (coding, more_bytes, dst);
6998                         dst_end = coding->destination + coding->dst_bytes;
6999                         coding_set_source (coding);
7000                         src = coding->source + offset;
7001                         src_end = coding->source + coding->src_bytes;
7002                         if (EQ (coding->src_object, coding->dst_object))
7003                           dst_end = (unsigned char *) src;
7004                       }
7005                   }
7006                 EMIT_ONE_BYTE (c);
7007               }
7008         }
7009       else
7010         {
7011           if (!EQ (coding->src_object, coding->dst_object))
7012             {
7013               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
7014
7015               if (require > 0)
7016                 {
7017                   EMACS_INT offset = src - coding->source;
7018
7019                   dst = alloc_destination (coding, require, dst);
7020                   coding_set_source (coding);
7021                   src = coding->source + offset;
7022                   src_end = coding->source + coding->src_bytes;
7023                 }
7024             }
7025           produced_chars = coding->consumed_char;
7026           while (src < src_end)
7027             *dst++ = *src++;
7028         }
7029     }
7030
7031   produced = dst - (coding->destination + coding->produced);
7032   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7033     insert_from_gap (produced_chars, produced);
7034   coding->produced += produced;
7035   coding->produced_char += produced_chars;
7036   return carryover;
7037 }
7038
7039 /* Compose text in CODING->object according to the annotation data at
7040    CHARBUF.  CHARBUF is an array:
7041      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7042  */
7043
7044 static INLINE void
7045 produce_composition (coding, charbuf, pos)
7046      struct coding_system *coding;
7047      int *charbuf;
7048      EMACS_INT pos;
7049 {
7050   int len;
7051   EMACS_INT to;
7052   enum composition_method method;
7053   Lisp_Object components;
7054
7055   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7056   to = pos + charbuf[2];
7057   method = (enum composition_method) (charbuf[4]);
7058
7059   if (method == COMPOSITION_RELATIVE)
7060     components = Qnil;
7061   else
7062     {
7063       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7064       int i, j;
7065
7066       if (method == COMPOSITION_WITH_RULE)
7067         len = charbuf[2] * 3 - 2;
7068       charbuf += MAX_ANNOTATION_LENGTH;
7069       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7070       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7071         {
7072           if (charbuf[i] >= 0)
7073             args[j] = make_number (charbuf[i]);
7074           else
7075             {
7076               i++;
7077               args[j] = make_number (charbuf[i] % 0x100);
7078             }
7079         }
7080       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7081     }
7082   compose_text (pos, to, components, Qnil, coding->dst_object);
7083 }
7084
7085
7086 /* Put `charset' property on text in CODING->object according to
7087    the annotation data at CHARBUF.  CHARBUF is an array:
7088      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7089  */
7090
7091 static INLINE void
7092 produce_charset (coding, charbuf, pos)
7093      struct coding_system *coding;
7094      int *charbuf;
7095      EMACS_INT pos;
7096 {
7097   EMACS_INT from = pos - charbuf[2];
7098   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7099
7100   Fput_text_property (make_number (from), make_number (pos),
7101                       Qcharset, CHARSET_NAME (charset),
7102                       coding->dst_object);
7103 }
7104
7105
/* The number of elements first tried for the conversion work
   area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the conversion work area CODING->charbuf, an array of
   ints, with alloca, and record its number of elements in
   CODING->charbuf_size.  CHARBUF_SIZE elements are tried first, and
   the number is halved after each failure as long as it is greater
   than 1024.

   If no allocation succeeds, record CODING_RESULT_INSUFFICIENT_MEM
   and RETURN CODING->result FROM THE FUNCTION THAT USES THIS MACRO.
   The macro can thus be used only in a function that returns such a
   value.  CODING is evaluated more than once.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    /* Named so as not to capture a caller's variable.  */              \
    int work_area_size_ = CHARBUF_SIZE;                                 \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (work_area_size_ > 1024)                                      \
      {                                                                 \
        (coding)->charbuf                                               \
          = (int *) alloca (sizeof (int) * work_area_size_);            \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        work_area_size_ >>= 1;                                          \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = work_area_size_;                           \
  } while (0)
7127
7128
7129 static void
7130 produce_annotation (coding, pos)
7131      struct coding_system *coding;
7132      EMACS_INT pos;
7133 {
7134   int *charbuf = coding->charbuf;
7135   int *charbuf_end = charbuf + coding->charbuf_used;
7136
7137   if (NILP (coding->dst_object))
7138     return;
7139
7140   while (charbuf < charbuf_end)
7141     {
7142       if (*charbuf >= 0)
7143         pos++, charbuf++;
7144       else
7145         {
7146           int len = -*charbuf;
7147
7148           if (len > 2)
7149             switch (charbuf[1])
7150               {
7151               case CODING_ANNOTATE_COMPOSITION_MASK:
7152                 produce_composition (coding, charbuf, pos);
7153                 break;
7154               case CODING_ANNOTATE_CHARSET_MASK:
7155                 produce_charset (coding, charbuf, pos);
7156                 break;
7157               }
7158           charbuf += len;
7159         }
7160     }
7161 }
7162
7163 /* Decode the data at CODING->src_object into CODING->dst_object.
7164    CODING->src_object is a buffer, a string, or nil.
7165    CODING->dst_object is a buffer.
7166
7167    If CODING->src_object is a buffer, it must be the current buffer.
7168    In this case, if CODING->src_pos is positive, it is a position of
7169    the source text in the buffer, otherwise, the source text is in the
7170    gap area of the buffer, and CODING->src_pos specifies the offset of
7171    the text from GPT (which must be the same as PT).  If this is the
7172    same buffer as CODING->dst_object, CODING->src_pos must be
7173    negative.
7174
7175    If CODING->src_object is a string, CODING->src_pos is an index to
7176    that string.
7177
7178    If CODING->src_object is nil, CODING->source must already point to
7179    the non-relocatable memory area.  In this case, CODING->src_pos is
7180    an offset from CODING->source.
7181
7182    The decoded data is inserted at the current point of the buffer
7183    CODING->dst_object.
7184 */
7185
static int
decode_coding (coding)
     struct coding_system *coding;
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  struct ccl_spec cclspec;
  int carryover;
  int i;

  /* If the source text lies in a buffer and straddles the gap, move
     the gap to the start of the source text.  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  /* Make the destination buffer current, put the gap at point, and
     replace the buffer's undo list by Qt for the duration of the
     conversion (presumably to suppress undo recording); the saved
     value is put back just before returning.  */
  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      if (current_buffer != XBUFFER (coding->dst_object))
        set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);
      undo_list = current_buffer->undo_list;
      current_buffer->undo_list = Qt;
    }

  /* Reset the progress counters and the result state.  */
  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->errors = 0;

  /* Allocates coding->charbuf with alloca, so the buffer belongs to
     this stack frame.  On allocation failure the macro returns
     coding->result from this function.
     NOTE(review): that early return would skip the restoration of
     current_buffer->undo_list at the end of this function.  */
  ALLOC_CONVERSION_WORK_AREA (coding);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  carryover = 0;
  if (coding->decoder == decode_coding_ccl)
    {
      /* The CCL decoder keeps its state in CCLSPEC, which lives on
         this function's stack.  */
      coding->spec.ccl = &cclspec;
      setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
    }
  /* Main loop: each round lets the decoder fill coding->charbuf
     (after the CARRYOVER elements kept from the previous round), then
     writes the characters to the destination and applies any
     annotations the decoder produced.  */
  do
    {
      /* Destination position of the first character of this round.  */
      EMACS_INT pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      /* The return value of produce_chars is treated as the number of
         trailing charbuf elements to keep for the next round.  */
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      /* Move the kept elements to the head of charbuf.  */
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  while (coding->result == CODING_RESULT_INSUFFICIENT_DST
         || (coding->consumed < coding->src_bytes
             && (coding->result == CODING_RESULT_SUCCESS
                 || coding->result == CODING_RESULT_INVALID_SRC)));

  /* Elements still carried over after the last round are produced
     now, this time with the last argument of produce_chars set to 1.  */
  if (carryover > 0)
    {
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  /* Deal with source bytes the decoder left unconsumed.  */
  coding->carryover_bytes = 0;
  if (coding->consumed < coding->src_bytes)
    {
      int nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  (No bound check is performed here; the
             loop relies on that invariant.)  */
          coding->charbuf_used = 0;
          coding->chars_at_source = 0;

          while (nbytes-- > 0)
            {
              int c = *src++;

              /* Bytes with the high bit set are converted with
                 BYTE8_TO_CHAR; other bytes are stored unchanged.  */
              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  (The count is nevertheless clamped to
             the size of that array below.)  */
          unsigned char *p = coding->carryover;

          if (nbytes > sizeof coding->carryover)
            nbytes = sizeof coding->carryover;
          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      /* In both cases the whole source is now accounted for.  */
      coding->consumed = coding->src_bytes;
    }

  /* Apply end-of-line decoding unless the EOL type is `unix' or EOL
     conversion is inhibited.  */
  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
      && !inhibit_eol_conversion)
    decode_eol (coding);
  if (BUFFERP (coding->dst_object))
    {
      /* Put back the undo list saved above and record the whole
         decoded text as a single insertion.  */
      current_buffer->undo_list = undo_list;
      record_insert (coding->dst_pos, coding->produced_char);
    }
  return coding->result;
}
7313
7314
7315 /* Extract an annotation datum from a composition starting at POS and
7316    ending before LIMIT of CODING->src_object (buffer or string), store
7317    the data in BUF, set *STOP to a starting position of the next
7318    composition (if any) or to LIMIT, and return the address of the
7319    next element of BUF.
7320
7321    If such an annotation is not found, set *STOP to a starting
7322    position of a composition after POS (if any) or to LIMIT, and
7323    return BUF.  */
7324
7325 static INLINE int *
7326 handle_composition_annotation (pos, limit, coding, buf, stop)
7327      EMACS_INT pos, limit;
7328      struct coding_system *coding;
7329      int *buf;
7330      EMACS_INT *stop;
7331 {
7332   EMACS_INT start, end;
7333   Lisp_Object prop;
7334
7335   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7336       || end > limit)
7337     *stop = limit;
7338   else if (start > pos)
7339     *stop = start;
7340   else
7341     {
7342       if (start == pos)
7343         {
7344           /* We found a composition.  Store the corresponding
7345              annotation data in BUF.  */
7346           int *head = buf;
7347           enum composition_method method = COMPOSITION_METHOD (prop);
7348           int nchars = COMPOSITION_LENGTH (prop);
7349
7350           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7351           if (method != COMPOSITION_RELATIVE)
7352             {
7353               Lisp_Object components;
7354               int len, i, i_byte;
7355
7356               components = COMPOSITION_COMPONENTS (prop);
7357               if (VECTORP (components))
7358                 {
7359                   len = XVECTOR (components)->size;
7360                   for (i = 0; i < len; i++)
7361                     *buf++ = XINT (AREF (components, i));
7362                 }
7363               else if (STRINGP (components))
7364                 {
7365                   len = SCHARS (components);
7366                   i = i_byte = 0;
7367                   while (i < len)
7368                     {
7369                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7370                       buf++;
7371                     }
7372                 }
7373               else if (INTEGERP (components))
7374                 {
7375                   len = 1;
7376                   *buf++ = XINT (components);
7377                 }
7378               else if (CONSP (components))
7379                 {
7380                   for (len = 0; CONSP (components);
7381                        len++, components = XCDR (components))
7382                     *buf++ = XINT (XCAR (components));
7383                 }
7384               else
7385                 abort ();
7386               *head -= len;
7387             }
7388         }
7389
7390       if (find_composition (end, limit, &start, &end, &prop,
7391                             coding->src_object)
7392           && end <= limit)
7393         *stop = start;
7394       else
7395         *stop = limit;
7396     }
7397   return buf;
7398 }
7399
7400
7401 /* Extract an annotation datum from a text property `charset' at POS of
7402    CODING->src_object (buffer of string), store the data in BUF, set
7403    *STOP to the position where the value of `charset' property changes
7404    (limiting by LIMIT), and return the address of the next element of
7405    BUF.
7406
7407    If the property value is nil, set *STOP to the position where the
7408    property value is non-nil (limiting by LIMIT), and return BUF.  */
7409
7410 static INLINE int *
7411 handle_charset_annotation (pos, limit, coding, buf, stop)
7412      EMACS_INT pos, limit;
7413      struct coding_system *coding;
7414      int *buf;
7415      EMACS_INT *stop;
7416 {
7417   Lisp_Object val, next;
7418   int id;
7419
7420   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7421   if (! NILP (val) && CHARSETP (val))
7422     id = XINT (CHARSET_SYMBOL_ID (val));
7423   else
7424     id = -1;
7425   ADD_CHARSET_DATA (buf, 0, id);
7426   next = Fnext_single_property_change (make_number (pos), Qcharset,
7427                                        coding->src_object,
7428                                        make_number (limit));
7429   *stop = XINT (next);
7430   return buf;
7431 }
7432
7433
/* Read characters from CODING->source, starting at byte offset
   CODING->consumed, and store them in CODING->charbuf, inserting
   annotation data, converting end-of-line characters, and applying
   TRANSLATION_TABLE (unless it is nil) on the way.  MAX_LOOKUP is the
   number of characters that may be looked at for one translation; a
   scratch buffer of that many elements is allocated when
   TRANSLATION_TABLE is non-nil.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated, and CODING->chars_at_source is
   set to 0.  */

static void
consume_chars (coding, translation_table, max_lookup)
     struct coding_system *coding;
     Lisp_Object translation_table;
     int max_lookup;
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  EMACS_INT pos = coding->src_pos + coding->consumed_char;
  EMACS_INT end_pos = coding->src_pos + coding->src_chars;
  int multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  /* STOP is the next position at which an annotation must be looked
     up; it is the smaller of STOP_COMPOSITION and STOP_CHARSET.  */
  EMACS_INT stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  /* An EOL type that is a vector is handled like `unix' here, as is
     any EOL type when EOL conversion is inhibited.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  */
  /* Because the flag is cleared here, the composition test below is
     always false and STOP_COMPOSITION always starts at END_POS.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  */
  /* I.e. keep room for one extra element (the CR emitted before LF for
     the `dos' EOL type) plus one annotation header.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          if (pos == end_pos)
            break;
          /* Emit the annotations that begin at POS and compute the
             next position at which to stop.  */
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      /* Fetch the next character C and advance SRC and POS.  */
      if (! multibytep)
        {
          EMACS_INT bytes;

          /* For the raw-text and CCL encoders each byte is taken as
             is.  Otherwise, a multibyte sequence recognized by
             MULTIBYTE_LENGTH is read as one character and POS is
             advanced by its byte length (presumably because positions
             in a unibyte source count bytes -- TODO confirm); any
             other byte is converted with BYTE8_TO_CHAR.  */
          if (coding->encoder == encode_coding_raw_text
              || coding->encoder == encode_coding_ccl)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      /* In selective-display mode a CR is treated like a newline.  */
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      /* End-of-line conversion: for `dos' a CR is emitted before the
         LF; for any other non-unix type the LF is replaced by CR.  */
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          /* Number of source characters replaced, and number of
             characters they are replaced with.  */
          int from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C and up to MAX_LOOKUP - 1 following characters
             (without consuming them) as the lookup key.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              /* (FROM . TO): FROM is an array whose length is the
                 number of source characters matched; TO is either an
                 integer or an array of replacement characters.  */
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  /* Not enough room left in charbuf: stop here.  */
                  if (buf + to_nchars > buf_end)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the remaining source characters that were matched.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7570
7571
7572 /* Encode the text at CODING->src_object into CODING->dst_object.
7573    CODING->src_object is a buffer or a string.
7574    CODING->dst_object is a buffer or nil.
7575
7576    If CODING->src_object is a buffer, it must be the current buffer.
7577    In this case, if CODING->src_pos is positive, it is a position of
7578    the source text in the buffer, otherwise. the source text is in the
7579    gap area of the buffer, and coding->src_pos specifies the offset of
7580    the text from GPT (which must be the same as PT).  If this is the
7581    same buffer as CODING->dst_object, CODING->src_pos must be
7582    negative and CODING should not have `pre-write-conversion'.
7583
7584    If CODING->src_object is a string, CODING should not have
7585    `pre-write-conversion'.
7586
7587    If CODING->dst_object is a buffer, the encoded data is inserted at
7588    the current point of that buffer.
7589
7590    If CODING->dst_object is nil, the encoded data is placed at the
7591    memory area specified by CODING->destination.  */
7592
7593 static int
7594 encode_coding (coding)
7595      struct coding_system *coding;
7596 {
7597   Lisp_Object attrs;
7598   Lisp_Object translation_table;
7599   int max_lookup;
7600   struct ccl_spec cclspec;
7601
7602   attrs = CODING_ID_ATTRS (coding->id);
7603   if (coding->encoder == encode_coding_raw_text)
7604     translation_table = Qnil, max_lookup = 0;
7605   else
7606     translation_table = get_translation_table (attrs, 1, &max_lookup);
7607
7608   if (BUFFERP (coding->dst_object))
7609     {
7610       set_buffer_internal (XBUFFER (coding->dst_object));
7611       coding->dst_multibyte
7612         = ! NILP (current_buffer->enable_multibyte_characters);
7613     }
7614
7615   coding->consumed = coding->consumed_char = 0;
7616   coding->produced = coding->produced_char = 0;
7617   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7618   coding->errors = 0;
7619
7620   ALLOC_CONVERSION_WORK_AREA (coding);
7621
7622   if (coding->encoder == encode_coding_ccl)
7623     {
7624       coding->spec.ccl = &cclspec;
7625       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7626     }
7627   do {
7628     coding_set_source (coding);
7629     consume_chars (coding, translation_table, max_lookup);
7630     coding_set_destination (coding);
7631     (*(coding->encoder)) (coding);
7632   } while (coding->consumed_char < coding->src_chars);
7633
7634   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7635     insert_from_gap (coding->produced_char, coding->produced);
7636
7637   return (coding->result);
7638 }
7639
7640
/* Name (or base name) of work buffer for code conversion.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments this on every call, and
   code_conversion_restore resets it to 0 when the reused work buffer
   is released.  */
static int reused_workbuf_in_use;
7653
7654
7655 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
7656    multibyteness of returning buffer.  */
7657
7658 static Lisp_Object
7659 make_conversion_work_buffer (multibyte)
7660      int multibyte;
7661 {
7662   Lisp_Object name, workbuf;
7663   struct buffer *current;
7664
7665   if (reused_workbuf_in_use++)
7666     {
7667       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7668       workbuf = Fget_buffer_create (name);
7669     }
7670   else
7671     {
7672       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7673         Vcode_conversion_reused_workbuf
7674           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7675       workbuf = Vcode_conversion_reused_workbuf;
7676     }
7677   current = current_buffer;
7678   set_buffer_internal (XBUFFER (workbuf));
7679   /* We can't allow modification hooks to run in the work buffer.  For
7680      instance, directory_files_internal assumes that file decoding
7681      doesn't compile new regexps.  */
7682   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7683   Ferase_buffer ();
7684   current_buffer->undo_list = Qt;
7685   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7686   set_buffer_internal (current);
7687   return workbuf;
7688 }
7689
7690
7691 static Lisp_Object
7692 code_conversion_restore (arg)
7693      Lisp_Object arg;
7694 {
7695   Lisp_Object current, workbuf;
7696   struct gcpro gcpro1;
7697
7698   GCPRO1 (arg);
7699   current = XCAR (arg);
7700   workbuf = XCDR (arg);
7701   if (! NILP (workbuf))
7702     {
7703       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7704         reused_workbuf_in_use = 0;
7705       else if (! NILP (Fbuffer_live_p (workbuf)))
7706         Fkill_buffer (workbuf);
7707     }
7708   set_buffer_internal (XBUFFER (current));
7709   UNGCPRO;
7710   return Qnil;
7711 }
7712
7713 Lisp_Object
7714 code_conversion_save (with_work_buf, multibyte)
7715      int with_work_buf, multibyte;
7716 {
7717   Lisp_Object workbuf = Qnil;
7718
7719   if (with_work_buf)
7720     workbuf = make_conversion_work_buffer (multibyte);
7721   record_unwind_protect (code_conversion_restore,
7722                          Fcons (Fcurrent_buffer (), workbuf));
7723   return workbuf;
7724 }
7725
7726 int
7727 decode_coding_gap (coding, chars, bytes)
7728      struct coding_system *coding;
7729      EMACS_INT chars, bytes;
7730 {
7731   int count = specpdl_ptr - specpdl;
7732   Lisp_Object attrs;
7733
7734   code_conversion_save (0, 0);
7735
7736   coding->src_object = Fcurrent_buffer ();
7737   coding->src_chars = chars;
7738   coding->src_bytes = bytes;
7739   coding->src_pos = -chars;
7740   coding->src_pos_byte = -bytes;
7741   coding->src_multibyte = chars < bytes;
7742   coding->dst_object = coding->src_object;
7743   coding->dst_pos = PT;
7744   coding->dst_pos_byte = PT_BYTE;
7745   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7746
7747   if (CODING_REQUIRE_DETECTION (coding))
7748     detect_coding (coding);
7749
7750   coding->mode |= CODING_MODE_LAST_BLOCK;
7751   current_buffer->text->inhibit_shrinking = 1;
7752   decode_coding (coding);
7753   current_buffer->text->inhibit_shrinking = 0;
7754
7755   attrs = CODING_ID_ATTRS (coding->id);
7756   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7757     {
7758       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7759       Lisp_Object val;
7760
7761       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7762       val = call1 (CODING_ATTR_POST_READ (attrs),
7763                    make_number (coding->produced_char));
7764       CHECK_NATNUM (val);
7765       coding->produced_char += Z - prev_Z;
7766       coding->produced += Z_BYTE - prev_Z_BYTE;
7767     }
7768
7769   unbind_to (count, Qnil);
7770   return coding->result;
7771 }
7772
7773 int
7774 encode_coding_gap (coding, chars, bytes)
7775      struct coding_system *coding;
7776      EMACS_INT chars, bytes;
7777 {
7778   int count = specpdl_ptr - specpdl;
7779
7780   code_conversion_save (0, 0);
7781
7782   coding->src_object = Fcurrent_buffer ();
7783   coding->src_chars = chars;
7784   coding->src_bytes = bytes;
7785   coding->src_pos = -chars;
7786   coding->src_pos_byte = -bytes;
7787   coding->src_multibyte = chars < bytes;
7788   coding->dst_object = coding->src_object;
7789   coding->dst_pos = PT;
7790   coding->dst_pos_byte = PT_BYTE;
7791
7792   encode_coding (coding);
7793
7794   unbind_to (count, Qnil);
7795   return coding->result;
7796 }
7797
7798
7799 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7800    SRC_OBJECT into DST_OBJECT by coding context CODING.
7801
7802    SRC_OBJECT is a buffer, a string, or Qnil.
7803
7804    If it is a buffer, the text is at point of the buffer.  FROM and TO
7805    are positions in the buffer.
7806
7807    If it is a string, the text is at the beginning of the string.
7808    FROM and TO are indices to the string.
7809
7810    If it is nil, the text is at coding->source.  FROM and TO are
7811    indices to coding->source.
7812
7813    DST_OBJECT is a buffer, Qt, or Qnil.
7814
7815    If it is a buffer, the decoded text is inserted at point of the
7816    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7817    is deleted.
7818
7819    If it is Qt, a string is made from the decoded text, and
7820    set in CODING->dst_object.
7821
7822    If it is Qnil, the decoded text is stored at CODING->destination.
7823    The caller must allocate CODING->dst_bytes bytes at
7824    CODING->destination by xmalloc.  If the decoded text is longer than
7825    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7826  */
7827
7828 void
7829 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7830                       dst_object)
7831      struct coding_system *coding;
7832      Lisp_Object src_object;
7833      EMACS_INT from, from_byte, to, to_byte;
7834      Lisp_Object dst_object;
7835 {
7836   int count = specpdl_ptr - specpdl;
7837   unsigned char *destination;
7838   EMACS_INT dst_bytes;
7839   EMACS_INT chars = to - from;
7840   EMACS_INT bytes = to_byte - from_byte;
7841   Lisp_Object attrs;
7842   int saved_pt = -1, saved_pt_byte;
7843   int need_marker_adjustment = 0;
7844   Lisp_Object old_deactivate_mark;
7845
7846   old_deactivate_mark = Vdeactivate_mark;
7847
7848   if (NILP (dst_object))
7849     {
7850       destination = coding->destination;
7851       dst_bytes = coding->dst_bytes;
7852     }
7853
7854   coding->src_object = src_object;
7855   coding->src_chars = chars;
7856   coding->src_bytes = bytes;
7857   coding->src_multibyte = chars < bytes;
7858
7859   if (STRINGP (src_object))
7860     {
7861       coding->src_pos = from;
7862       coding->src_pos_byte = from_byte;
7863     }
7864   else if (BUFFERP (src_object))
7865     {
7866       set_buffer_internal (XBUFFER (src_object));
7867       if (from != GPT)
7868         move_gap_both (from, from_byte);
7869       if (EQ (src_object, dst_object))
7870         {
7871           struct Lisp_Marker *tail;
7872
7873           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7874             {
7875               tail->need_adjustment
7876                 = tail->charpos == (tail->insertion_type ? from : to);
7877               need_marker_adjustment |= tail->need_adjustment;
7878             }
7879           saved_pt = PT, saved_pt_byte = PT_BYTE;
7880           TEMP_SET_PT_BOTH (from, from_byte);
7881           current_buffer->text->inhibit_shrinking = 1;
7882           del_range_both (from, from_byte, to, to_byte, 1);
7883           coding->src_pos = -chars;
7884           coding->src_pos_byte = -bytes;
7885         }
7886       else
7887         {
7888           coding->src_pos = from;
7889           coding->src_pos_byte = from_byte;
7890         }
7891     }
7892
7893   if (CODING_REQUIRE_DETECTION (coding))
7894     detect_coding (coding);
7895   attrs = CODING_ID_ATTRS (coding->id);
7896
7897   if (EQ (dst_object, Qt)
7898       || (! NILP (CODING_ATTR_POST_READ (attrs))
7899           && NILP (dst_object)))
7900     {
7901       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7902       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7903       coding->dst_pos = BEG;
7904       coding->dst_pos_byte = BEG_BYTE;
7905     }
7906   else if (BUFFERP (dst_object))
7907     {
7908       code_conversion_save (0, 0);
7909       coding->dst_object = dst_object;
7910       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7911       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7912       coding->dst_multibyte
7913         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7914     }
7915   else
7916     {
7917       code_conversion_save (0, 0);
7918       coding->dst_object = Qnil;
7919       /* Most callers presume this will return a multibyte result, and they
7920          won't use `binary' or `raw-text' anyway, so let's not worry about
7921          CODING_FOR_UNIBYTE.  */
7922       coding->dst_multibyte = 1;
7923     }
7924
7925   decode_coding (coding);
7926
7927   if (BUFFERP (coding->dst_object))
7928     set_buffer_internal (XBUFFER (coding->dst_object));
7929
7930   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7931     {
7932       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7933       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7934       Lisp_Object val;
7935
7936       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7937       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7938               old_deactivate_mark);
7939       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7940                         make_number (coding->produced_char));
7941       UNGCPRO;
7942       CHECK_NATNUM (val);
7943       coding->produced_char += Z - prev_Z;
7944       coding->produced += Z_BYTE - prev_Z_BYTE;
7945     }
7946
7947   if (EQ (dst_object, Qt))
7948     {
7949       coding->dst_object = Fbuffer_string ();
7950     }
7951   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7952     {
7953       set_buffer_internal (XBUFFER (coding->dst_object));
7954       if (dst_bytes < coding->produced)
7955         {
7956           destination = xrealloc (destination, coding->produced);
7957           if (! destination)
7958             {
7959               record_conversion_result (coding,
7960                                         CODING_RESULT_INSUFFICIENT_MEM);
7961               unbind_to (count, Qnil);
7962               return;
7963             }
7964           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7965             move_gap_both (BEGV, BEGV_BYTE);
7966           bcopy (BEGV_ADDR, destination, coding->produced);
7967           coding->destination = destination;
7968         }
7969     }
7970
7971   if (saved_pt >= 0)
7972     {
7973       /* This is the case of:
7974          (BUFFERP (src_object) && EQ (src_object, dst_object))
7975          As we have moved PT while replacing the original buffer
7976          contents, we must recover it now.  */
7977       set_buffer_internal (XBUFFER (src_object));
7978       current_buffer->text->inhibit_shrinking = 0;
7979       if (saved_pt < from)
7980         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7981       else if (saved_pt < from + chars)
7982         TEMP_SET_PT_BOTH (from, from_byte);
7983       else if (! NILP (current_buffer->enable_multibyte_characters))
7984         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7985                           saved_pt_byte + (coding->produced - bytes));
7986       else
7987         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7988                           saved_pt_byte + (coding->produced - bytes));
7989
7990       if (need_marker_adjustment)
7991         {
7992           struct Lisp_Marker *tail;
7993
7994           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7995             if (tail->need_adjustment)
7996               {
7997                 tail->need_adjustment = 0;
7998                 if (tail->insertion_type)
7999                   {
8000                     tail->bytepos = from_byte;
8001                     tail->charpos = from;
8002                   }
8003                 else
8004                   {
8005                     tail->bytepos = from_byte + coding->produced;
8006                     tail->charpos
8007                       = (NILP (current_buffer->enable_multibyte_characters)
8008                          ? tail->bytepos : from + coding->produced_char);
8009                   }
8010               }
8011         }
8012     }
8013
8014   Vdeactivate_mark = old_deactivate_mark;
8015   unbind_to (count, coding->dst_object);
8016 }
8017
8018
8019 void
8020 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
8021                       dst_object)
8022      struct coding_system *coding;
8023      Lisp_Object src_object;
8024      EMACS_INT from, from_byte, to, to_byte;
8025      Lisp_Object dst_object;
8026 {
8027   int count = specpdl_ptr - specpdl;
8028   EMACS_INT chars = to - from;
8029   EMACS_INT bytes = to_byte - from_byte;
8030   Lisp_Object attrs;
8031   int saved_pt = -1, saved_pt_byte;
8032   int need_marker_adjustment = 0;
8033   int kill_src_buffer = 0;
8034   Lisp_Object old_deactivate_mark;
8035
8036   old_deactivate_mark = Vdeactivate_mark;
8037
8038   coding->src_object = src_object;
8039   coding->src_chars = chars;
8040   coding->src_bytes = bytes;
8041   coding->src_multibyte = chars < bytes;
8042
8043   attrs = CODING_ID_ATTRS (coding->id);
8044
8045   if (EQ (src_object, dst_object))
8046     {
8047       struct Lisp_Marker *tail;
8048
8049       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8050         {
8051           tail->need_adjustment
8052             = tail->charpos == (tail->insertion_type ? from : to);
8053           need_marker_adjustment |= tail->need_adjustment;
8054         }
8055     }
8056
8057   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8058     {
8059       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8060       set_buffer_internal (XBUFFER (coding->src_object));
8061       if (STRINGP (src_object))
8062         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8063       else if (BUFFERP (src_object))
8064         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8065       else
8066         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
8067
8068       if (EQ (src_object, dst_object))
8069         {
8070           set_buffer_internal (XBUFFER (src_object));
8071           saved_pt = PT, saved_pt_byte = PT_BYTE;
8072           del_range_both (from, from_byte, to, to_byte, 1);
8073           set_buffer_internal (XBUFFER (coding->src_object));
8074         }
8075
8076       {
8077         Lisp_Object args[3];
8078         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8079
8080         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8081                 old_deactivate_mark);
8082         args[0] = CODING_ATTR_PRE_WRITE (attrs);
8083         args[1] = make_number (BEG);
8084         args[2] = make_number (Z);
8085         safe_call (3, args);
8086         UNGCPRO;
8087       }
8088       if (XBUFFER (coding->src_object) != current_buffer)
8089         kill_src_buffer = 1;
8090       coding->src_object = Fcurrent_buffer ();
8091       if (BEG != GPT)
8092         move_gap_both (BEG, BEG_BYTE);
8093       coding->src_chars = Z - BEG;
8094       coding->src_bytes = Z_BYTE - BEG_BYTE;
8095       coding->src_pos = BEG;
8096       coding->src_pos_byte = BEG_BYTE;
8097       coding->src_multibyte = Z < Z_BYTE;
8098     }
8099   else if (STRINGP (src_object))
8100     {
8101       code_conversion_save (0, 0);
8102       coding->src_pos = from;
8103       coding->src_pos_byte = from_byte;
8104     }
8105   else if (BUFFERP (src_object))
8106     {
8107       code_conversion_save (0, 0);
8108       set_buffer_internal (XBUFFER (src_object));
8109       if (EQ (src_object, dst_object))
8110         {
8111           saved_pt = PT, saved_pt_byte = PT_BYTE;
8112           coding->src_object = del_range_1 (from, to, 1, 1);
8113           coding->src_pos = 0;
8114           coding->src_pos_byte = 0;
8115         }
8116       else
8117         {
8118           if (from < GPT && to >= GPT)
8119             move_gap_both (from, from_byte);
8120           coding->src_pos = from;
8121           coding->src_pos_byte = from_byte;
8122         }
8123     }
8124   else
8125     code_conversion_save (0, 0);
8126
8127   if (BUFFERP (dst_object))
8128     {
8129       coding->dst_object = dst_object;
8130       if (EQ (src_object, dst_object))
8131         {
8132           coding->dst_pos = from;
8133           coding->dst_pos_byte = from_byte;
8134         }
8135       else
8136         {
8137           struct buffer *current = current_buffer;
8138
8139           set_buffer_temp (XBUFFER (dst_object));
8140           coding->dst_pos = PT;
8141           coding->dst_pos_byte = PT_BYTE;
8142           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8143           set_buffer_temp (current);
8144         }
8145       coding->dst_multibyte
8146         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8147     }
8148   else if (EQ (dst_object, Qt))
8149     {
8150       coding->dst_object = Qnil;
8151       coding->dst_bytes = coding->src_chars;
8152       if (coding->dst_bytes == 0)
8153         coding->dst_bytes = 1;
8154       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8155       coding->dst_multibyte = 0;
8156     }
8157   else
8158     {
8159       coding->dst_object = Qnil;
8160       coding->dst_multibyte = 0;
8161     }
8162
8163   encode_coding (coding);
8164
8165   if (EQ (dst_object, Qt))
8166     {
8167       if (BUFFERP (coding->dst_object))
8168         coding->dst_object = Fbuffer_string ();
8169       else
8170         {
8171           coding->dst_object
8172             = make_unibyte_string ((char *) coding->destination,
8173                                    coding->produced);
8174           xfree (coding->destination);
8175         }
8176     }
8177
8178   if (saved_pt >= 0)
8179     {
8180       /* This is the case of:
8181          (BUFFERP (src_object) && EQ (src_object, dst_object))
8182          As we have moved PT while replacing the original buffer
8183          contents, we must recover it now.  */
8184       set_buffer_internal (XBUFFER (src_object));
8185       if (saved_pt < from)
8186         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8187       else if (saved_pt < from + chars)
8188         TEMP_SET_PT_BOTH (from, from_byte);
8189       else if (! NILP (current_buffer->enable_multibyte_characters))
8190         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8191                           saved_pt_byte + (coding->produced - bytes));
8192       else
8193         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8194                           saved_pt_byte + (coding->produced - bytes));
8195
8196       if (need_marker_adjustment)
8197         {
8198           struct Lisp_Marker *tail;
8199
8200           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8201             if (tail->need_adjustment)
8202               {
8203                 tail->need_adjustment = 0;
8204                 if (tail->insertion_type)
8205                   {
8206                     tail->bytepos = from_byte;
8207                     tail->charpos = from;
8208                   }
8209                 else
8210                   {
8211                     tail->bytepos = from_byte + coding->produced;
8212                     tail->charpos
8213                       = (NILP (current_buffer->enable_multibyte_characters)
8214                          ? tail->bytepos : from + coding->produced_char);
8215                   }
8216               }
8217         }
8218     }
8219
8220   if (kill_src_buffer)
8221     Fkill_buffer (coding->src_object);
8222
8223   Vdeactivate_mark = old_deactivate_mark;
8224   unbind_to (count, Qnil);
8225 }
8226
8227
8228 Lisp_Object
8229 preferred_coding_system ()
8230 {
8231   int id = coding_categories[coding_priorities[0]].id;
8232
8233   return CODING_ID_NAME (id);
8234 }
8235
8236 \f
8237 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
8239
8240 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8241        doc: /* Return t if OBJECT is nil or a coding-system.
8242 See the documentation of `define-coding-system' for information
8243 about coding-system objects.  */)
8244      (object)
8245      Lisp_Object object;
8246 {
8247   if (NILP (object)
8248       || CODING_SYSTEM_ID (object) >= 0)
8249     return Qt;
8250   if (! SYMBOLP (object)
8251       || NILP (Fget (object, Qcoding_system_define_form)))
8252     return Qnil;
8253   return Qt;
8254 }
8255
8256 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8257        Sread_non_nil_coding_system, 1, 1, 0,
8258        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8259      (prompt)
8260      Lisp_Object prompt;
8261 {
8262   Lisp_Object val;
8263   do
8264     {
8265       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8266                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8267     }
8268   while (SCHARS (val) == 0);
8269   return (Fintern (val, Qnil));
8270 }
8271
8272 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8273        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8274 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8275 Ignores case when completing coding systems (all Emacs coding systems
8276 are lower-case).  */)
8277      (prompt, default_coding_system)
8278      Lisp_Object prompt, default_coding_system;
8279 {
8280   Lisp_Object val;
8281   int count = SPECPDL_INDEX ();
8282
8283   if (SYMBOLP (default_coding_system))
8284     default_coding_system = SYMBOL_NAME (default_coding_system);
8285   specbind (Qcompletion_ignore_case, Qt);
8286   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8287                           Qt, Qnil, Qcoding_system_history,
8288                           default_coding_system, Qnil);
8289   unbind_to (count, Qnil);
8290   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8291 }
8292
8293 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8294        1, 1, 0,
8295        doc: /* Check validity of CODING-SYSTEM.
8296 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8297 It is valid if it is nil or a symbol defined as a coding system by the
8298 function `define-coding-system'.  */)
8299   (coding_system)
8300      Lisp_Object coding_system;
8301 {
8302   Lisp_Object define_form;
8303
8304   define_form = Fget (coding_system, Qcoding_system_define_form);
8305   if (! NILP (define_form))
8306     {
8307       Fput (coding_system, Qcoding_system_define_form, Qnil);
8308       safe_eval (define_form);
8309     }
8310   if (!NILP (Fcoding_system_p (coding_system)))
8311     return coding_system;
8312   xsignal1 (Qcoding_system_error, coding_system);
8313 }
8314
8315 \f
/* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
   HIGHEST is nonzero, return the coding system of the highest
   priority among the detected coding systems.  Otherwise return a
   list of detected coding systems sorted by their priorities.  If
   MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
   multibyte form but contain only ASCII and eight-bit chars.
   Otherwise, the bytes are raw bytes.

   SRC_CHARS is recorded in the working coding structure as the
   number of source characters.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.

   When HIGHEST is nonzero and no candidate was collected, the value
   is nil.  */

Lisp_Object
detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
                      coding_system)
     const unsigned char *src;
     EMACS_INT src_chars, src_bytes;
     int highest;
     int multibytep;
     Lisp_Object coding_system;
{
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  /* Candidates are collected here as a list of coding system IDs; the
     eol-format step at the end replaces each ID by a coding system
     name.  */
  Lisp_Object val = Qnil;
  struct coding_system coding;
  int id;
  struct coding_detection_info detect_info;
  enum coding_category base_category;
  int null_byte_found = 0, eight_bit_found = 0;

  /* A nil CODING-SYSTEM is handled as `undecided'.  */
  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  coding.head_ascii = 0;

  detect_info.checked = detect_info.found = detect_info.rejected = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XINT (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      enum coding_category category;
      struct coding_system *this;
      int c, i;

      /* Skip all ASCII bytes except for a few ISO2022 controls.
         coding.head_ascii counts the bytes seen before the first
         eight-bit byte.  The scan stops early once both a null byte
         and an eight-bit byte have been seen.  */
      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* The ISO-2022 detector is run at most once, at the
                 first ESC, SI or SO, unless escape detection is
                 inhibited.  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (&coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conform to
                             ISO-2022.  */
                          src = src_end;
                          coding.head_ascii = src - coding.source;
                        }
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding.head_ascii++;
            }
          else if (! eight_bit_found)
            coding.head_ascii++;
        }

      /* Run the per-category detectors only if the scan above saw
         something other than plain ASCII, or already found a
         category.  */
      if (null_byte_found || eight_bit_found
          || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
          if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* With a null byte present, every category outside
                     CATEGORY_MASK_UTF_16 is marked as checked and
                     rejected.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Try the categories in priority order.  When HIGHEST
                 is nonzero, stop at the first one that is found.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;

                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already examined; just look at the recorded
                         result.  */
                      if (highest
                          && (detect_info.found & (1 << category)))
                        break;
                    }
                  else if ((*(this->detector)) (&coding, &detect_info)
                           && highest
                           && (detect_info.found & (1 << category)))
                    {
                      /* For the UTF-16 auto category, report the
                         little- or big-endian category instead.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }
        }

      /* Build VAL from what the detectors reported.  */
      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
          || null_byte_found)
        {
          /* Everything was rejected, or a null byte was seen: answer
             `no-conversion'.  */
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = CODING_SYSTEM_ID (Qno_conversion);
          val = Fcons (make_number (id), Qnil);
        }
      else if (! detect_info.rejected && ! detect_info.found)
        {
          /* Nothing was found and nothing was rejected: answer with
             the coding system of the undecided category.  */
          detect_info.found = CATEGORY_MASK_ANY;
          id = coding_categories[coding_category_undecided].id;
          val = Fcons (make_number (id), Qnil);
        }
      else if (highest)
        {
          if (detect_info.found)
            {
              /* CATEGORY and THIS are as the detection loops above
                 left them.  */
              detect_info.found = 1 << category;
              val = Fcons (make_number (this->id), Qnil);
            }
          else
            /* Nothing was positively found; take the first category
               in priority order that was not rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  detect_info.found = 1 << coding_priorities[i];
                  id = coding_categories[coding_priorities[i]].id;
                  val = Fcons (make_number (id), Qnil);
                  break;
                }
        }
      else
        {
          int mask = detect_info.rejected | detect_info.found;
          int found = 0;

          /* Both loops run from the lowest priority upward and push
             onto the front of VAL.  The categories that were neither
             rejected nor found are pushed first...  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (! (mask & (1 << category)))
                {
                  found |= 1 << category;
                  id = coding_categories[category].id;
                  if (id >= 0)
                    val = Fcons (make_number (id), val);
                }
            }
          /* ...and the found ones afterwards, so that they end up in
             front of the others in the result.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (detect_info.found & (1 << category))
                {
                  id = coding_categories[category].id;
                  val = Fcons (make_number (id), val);
                }
            }
          detect_info.found |= found;
        }
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only the choice between UTF-8 with and without signature is
         open.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            this = coding_categories + coding_category_utf_8_sig;
          else
            this = coding_categories + coding_category_utf_8_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Only the UTF-16 variant is open.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            this = coding_categories + coding_category_utf_16_le;
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            this = coding_categories + coding_category_utf_16_be;
          else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
            this = coding_categories + coding_category_utf_16_be_nosig;
          else
            this = coding_categories + coding_category_utf_16_le_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else
    {
      /* The text-format is already decided; the given coding system
         is the only candidate.  */
      detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
      val = Fcons (make_number (coding.id), Qnil);
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* -1 means that the eol-format was not examined for that family
       of categories.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    if (VECTORP (eol_type))
      {
        /* The eol-format of the given coding system is undecided, so
           examine the data, separately for non-UTF-16, UTF-16BE and
           UTF-16LE candidates.  */
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
          {
            if (null_byte_found)
              normal_eol = EOL_SEEN_LF;
            else
              normal_eol = detect_eol (coding.source, src_bytes,
                                       coding_category_raw_text);
          }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_be);
        if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG))
          utf_16_le_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_le);
      }
    else
      {
        /* The eol-format is already specified; use it for every
           candidate.  */
        if (EQ (eol_type, Qunix))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
        else if (EQ (eol_type, Qdos))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
        else
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each ID in VAL by a coding system name: the subsidiary
       for the detected eol-format when the candidate has subsidiaries
       and an eol-format was seen, otherwise the candidate's own
       name.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
        enum coding_category category;
        int this_eol;

        id = XINT (XCAR (tail));
        attrs = CODING_ID_ATTRS (id);
        category = XINT (CODING_ATTR_CATEGORY (attrs));
        eol_type = CODING_ID_EOL_TYPE (id);
        if (VECTORP (eol_type))
          {
            if (category == coding_category_utf_16_be
                || category == coding_category_utf_16_be_nosig)
              this_eol = utf_16_be_eol;
            else if (category == coding_category_utf_16_le
                     || category == coding_category_utf_16_le_nosig)
              this_eol = utf_16_le_eol;
            else
              this_eol = normal_eol;

            if (this_eol == EOL_SEEN_LF)
              XSETCAR (tail, AREF (eol_type, 0));
            else if (this_eol == EOL_SEEN_CRLF)
              XSETCAR (tail, AREF (eol_type, 1));
            else if (this_eol == EOL_SEEN_CR)
              XSETCAR (tail, AREF (eol_type, 2));
            else
              XSETCAR (tail, CODING_ID_NAME (id));
          }
        else
          XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
8639
8640
8641 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8642        2, 3, 0,
8643        doc: /* Detect coding system of the text in the region between START and END.
8644 Return a list of possible coding systems ordered by priority.
8645 The coding systems to try and their priorities follows what
8646 the function `coding-system-priority-list' (which see) returns.
8647
8648 If only ASCII characters are found (except for such ISO-2022 control
8649 characters as ESC), it returns a list of single element `undecided'
8650 or its subsidiary coding system according to a detected end-of-line
8651 format.
8652
8653 If optional argument HIGHEST is non-nil, return the coding system of
8654 highest priority.  */)
8655      (start, end, highest)
8656      Lisp_Object start, end, highest;
8657 {
8658   int from, to;
8659   int from_byte, to_byte;
8660
8661   CHECK_NUMBER_COERCE_MARKER (start);
8662   CHECK_NUMBER_COERCE_MARKER (end);
8663
8664   validate_region (&start, &end);
8665   from = XINT (start), to = XINT (end);
8666   from_byte = CHAR_TO_BYTE (from);
8667   to_byte = CHAR_TO_BYTE (to);
8668
8669   if (from < GPT && to >= GPT)
8670     move_gap_both (to, to_byte);
8671
8672   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8673                                to - from, to_byte - from_byte,
8674                                !NILP (highest),
8675                                !NILP (current_buffer
8676                                       ->enable_multibyte_characters),
8677                                Qnil);
8678 }
8679
8680 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8681        1, 2, 0,
8682        doc: /* Detect coding system of the text in STRING.
8683 Return a list of possible coding systems ordered by priority.
8684 The coding systems to try and their priorities follows what
8685 the function `coding-system-priority-list' (which see) returns.
8686
8687 If only ASCII characters are found (except for such ISO-2022 control
8688 characters as ESC), it returns a list of single element `undecided'
8689 or its subsidiary coding system according to a detected end-of-line
8690 format.
8691
8692 If optional argument HIGHEST is non-nil, return the coding system of
8693 highest priority.  */)
8694      (string, highest)
8695      Lisp_Object string, highest;
8696 {
8697   CHECK_STRING (string);
8698
8699   return detect_coding_system (SDATA (string),
8700                                SCHARS (string), SBYTES (string),
8701                                !NILP (highest), STRING_MULTIBYTE (string),
8702                                Qnil);
8703 }
8704
8705
8706 static INLINE int
8707 char_encodable_p (c, attrs)
8708      int c;
8709      Lisp_Object attrs;
8710 {
8711   Lisp_Object tail;
8712   struct charset *charset;
8713   Lisp_Object translation_table;
8714
8715   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8716   if (! NILP (translation_table))
8717     c = translate_char (translation_table, c);
8718   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8719        CONSP (tail); tail = XCDR (tail))
8720     {
8721       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8722       if (CHAR_CHARSET_P (c, charset))
8723         break;
8724     }
8725   return (! NILP (tail));
8726 }
8727
8728
/* Return a list of coding systems that safely encode the text between
   START and END.  If EXCLUDE is non-nil, it is a list of coding
   systems not to check.  The returned list doesn't contain any such
   coding systems.  In any case, if the text contains only ASCII or is
   unibyte, return t.

   START may also be a string, in which case the whole string is
   checked and END is not consulted.  Otherwise the value always
   contains `raw-text' and `no-conversion'.  */

DEFUN ("find-coding-systems-region-internal",
       Ffind_coding_systems_region_internal,
       Sfind_coding_systems_region_internal, 2, 3, 0,
       doc: /* Internal use only.  */)
     (start, end, exclude)
     Lisp_Object start, end, exclude;
{
  Lisp_Object coding_attrs_list, safe_codings;
  EMACS_INT start_byte, end_byte;
  const unsigned char *p, *pbeg, *pend;
  int c;
  Lisp_Object tail, elt, work_table;

  if (STRINGP (start))
    {
      /* A unibyte string, or a multibyte one with as many bytes as
         characters, can be encoded by anything.  */
      if (!STRING_MULTIBYTE (start)
          || SCHARS (start) == SBYTES (start))
        return Qt;
      start_byte = 0;
      end_byte = SBYTES (start);
    }
  else
    {
      CHECK_NUMBER_COERCE_MARKER (start);
      CHECK_NUMBER_COERCE_MARKER (end);
      if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
        args_out_of_range (start, end);
      if (NILP (current_buffer->enable_multibyte_characters))
        return Qt;
      start_byte = CHAR_TO_BYTE (XINT (start));
      end_byte = CHAR_TO_BYTE (XINT (end));
      /* As many bytes as characters: nothing to check.  */
      if (XINT (end) - XINT (start) == end_byte - start_byte)
        return Qt;

      /* If the gap lies inside the region, move it to whichever end
         of the region is nearer, so that the region can be scanned
         through plain pointers below.  */
      if (XINT (start) < GPT && XINT (end) > GPT)
        {
          if ((GPT - XINT (start)) < (XINT (end) - GPT))
            move_gap_both (XINT (start), start_byte);
          else
            move_gap_both (XINT (end), end_byte);
        }
    }

  /* Collect the attribute vectors of the candidates: every coding
     system in Vcoding_system_list that is not in EXCLUDE, is its own
     base, and is not of type `undecided'.  Note that the translation
     table slot of each collected attribute vector is overwritten
     with the value of get_translation_table.  */
  coding_attrs_list = Qnil;
  for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
    if (NILP (exclude)
        || NILP (Fmemq (XCAR (tail), exclude)))
      {
        Lisp_Object attrs;

        attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
        if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
            && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
          {
            ASET (attrs, coding_attr_trans_tbl,
                  get_translation_table (attrs, 1, NULL));
            coding_attrs_list = Fcons (attrs, coding_attrs_list);
          }
      }

  if (STRINGP (start))
    p = pbeg = SDATA (start);
  else
    p = pbeg = BYTE_POS_ADDR (start_byte);
  pend = p + (end_byte - start_byte);

  /* Trim the ASCII bytes at both ends of the text.  */
  while (p < pend && ASCII_BYTE_P (*p)) p++;
  while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;

  /* WORK_TABLE remembers the characters that were already checked.  */
  work_table = Fmake_char_table (Qnil, Qnil);
  while (p < pend)
    {
      if (ASCII_BYTE_P (*p))
        p++;
      else
        {
          c = STRING_CHAR_ADVANCE (p);
          if (!NILP (char_table_ref (work_table, c)))
            /* This character was already checked.  Ignore it.  */
            continue;

          charset_map_loaded = 0;
          /* Drop from the list every candidate that cannot encode C.
             A cell is removed by copying the next cell into it; TAIL
             is then left where it is, so that the copied element is
             examined on the next iteration.  The last cell cannot be
             removed that way, so its car is set to nil instead.  */
          for (tail = coding_attrs_list; CONSP (tail);)
            {
              elt = XCAR (tail);
              if (NILP (elt))
                tail = XCDR (tail);
              else if (char_encodable_p (c, elt))
                tail = XCDR (tail);
              else if (CONSP (XCDR (tail)))
                {
                  XSETCAR (tail, XCAR (XCDR (tail)));
                  XSETCDR (tail, XCDR (XCDR (tail)));
                }
              else
                {
                  XSETCAR (tail, Qnil);
                  tail = XCDR (tail);
                }
            }
          if (charset_map_loaded)
            {
              /* charset_map_loaded became nonzero during the checks
                 above.  Recompute the pointers from their offsets;
                 presumably loading a charset map can relocate the
                 text being scanned.  */
              EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;

              if (STRINGP (start))
                pbeg = SDATA (start);
              else
                pbeg = BYTE_POS_ADDR (start_byte);
              p = pbeg + p_offset;
              pend = pbeg + pend_offset;
            }
          char_table_set (work_table, c, Qt);
        }
    }

  /* The result always ends with `raw-text' and `no-conversion'; the
     base names of the surviving candidates are pushed in front.  */
  safe_codings = list2 (Qraw_text, Qno_conversion);
  for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
    if (! NILP (XCAR (tail)))
      safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);

  return safe_codings;
}
8857
8858
8859 DEFUN ("unencodable-char-position", Funencodable_char_position,
8860        Sunencodable_char_position, 3, 5, 0,
8861        doc: /*
8862 Return position of first un-encodable character in a region.
8863 START and END specify the region and CODING-SYSTEM specifies the
8864 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8865
8866 If optional 4th argument COUNT is non-nil, it specifies at most how
8867 many un-encodable characters to search.  In this case, the value is a
8868 list of positions.
8869
8870 If optional 5th argument STRING is non-nil, it is a string to search
8871 for un-encodable characters.  In that case, START and END are indexes
8872 to the string.  */)
8873      (start, end, coding_system, count, string)
8874      Lisp_Object start, end, coding_system, count, string;
8875 {
8876   int n;
8877   struct coding_system coding;
8878   Lisp_Object attrs, charset_list, translation_table;
8879   Lisp_Object positions;
8880   int from, to;
8881   const unsigned char *p, *stop, *pend;
8882   int ascii_compatible;
8883
8884   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8885   attrs = CODING_ID_ATTRS (coding.id);
8886   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8887     return Qnil;
8888   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8889   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8890   translation_table = get_translation_table (attrs, 1, NULL);
8891
8892   if (NILP (string))
8893     {
8894       validate_region (&start, &end);
8895       from = XINT (start);
8896       to = XINT (end);
8897       if (NILP (current_buffer->enable_multibyte_characters)
8898           || (ascii_compatible
8899               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8900         return Qnil;
8901       p = CHAR_POS_ADDR (from);
8902       pend = CHAR_POS_ADDR (to);
8903       if (from < GPT && to >= GPT)
8904         stop = GPT_ADDR;
8905       else
8906         stop = pend;
8907     }
8908   else
8909     {
8910       CHECK_STRING (string);
8911       CHECK_NATNUM (start);
8912       CHECK_NATNUM (end);
8913       from = XINT (start);
8914       to = XINT (end);
8915       if (from > to
8916           || to > SCHARS (string))
8917         args_out_of_range_3 (string, start, end);
8918       if (! STRING_MULTIBYTE (string))
8919         return Qnil;
8920       p = SDATA (string) + string_char_to_byte (string, from);
8921       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8922       if (ascii_compatible && (to - from) == (pend - p))
8923         return Qnil;
8924     }
8925
8926   if (NILP (count))
8927     n = 1;
8928   else
8929     {
8930       CHECK_NATNUM (count);
8931       n = XINT (count);
8932     }
8933
8934   positions = Qnil;
8935   while (1)
8936     {
8937       int c;
8938
8939       if (ascii_compatible)
8940         while (p < stop && ASCII_BYTE_P (*p))
8941           p++, from++;
8942       if (p >= stop)
8943         {
8944           if (p >= pend)
8945             break;
8946           stop = pend;
8947           p = GAP_END_ADDR;
8948         }
8949
8950       c = STRING_CHAR_ADVANCE (p);
8951       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8952           && ! char_charset (translate_char (translation_table, c),
8953                              charset_list, NULL))
8954         {
8955           positions = Fcons (make_number (from), positions);
8956           n--;
8957           if (n == 0)
8958             break;
8959         }
8960
8961       from++;
8962     }
8963
8964   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8965 }
8966
8967
8968 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8969        Scheck_coding_systems_region, 3, 3, 0,
8970        doc: /* Check if the region is encodable by coding systems.
8971
8972 START and END are buffer positions specifying the region.
8973 CODING-SYSTEM-LIST is a list of coding systems to check.
8974
8975 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8976 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8977 whole region, POS0, POS1, ... are buffer positions where non-encodable
8978 characters are found.
8979
8980 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8981 value is nil.
8982
8983 START may be a string.  In that case, check if the string is
8984 encodable, and the value contains indices to the string instead of
8985 buffer positions.  END is ignored.
8986
8987 If the current buffer (or START if it is a string) is unibyte, the value
8988 is nil.  */)
8989      (start, end, coding_system_list)
8990      Lisp_Object start, end, coding_system_list;
8991 {
8992   Lisp_Object list;
8993   EMACS_INT start_byte, end_byte;
8994   int pos;
8995   const unsigned char *p, *pbeg, *pend;
8996   int c;
8997   Lisp_Object tail, elt, attrs;
8998
8999   if (STRINGP (start))
9000     {
9001       if (!STRING_MULTIBYTE (start)
9002           || SCHARS (start) == SBYTES (start))
9003         return Qnil;
9004       start_byte = 0;
9005       end_byte = SBYTES (start);
9006       pos = 0;
9007     }
9008   else
9009     {
9010       CHECK_NUMBER_COERCE_MARKER (start);
9011       CHECK_NUMBER_COERCE_MARKER (end);
9012       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9013         args_out_of_range (start, end);
9014       if (NILP (current_buffer->enable_multibyte_characters))
9015         return Qnil;
9016       start_byte = CHAR_TO_BYTE (XINT (start));
9017       end_byte = CHAR_TO_BYTE (XINT (end));
9018       if (XINT (end) - XINT (start) == end_byte - start_byte)
9019         return Qnil;
9020
9021       if (XINT (start) < GPT && XINT (end) > GPT)
9022         {
9023           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9024             move_gap_both (XINT (start), start_byte);
9025           else
9026             move_gap_both (XINT (end), end_byte);
9027         }
9028       pos = XINT (start);
9029     }
9030
9031   list = Qnil;
9032   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9033     {
9034       elt = XCAR (tail);
9035       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9036       ASET (attrs, coding_attr_trans_tbl,
9037             get_translation_table (attrs, 1, NULL));
9038       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9039     }
9040
9041   if (STRINGP (start))
9042     p = pbeg = SDATA (start);
9043   else
9044     p = pbeg = BYTE_POS_ADDR (start_byte);
9045   pend = p + (end_byte - start_byte);
9046
9047   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9048   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9049
9050   while (p < pend)
9051     {
9052       if (ASCII_BYTE_P (*p))
9053         p++;
9054       else
9055         {
9056           c = STRING_CHAR_ADVANCE (p);
9057
9058           charset_map_loaded = 0;
9059           for (tail = list; CONSP (tail); tail = XCDR (tail))
9060             {
9061               elt = XCDR (XCAR (tail));
9062               if (! char_encodable_p (c, XCAR (elt)))
9063                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9064             }
9065           if (charset_map_loaded)
9066             {
9067               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
9068
9069               if (STRINGP (start))
9070                 pbeg = SDATA (start);
9071               else
9072                 pbeg = BYTE_POS_ADDR (start_byte);
9073               p = pbeg + p_offset;
9074               pend = pbeg + pend_offset;
9075             }
9076         }
9077       pos++;
9078     }
9079
9080   tail = list;
9081   list = Qnil;
9082   for (; CONSP (tail); tail = XCDR (tail))
9083     {
9084       elt = XCAR (tail);
9085       if (CONSP (XCDR (XCDR (elt))))
9086         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9087                       list);
9088     }
9089
9090   return list;
9091 }
9092
9093
9094 Lisp_Object
9095 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
9096      Lisp_Object start, end, coding_system, dst_object;
9097      int encodep, norecord;
9098 {
9099   struct coding_system coding;
9100   EMACS_INT from, from_byte, to, to_byte;
9101   Lisp_Object src_object;
9102
9103   CHECK_NUMBER_COERCE_MARKER (start);
9104   CHECK_NUMBER_COERCE_MARKER (end);
9105   if (NILP (coding_system))
9106     coding_system = Qno_conversion;
9107   else
9108     CHECK_CODING_SYSTEM (coding_system);
9109   src_object = Fcurrent_buffer ();
9110   if (NILP (dst_object))
9111     dst_object = src_object;
9112   else if (! EQ (dst_object, Qt))
9113     CHECK_BUFFER (dst_object);
9114
9115   validate_region (&start, &end);
9116   from = XFASTINT (start);
9117   from_byte = CHAR_TO_BYTE (from);
9118   to = XFASTINT (end);
9119   to_byte = CHAR_TO_BYTE (to);
9120
9121   setup_coding_system (coding_system, &coding);
9122   coding.mode |= CODING_MODE_LAST_BLOCK;
9123
9124   if (encodep)
9125     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9126                           dst_object);
9127   else
9128     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9129                           dst_object);
9130   if (! norecord)
9131     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9132
9133   return (BUFFERP (dst_object)
9134           ? make_number (coding.produced_char)
9135           : coding.dst_object);
9136 }
9137
9138
9139 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9140        3, 4, "r\nzCoding system: ",
9141        doc: /* Decode the current region from the specified coding system.
9142 When called from a program, takes four arguments:
9143         START, END, CODING-SYSTEM, and DESTINATION.
9144 START and END are buffer positions.
9145
9146 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9147 If nil, the region between START and END is replaced by the decoded text.
9148 If buffer, the decoded text is inserted in that buffer after point (point
9149 does not move).
9150 In those cases, the length of the decoded text is returned.
9151 If DESTINATION is t, the decoded text is returned.
9152
9153 This function sets `last-coding-system-used' to the precise coding system
9154 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9155 not fully specified.)  */)
9156      (start, end, coding_system, destination)
9157      Lisp_Object start, end, coding_system, destination;
9158 {
9159   return code_convert_region (start, end, coding_system, destination, 0, 0);
9160 }
9161
9162 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9163        3, 4, "r\nzCoding system: ",
9164        doc: /* Encode the current region by specified coding system.
9165 When called from a program, takes four arguments:
9166         START, END, CODING-SYSTEM and DESTINATION.
9167 START and END are buffer positions.
9168
9169 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9170 If nil, the region between START and END is replace by the encoded text.
9171 If buffer, the encoded text is inserted in that buffer after point (point
9172 does not move).
9173 In those cases, the length of the encoded text is returned.
9174 If DESTINATION is t, the encoded text is returned.
9175
9176 This function sets `last-coding-system-used' to the precise coding system
9177 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9178 not fully specified.)  */)
9179   (start, end, coding_system, destination)
9180      Lisp_Object start, end, coding_system, destination;
9181 {
9182   return code_convert_region (start, end, coding_system, destination, 1, 0);
9183 }
9184
9185 Lisp_Object
9186 code_convert_string (string, coding_system, dst_object,
9187                      encodep, nocopy, norecord)
9188      Lisp_Object string, coding_system, dst_object;
9189      int encodep, nocopy, norecord;
9190 {
9191   struct coding_system coding;
9192   EMACS_INT chars, bytes;
9193
9194   CHECK_STRING (string);
9195   if (NILP (coding_system))
9196     {
9197       if (! norecord)
9198         Vlast_coding_system_used = Qno_conversion;
9199       if (NILP (dst_object))
9200         return (nocopy ? Fcopy_sequence (string) : string);
9201     }
9202
9203   if (NILP (coding_system))
9204     coding_system = Qno_conversion;
9205   else
9206     CHECK_CODING_SYSTEM (coding_system);
9207   if (NILP (dst_object))
9208     dst_object = Qt;
9209   else if (! EQ (dst_object, Qt))
9210     CHECK_BUFFER (dst_object);
9211
9212   setup_coding_system (coding_system, &coding);
9213   coding.mode |= CODING_MODE_LAST_BLOCK;
9214   chars = SCHARS (string);
9215   bytes = SBYTES (string);
9216   if (encodep)
9217     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9218   else
9219     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9220   if (! norecord)
9221     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9222
9223   return (BUFFERP (dst_object)
9224           ? make_number (coding.produced_char)
9225           : coding.dst_object);
9226 }
9227
9228
9229 /* Encode or decode STRING according to CODING_SYSTEM.
9230    Do not set Vlast_coding_system_used.
9231
9232    This function is called only from macros DECODE_FILE and
9233    ENCODE_FILE, thus we ignore character composition.  */
9234
9235 Lisp_Object
9236 code_convert_string_norecord (string, coding_system, encodep)
9237      Lisp_Object string, coding_system;
9238      int encodep;
9239 {
9240   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9241 }
9242
9243
9244 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9245        2, 4, 0,
9246        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9247
9248 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9249 if the decoding operation is trivial.
9250
9251 Optional fourth arg BUFFER non-nil means that the decoded text is
9252 inserted in that buffer after point (point does not move).  In this
9253 case, the return value is the length of the decoded text.
9254
9255 This function sets `last-coding-system-used' to the precise coding system
9256 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9257 not fully specified.)  */)
9258   (string, coding_system, nocopy, buffer)
9259      Lisp_Object string, coding_system, nocopy, buffer;
9260 {
9261   return code_convert_string (string, coding_system, buffer,
9262                               0, ! NILP (nocopy), 0);
9263 }
9264
9265 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9266        2, 4, 0,
9267        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9268
9269 Optional third arg NOCOPY non-nil means it is OK to return STRING
9270 itself if the encoding operation is trivial.
9271
9272 Optional fourth arg BUFFER non-nil means that the encoded text is
9273 inserted in that buffer after point (point does not move).  In this
9274 case, the return value is the length of the encoded text.
9275
9276 This function sets `last-coding-system-used' to the precise coding system
9277 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9278 not fully specified.)  */)
9279      (string, coding_system, nocopy, buffer)
9280      Lisp_Object string, coding_system, nocopy, buffer;
9281 {
9282   return code_convert_string (string, coding_system, buffer,
9283                               1, ! NILP (nocopy), 1);
9284 }
9285
9286 \f
9287 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9288        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9289 Return the corresponding character.  */)
9290      (code)
9291      Lisp_Object code;
9292 {
9293   Lisp_Object spec, attrs, val;
9294   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9295   int c;
9296
9297   CHECK_NATNUM (code);
9298   c = XFASTINT (code);
9299   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9300   attrs = AREF (spec, 0);
9301
9302   if (ASCII_BYTE_P (c)
9303       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9304     return code;
9305
9306   val = CODING_ATTR_CHARSET_LIST (attrs);
9307   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9308   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9309   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9310
9311   if (c <= 0x7F)
9312     charset = charset_roman;
9313   else if (c >= 0xA0 && c < 0xDF)
9314     {
9315       charset = charset_kana;
9316       c -= 0x80;
9317     }
9318   else
9319     {
9320       int s1 = c >> 8, s2 = c & 0xFF;
9321
9322       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9323           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9324         error ("Invalid code: %d", code);
9325       SJIS_TO_JIS (c);
9326       charset = charset_kanji;
9327     }
9328   c = DECODE_CHAR (charset, c);
9329   if (c < 0)
9330     error ("Invalid code: %d", code);
9331   return make_number (c);
9332 }
9333
9334
9335 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9336        doc: /* Encode a Japanese character CH to shift_jis encoding.
9337 Return the corresponding code in SJIS.  */)
9338      (ch)
9339     Lisp_Object ch;
9340 {
9341   Lisp_Object spec, attrs, charset_list;
9342   int c;
9343   struct charset *charset;
9344   unsigned code;
9345
9346   CHECK_CHARACTER (ch);
9347   c = XFASTINT (ch);
9348   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9349   attrs = AREF (spec, 0);
9350
9351   if (ASCII_CHAR_P (c)
9352       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9353     return ch;
9354
9355   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9356   charset = char_charset (c, charset_list, &code);
9357   if (code == CHARSET_INVALID_CODE (charset))
9358     error ("Can't encode by shift_jis encoding: %d", c);
9359   JIS_TO_SJIS (code);
9360
9361   return make_number (code);
9362 }
9363
9364 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9365        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9366 Return the corresponding character.  */)
9367      (code)
9368      Lisp_Object code;
9369 {
9370   Lisp_Object spec, attrs, val;
9371   struct charset *charset_roman, *charset_big5, *charset;
9372   int c;
9373
9374   CHECK_NATNUM (code);
9375   c = XFASTINT (code);
9376   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9377   attrs = AREF (spec, 0);
9378
9379   if (ASCII_BYTE_P (c)
9380       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9381     return code;
9382
9383   val = CODING_ATTR_CHARSET_LIST (attrs);
9384   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9385   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9386
9387   if (c <= 0x7F)
9388     charset = charset_roman;
9389   else
9390     {
9391       int b1 = c >> 8, b2 = c & 0x7F;
9392       if (b1 < 0xA1 || b1 > 0xFE
9393           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9394         error ("Invalid code: %d", code);
9395       charset = charset_big5;
9396     }
9397   c = DECODE_CHAR (charset, (unsigned )c);
9398   if (c < 0)
9399     error ("Invalid code: %d", code);
9400   return make_number (c);
9401 }
9402
9403 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9404        doc: /* Encode the Big5 character CH to BIG5 coding system.
9405 Return the corresponding character code in Big5.  */)
9406      (ch)
9407      Lisp_Object ch;
9408 {
9409   Lisp_Object spec, attrs, charset_list;
9410   struct charset *charset;
9411   int c;
9412   unsigned code;
9413
9414   CHECK_CHARACTER (ch);
9415   c = XFASTINT (ch);
9416   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9417   attrs = AREF (spec, 0);
9418   if (ASCII_CHAR_P (c)
9419       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9420     return ch;
9421
9422   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9423   charset = char_charset (c, charset_list, &code);
9424   if (code == CHARSET_INVALID_CODE (charset))
9425     error ("Can't encode by Big5 encoding: %d", c);
9426
9427   return make_number (code);
9428 }
9429
9430 \f
9431 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9432        Sset_terminal_coding_system_internal, 1, 2, 0,
9433        doc: /* Internal use only.  */)
9434      (coding_system, terminal)
9435      Lisp_Object coding_system;
9436      Lisp_Object terminal;
9437 {
9438   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9439   CHECK_SYMBOL (coding_system);
9440   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9441   /* We had better not send unsafe characters to terminal.  */
9442   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9443   /* Characer composition should be disabled.  */
9444   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9445   terminal_coding->src_multibyte = 1;
9446   terminal_coding->dst_multibyte = 0;
9447   return Qnil;
9448 }
9449
9450 DEFUN ("set-safe-terminal-coding-system-internal",
9451        Fset_safe_terminal_coding_system_internal,
9452        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9453        doc: /* Internal use only.  */)
9454      (coding_system)
9455      Lisp_Object coding_system;
9456 {
9457   CHECK_SYMBOL (coding_system);
9458   setup_coding_system (Fcheck_coding_system (coding_system),
9459                        &safe_terminal_coding);
9460   /* Characer composition should be disabled.  */
9461   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9462   safe_terminal_coding.src_multibyte = 1;
9463   safe_terminal_coding.dst_multibyte = 0;
9464   return Qnil;
9465 }
9466
9467 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9468        Sterminal_coding_system, 0, 1, 0,
9469        doc: /* Return coding system specified for terminal output on the given terminal.
9470 TERMINAL may be a terminal object, a frame, or nil for the selected
9471 frame's terminal device.  */)
9472      (terminal)
9473      Lisp_Object terminal;
9474 {
9475   struct coding_system *terminal_coding
9476     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9477   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9478
9479   /* For backward compatibility, return nil if it is `undecided'. */
9480   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9481 }
9482
9483 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9484        Sset_keyboard_coding_system_internal, 1, 2, 0,
9485        doc: /* Internal use only.  */)
9486      (coding_system, terminal)
9487      Lisp_Object coding_system;
9488      Lisp_Object terminal;
9489 {
9490   struct terminal *t = get_terminal (terminal, 1);
9491   CHECK_SYMBOL (coding_system);
9492   if (NILP (coding_system))
9493     coding_system = Qno_conversion;
9494   else
9495     Fcheck_coding_system (coding_system);
9496   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9497   /* Characer composition should be disabled.  */
9498   TERMINAL_KEYBOARD_CODING (t)->common_flags
9499     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9500   return Qnil;
9501 }
9502
9503 DEFUN ("keyboard-coding-system",
9504        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9505        doc: /* Return coding system specified for decoding keyboard input.  */)
9506      (terminal)
9507      Lisp_Object terminal;
9508 {
9509   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9510                          (get_terminal (terminal, 1))->id);
9511 }
9512
9513 \f
9514 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9515        Sfind_operation_coding_system,  1, MANY, 0,
9516        doc: /* Choose a coding system for an operation based on the target name.
9517 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9518 DECODING-SYSTEM is the coding system to use for decoding
9519 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9520 for encoding (in case OPERATION does encoding).
9521
9522 The first argument OPERATION specifies an I/O primitive:
9523   For file I/O, `insert-file-contents' or `write-region'.
9524   For process I/O, `call-process', `call-process-region', or `start-process'.
9525   For network I/O, `open-network-stream'.
9526
9527 The remaining arguments should be the same arguments that were passed
9528 to the primitive.  Depending on which primitive, one of those arguments
9529 is selected as the TARGET.  For example, if OPERATION does file I/O,
9530 whichever argument specifies the file name is TARGET.
9531
9532 TARGET has a meaning which depends on OPERATION:
9533   For file I/O, TARGET is a file name (except for the special case below).
9534   For process I/O, TARGET is a process name.
9535   For network I/O, TARGET is a service name or a port number.
9536
9537 This function looks up what is specified for TARGET in
9538 `file-coding-system-alist', `process-coding-system-alist',
9539 or `network-coding-system-alist' depending on OPERATION.
9540 They may specify a coding system, a cons of coding systems,
9541 or a function symbol to call.
9542 In the last case, we call the function with one argument,
9543 which is a list of all the arguments given to this function.
9544 If the function can't decide a coding system, it can return
9545 `undecided' so that the normal code-detection is performed.
9546
9547 If OPERATION is `insert-file-contents', the argument corresponding to
9548 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9549 file name to look up, and BUFFER is a buffer that contains the file's
9550 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9551 function to call for FILENAME, that function should examine the
9552 contents of BUFFER instead of reading the file.
9553
9554 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9555      (nargs, args)
9556      int nargs;
9557      Lisp_Object *args;
9558 {
9559   Lisp_Object operation, target_idx, target, val;
9560   register Lisp_Object chain;
9561
9562   if (nargs < 2)
9563     error ("Too few arguments");
9564   operation = args[0];
9565   if (!SYMBOLP (operation)
9566       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9567     error ("Invalid first argument");
9568   if (nargs < 1 + XINT (target_idx))
9569     error ("Too few arguments for operation: %s",
9570            SDATA (SYMBOL_NAME (operation)));
9571   target = args[XINT (target_idx) + 1];
9572   if (!(STRINGP (target)
9573         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9574             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9575         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9576     error ("Invalid %dth argument", XINT (target_idx) + 1);
9577   if (CONSP (target))
9578     target = XCAR (target);
9579
9580   chain = ((EQ (operation, Qinsert_file_contents)
9581             || EQ (operation, Qwrite_region))
9582            ? Vfile_coding_system_alist
9583            : (EQ (operation, Qopen_network_stream)
9584               ? Vnetwork_coding_system_alist
9585               : Vprocess_coding_system_alist));
9586   if (NILP (chain))
9587     return Qnil;
9588
9589   for (; CONSP (chain); chain = XCDR (chain))
9590     {
9591       Lisp_Object elt;
9592
9593       elt = XCAR (chain);
9594       if (CONSP (elt)
9595           && ((STRINGP (target)
9596                && STRINGP (XCAR (elt))
9597                && fast_string_match (XCAR (elt), target) >= 0)
9598               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9599         {
9600           val = XCDR (elt);
9601           /* Here, if VAL is both a valid coding system and a valid
9602              function symbol, we return VAL as a coding system.  */
9603           if (CONSP (val))
9604             return val;
9605           if (! SYMBOLP (val))
9606             return Qnil;
9607           if (! NILP (Fcoding_system_p (val)))
9608             return Fcons (val, val);
9609           if (! NILP (Ffboundp (val)))
9610             {
9611               /* We use call1 rather than safe_call1
9612                  so as to get bug reports about functions called here
9613                  which don't handle the current interface.  */
9614               val = call1 (val, Flist (nargs, args));
9615               if (CONSP (val))
9616                 return val;
9617               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9618                 return Fcons (val, val);
9619             }
9620           return Qnil;
9621         }
9622     }
9623   return Qnil;
9624 }
9625
9626 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9627        Sset_coding_system_priority, 0, MANY, 0,
9628        doc: /* Assign higher priority to the coding systems given as arguments.
9629 If multiple coding systems belong to the same category,
9630 all but the first one are ignored.
9631
9632 usage: (set-coding-system-priority &rest coding-systems)  */)
9633      (nargs, args)
9634      int nargs;
9635      Lisp_Object *args;
9636 {
9637   int i, j;
9638   int changed[coding_category_max];
9639   enum coding_category priorities[coding_category_max];
9640
9641   bzero (changed, sizeof changed);
9642
9643   for (i = j = 0; i < nargs; i++)
9644     {
9645       enum coding_category category;
9646       Lisp_Object spec, attrs;
9647
9648       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9649       attrs = AREF (spec, 0);
9650       category = XINT (CODING_ATTR_CATEGORY (attrs));
9651       if (changed[category])
9652         /* Ignore this coding system because a coding system of the
9653            same category already had a higher priority.  */
9654         continue;
9655       changed[category] = 1;
9656       priorities[j++] = category;
9657       if (coding_categories[category].id >= 0
9658           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9659         setup_coding_system (args[i], &coding_categories[category]);
9660       Fset (AREF (Vcoding_category_table, category), args[i]);
9661     }
9662
9663   /* Now we have decided top J priorities.  Reflect the order of the
9664      original priorities to the remaining priorities.  */
9665
9666   for (i = j, j = 0; i < coding_category_max; i++, j++)
9667     {
9668       while (j < coding_category_max
9669              && changed[coding_priorities[j]])
9670         j++;
9671       if (j == coding_category_max)
9672         abort ();
9673       priorities[i] = coding_priorities[j];
9674     }
9675
9676   bcopy (priorities, coding_priorities, sizeof priorities);
9677
9678   /* Update `coding-category-list'.  */
9679   Vcoding_category_list = Qnil;
9680   for (i = coding_category_max - 1; i >= 0; i--)
9681     Vcoding_category_list
9682       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9683                Vcoding_category_list);
9684
9685   return Qnil;
9686 }
9687
9688 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9689        Scoding_system_priority_list, 0, 1, 0,
9690        doc: /* Return a list of coding systems ordered by their priorities.
9691 The list contains a subset of coding systems; i.e. coding systems
9692 assigned to each coding category (see `coding-category-list').
9693
9694 HIGHESTP non-nil means just return the highest priority one.  */)
9695      (highestp)
9696      Lisp_Object highestp;
9697 {
9698   int i;
9699   Lisp_Object val;
9700
9701   for (i = 0, val = Qnil; i < coding_category_max; i++)
9702     {
9703       enum coding_category category = coding_priorities[i];
9704       int id = coding_categories[category].id;
9705       Lisp_Object attrs;
9706
9707       if (id < 0)
9708         continue;
9709       attrs = CODING_ID_ATTRS (id);
9710       if (! NILP (highestp))
9711         return CODING_ATTR_BASE_NAME (attrs);
9712       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9713     }
9714   return Fnreverse (val);
9715 }
9716
9717 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9718
9719 static Lisp_Object
9720 make_subsidiaries (base)
9721      Lisp_Object base;
9722 {
9723   Lisp_Object subsidiaries;
9724   int base_name_len = SBYTES (SYMBOL_NAME (base));
9725   char *buf = (char *) alloca (base_name_len + 6);
9726   int i;
9727
9728   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9729   subsidiaries = Fmake_vector (make_number (3), Qnil);
9730   for (i = 0; i < 3; i++)
9731     {
9732       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9733       ASET (subsidiaries, i, intern (buf));
9734     }
9735   return subsidiaries;
9736 }
9737
9738
/* Lisp primitive `define-coding-system-internal'.

   Define the coding system named args[coding_arg_name] from the
   positional arguments in ARGS (indexed by the coding_arg_*
   constants).  The function

     - validates the common arguments and stores them in a fresh
       attribute vector ATTRS;
     - runs the checks specific to the coding type and decides the
       coding category;
     - registers the name in Vcoding_system_hash_table,
       Vcoding_system_list and Vcoding_system_alist, together with
       three subsidiaries sharing ATTRS when the eol-type argument is
       nil.

   Returns nil.  Signals Qwrong_number_of_arguments when NARGS is
   smaller than the coding type needs, and an error for invalid
   argument values.  */
9739 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9740        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9741        doc: /* For internal use only.
9742 usage: (define-coding-system-internal ...)  */)
9743      (nargs, args)
9744      int nargs;
9745      Lisp_Object *args;
9746 {
9747   Lisp_Object name;
9748   Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
9749   Lisp_Object attrs;            /* Vector of attributes.  */
9750   Lisp_Object eol_type;
9751   Lisp_Object aliases;
9752   Lisp_Object coding_type, charset_list, safe_charsets;
9753   enum coding_category category;
9754   Lisp_Object tail, val;
9755   int max_charset_id = 0;
9756   int i;
9757
       /* ARGS must hold at least the coding_arg_max common arguments.  */
9758   if (nargs < coding_arg_max)
9759     goto short_args;
9760
9761   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9762
9763   name = args[coding_arg_name];
9764   CHECK_SYMBOL (name);
9765   CODING_ATTR_BASE_NAME (attrs) = name;
9766
       /* The mnemonic is either a string or a character.  */
9767   val = args[coding_arg_mnemonic];
9768   if (! STRINGP (val))
9769     CHECK_CHARACTER (val);
9770   CODING_ATTR_MNEMONIC (attrs) = val;
9771
9772   coding_type = args[coding_arg_coding_type];
9773   CHECK_SYMBOL (coding_type);
9774   CODING_ATTR_TYPE (attrs) = coding_type;
9775
       /* The charset-list argument is either a symbol or a list of
          charsets.  The symbols Qiso_2022 and Qemacs_mule select
          Viso_2022_charset_list and Vemacs_mule_charset_list, and are
          accepted only for the coding type of the same name.  A list
          is copied and each element is replaced by its charset ID.
          MAX_CHARSET_ID ends up as the largest ID in the list.  */
9776   charset_list = args[coding_arg_charset_list];
9777   if (SYMBOLP (charset_list))
9778     {
9779       if (EQ (charset_list, Qiso_2022))
9780         {
9781           if (! EQ (coding_type, Qiso_2022))
9782             error ("Invalid charset-list");
9783           charset_list = Viso_2022_charset_list;
9784         }
9785       else if (EQ (charset_list, Qemacs_mule))
9786         {
9787           if (! EQ (coding_type, Qemacs_mule))
9788             error ("Invalid charset-list");
9789           charset_list = Vemacs_mule_charset_list;
9790         }
9791       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9792         if (max_charset_id < XFASTINT (XCAR (tail)))
9793           max_charset_id = XFASTINT (XCAR (tail));
9794     }
9795   else
9796     {
9797       charset_list = Fcopy_sequence (charset_list);
9798       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9799         {
9800           struct charset *charset;
9801
9802           val = XCAR (tail);
9803           CHECK_CHARSET_GET_CHARSET (val, charset);
               /* iso-2022 needs a charset with an ISO final byte,
                  emacs-mule one with an emacs-mule ID; the other
                  coding types put no restriction here.  */
9804           if (EQ (coding_type, Qiso_2022)
9805               ? CHARSET_ISO_FINAL (charset) < 0
9806               : EQ (coding_type, Qemacs_mule)
9807               ? CHARSET_EMACS_MULE_ID (charset) < 0
9808               : 0)
9809             error ("Can't handle charset `%s'",
9810                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9811
9812           XSETCAR (tail, make_number (charset->id));
9813           if (max_charset_id < charset->id)
9814             max_charset_id = charset->id;
9815         }
9816     }
9817   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9818
       /* SAFE_CHARSETS is a string of MAX_CHARSET_ID + 1 bytes indexed
          by charset ID: 0 for every charset in CHARSET_LIST, 255 for
          the others.  */
9819   safe_charsets = make_uninit_string (max_charset_id + 1);
9820   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9821   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9822     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9823   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9824
       /* Initial value only; several coding types override it below.  */
9825   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9826
       /* A translation table is a char-table, a cons, or a symbol.  */
9827   val = args[coding_arg_decode_translation_table];
9828   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9829     CHECK_SYMBOL (val);
9830   CODING_ATTR_DECODE_TBL (attrs) = val;
9831
9832   val = args[coding_arg_encode_translation_table];
9833   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9834     CHECK_SYMBOL (val);
9835   CODING_ATTR_ENCODE_TBL (attrs) = val;
9836
9837   val = args[coding_arg_post_read_conversion];
9838   CHECK_SYMBOL (val);
9839   CODING_ATTR_POST_READ (attrs) = val;
9840
9841   val = args[coding_arg_pre_write_conversion];
9842   CHECK_SYMBOL (val);
9843   CODING_ATTR_PRE_WRITE (attrs) = val;
9844
       /* A nil default-char argument means a space.  */
9845   val = args[coding_arg_default_char];
9846   if (NILP (val))
9847     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9848   else
9849     {
9850       CHECK_CHARACTER (val);
9851       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9852     }
9853
       /* Normalize the for-unibyte flag to nil or t.  */
9854   val = args[coding_arg_for_unibyte];
9855   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9856
9857   val = args[coding_arg_plist];
9858   CHECK_LIST (val);
9859   CODING_ATTR_PLIST (attrs) = val;
9860
       /* Checks specific to the coding type.  Every branch sets
          CATEGORY or signals an error.  */
9861   if (EQ (coding_type, Qcharset))
9862     {
9863       /* Generate a lisp vector of 256 elements.  Each element is nil,
9864          integer, or a list of charset IDs.
9865
9866          If Nth element is nil, the byte code N is invalid in this
9867          coding system.
9868
9869          If Nth element is a number NUM, N is the first byte of a
9870          charset whose ID is NUM.
9871
9872          If Nth element is a list of charset IDs, N is the first byte
9873          of one of them.  The list is sorted by dimensions of the
9874          charsets.  A charset of smaller dimension comes first.  */
9875       val = Fmake_vector (make_number (256), Qnil);
9876
9877       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9878         {
9879           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9880           int dim = CHARSET_DIMENSION (charset);
               /* Index into code_space of the byte range used for the
                  loop below (entries IDX and IDX + 1 are its bounds).  */
9881           int idx = (dim - 1) * 4;
9882
9883           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9884             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9885
9886           for (i = charset->code_space[idx];
9887                i <= charset->code_space[idx + 1]; i++)
9888             {
9889               Lisp_Object tmp, tmp2;
9890               int dim2;
9891
                   /* Merge this charset's ID into slot I: nil becomes
                      the ID, a number becomes a two-element list, and
                      a list gets the ID inserted before the first
                      element of larger dimension (or appended).  */
9892               tmp = AREF (val, i);
9893               if (NILP (tmp))
9894                 tmp = XCAR (tail);
9895               else if (NUMBERP (tmp))
9896                 {
9897                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9898                   if (dim < dim2)
9899                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9900                   else
9901                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9902                 }
9903               else
9904                 {
9905                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9906                     {
9907                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9908                       if (dim < dim2)
9909                         break;
9910                     }
9911                   if (NILP (tmp2))
9912                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9913                   else
9914                     {
                           /* Insert in place: duplicate the cell at
                              TMP2, then overwrite its car.  */
9915                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9916                       XSETCAR (tmp2, XCAR (tail));
9917                     }
9918                 }
9919               ASET (val, i, tmp);
9920             }
9921         }
9922       ASET (attrs, coding_attr_charset_valids, val);
9923       category = coding_category_charset;
9924     }
9925   else if (EQ (coding_type, Qccl))
9926     {
9927       Lisp_Object valids;
9928
9929       if (nargs < coding_arg_ccl_max)
9930         goto short_args;
9931
           /* A CCL program given as a vector is copied before it is
              stored.  */
9932       val = args[coding_arg_ccl_decoder];
9933       CHECK_CCL_PROGRAM (val);
9934       if (VECTORP (val))
9935         val = Fcopy_sequence (val);
9936       ASET (attrs, coding_attr_ccl_decoder, val);
9937
9938       val = args[coding_arg_ccl_encoder];
9939       CHECK_CCL_PROGRAM (val);
9940       if (VECTORP (val))
9941         val = Fcopy_sequence (val);
9942       ASET (attrs, coding_attr_ccl_encoder, val);
9943
           /* Build VALIDS, a 256-byte string whose Nth byte is 1 when
              byte N appears in the ccl-valids argument, either as an
              integer or inside a (FROM . TO) range, and 0 otherwise.
              Every bound must lie in 0..255 and TO must not be smaller
              than FROM.  */
9944       val = args[coding_arg_ccl_valids];
9945       valids = Fmake_string (make_number (256), make_number (0));
9946       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9947         {
9948           int from, to;
9949
9950           val = Fcar (tail);
9951           if (INTEGERP (val))
9952             {
9953               from = to = XINT (val);
9954               if (from < 0 || from > 255)
9955                 args_out_of_range_3 (val, make_number (0), make_number (255));
9956             }
9957           else
9958             {
9959               CHECK_CONS (val);
9960               CHECK_NATNUM_CAR (val);
9961               CHECK_NATNUM_CDR (val);
9962               from = XINT (XCAR (val));
9963               if (from > 255)
9964                 args_out_of_range_3 (XCAR (val),
9965                                      make_number (0), make_number (255));
9966               to = XINT (XCDR (val));
9967               if (to < from || to > 255)
9968                 args_out_of_range_3 (XCDR (val),
9969                                      XCAR (val), make_number (255));
9970             }
9971           for (i = from; i <= to; i++)
9972             SSET (valids, i, 1);
9973         }
9974       ASET (attrs, coding_attr_ccl_valids, valids);
9975
9976       category = coding_category_ccl;
9977     }
9978   else if (EQ (coding_type, Qutf_16))
9979     {
9980       Lisp_Object bom, endian;
9981
9982       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9983
9984       if (nargs < coding_arg_utf16_max)
9985         goto short_args;
9986
           /* BOM must be nil, t, or a cons of two coding systems.  */
9987       bom = args[coding_arg_utf16_bom];
9988       if (! NILP (bom) && ! EQ (bom, Qt))
9989         {
9990           CHECK_CONS (bom);
9991           val = XCAR (bom);
9992           CHECK_CODING_SYSTEM (val);
9993           val = XCDR (bom);
9994           CHECK_CODING_SYSTEM (val);
9995         }
9996       ASET (attrs, coding_attr_utf_bom, bom);
9997
           /* ENDIAN must be Qbig or Qlittle; nil means Qbig.  */
9998       endian = args[coding_arg_utf16_endian];
9999       CHECK_SYMBOL (endian);
10000       if (NILP (endian))
10001         endian = Qbig;
10002       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10003         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10004       ASET (attrs, coding_attr_utf_16_endian, endian);
10005
            /* A cons BOM selects the "auto" category, a nil BOM one of
               the "nosig" categories, and any other BOM (i.e. t) the
               plain big or little endian category.  */
10006       category = (CONSP (bom)
10007                   ? coding_category_utf_16_auto
10008                   : NILP (bom)
10009                   ? (EQ (endian, Qbig)
10010                      ? coding_category_utf_16_be_nosig
10011                      : coding_category_utf_16_le_nosig)
10012                   : (EQ (endian, Qbig)
10013                      ? coding_category_utf_16_be
10014                      : coding_category_utf_16_le));
10015     }
10016   else if (EQ (coding_type, Qiso_2022))
10017     {
10018       Lisp_Object initial, reg_usage, request, flags;
10019       int i;                    /* Shadows the function-level I.  */
10020
10021       if (nargs < coding_arg_iso2022_max)
10022         goto short_args;
10023
            /* INITIAL is a copy of the argument vector whose first four
               elements are converted in place: a charset becomes its
               ID, nil becomes -1.  An ASCII compatible charset in
               element 0 marks the coding system ASCII compatible.  */
10024       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10025       CHECK_VECTOR (initial);
10026       for (i = 0; i < 4; i++)
10027         {
10028           val = Faref (initial, make_number (i));
10029           if (! NILP (val))
10030             {
10031               struct charset *charset;
10032
10033               CHECK_CHARSET_GET_CHARSET (val, charset);
10034               ASET (initial, i, make_number (CHARSET_ID (charset)));
10035               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10036                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10037             }
10038           else
10039             ASET (initial, i, make_number (-1));
10040         }
10041
            /* REG_USAGE must be a cons of two numbers.  */
10042       reg_usage = args[coding_arg_iso2022_reg_usage];
10043       CHECK_CONS (reg_usage);
10044       CHECK_NUMBER_CAR (reg_usage);
10045       CHECK_NUMBER_CDR (reg_usage);
10046
            /* REQUEST is a list of (CHARSET . REG) conses with REG in
               0..3; each CHARSET is replaced by its charset ID.
               NOTE(review): Fcopy_sequence presumably copies only the
               list itself, so the XSETCAR below would modify the
               conses of the caller's list -- confirm.
               NOTE(review): the %d below assumes the XINT value can be
               passed as an int -- confirm against XINT's type.  */
10047       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10048       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
10049         {
10050           int id;
10051           Lisp_Object tmp;
10052
10053           val = Fcar (tail);
10054           CHECK_CONS (val);
10055           tmp = XCAR (val);
10056           CHECK_CHARSET_GET_ID (tmp, id);
10057           CHECK_NATNUM_CDR (val);
10058           if (XINT (XCDR (val)) >= 4)
10059             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
10060           XSETCAR (val, make_number (id));
10061         }
10062
            /* I keeps the flags as given; the stored FLAGS additionally
               get CODING_ISO_FLAG_FULL_SUPPORT when the charset-list
               argument was the symbol Qiso_2022.  */
10063       flags = args[coding_arg_iso2022_flags];
10064       CHECK_NATNUM (flags);
10065       i = XINT (flags);
10066       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10067         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
10068
10069       ASET (attrs, coding_attr_iso_initial, initial);
10070       ASET (attrs, coding_attr_iso_usage, reg_usage);
10071       ASET (attrs, coding_attr_iso_request, request);
10072       ASET (attrs, coding_attr_iso_flags, flags);
10073       setup_iso_safe_charsets (attrs);
10074
            /* Pick the category from the flags, from whether the
               charset-list argument was Qiso_2022, and (8-bit case)
               from the charset in element 1 of INITIAL.  */
10075       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10076         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10077                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10078                     ? coding_category_iso_7_else
10079                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10080                     ? coding_category_iso_7
10081                     : coding_category_iso_7_tight);
10082       else
10083         {
10084           int id = XINT (AREF (initial, 1));
10085
10086           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10087                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10088                        || id < 0)
10089                       ? coding_category_iso_8_else
10090                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10091                       ? coding_category_iso_8_1
10092                       : coding_category_iso_8_2);
10093         }
            /* Only the iso_8_1 and iso_8_2 categories keep the ASCII
               compatibility decided above.  */
10094       if (category != coding_category_iso_8_1
10095           && category != coding_category_iso_8_2)
10096         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
10097     }
10098   else if (EQ (coding_type, Qemacs_mule))
10099     {
10100       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10101         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10102       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10103       category = coding_category_emacs_mule;
10104     }
10105   else if (EQ (coding_type, Qshift_jis))
10106     {
10107
10108       struct charset *charset;
10109
            /* The charset list must hold three or four charsets whose
               dimensions are 1, 1, 2 and (if present) 2, in this order.
               From here on CHARSET_LIST is only a cursor; the list
               stored in ATTRS above is not changed.  */
10110       if (XINT (Flength (charset_list)) != 3
10111           && XINT (Flength (charset_list)) != 4)
10112         error ("There should be three or four charsets");
10113
10114       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10115       if (CHARSET_DIMENSION (charset) != 1)
10116         error ("Dimension of charset %s is not one",
10117                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10118       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10119         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10120
10121       charset_list = XCDR (charset_list);
10122       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10123       if (CHARSET_DIMENSION (charset) != 1)
10124         error ("Dimension of charset %s is not one",
10125                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10126
10127       charset_list = XCDR (charset_list);
10128       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10129       if (CHARSET_DIMENSION (charset) != 2)
10130         error ("Dimension of charset %s is not two",
10131                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10132
10133       charset_list = XCDR (charset_list);
10134       if (! NILP (charset_list))
10135         {
10136           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10137           if (CHARSET_DIMENSION (charset) != 2)
10138             error ("Dimension of charset %s is not two",
10139                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10140         }
10141
10142       category = coding_category_sjis;
            /* Remember the most recently defined shift-jis coding
               system.  */
10143       Vsjis_coding_system = name;
10144     }
10145   else if (EQ (coding_type, Qbig5))
10146     {
10147       struct charset *charset;
10148
            /* Exactly two charsets: dimension 1, then dimension 2.  */
10149       if (XINT (Flength (charset_list)) != 2)
10150         error ("There should be just two charsets");
10151
10152       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10153       if (CHARSET_DIMENSION (charset) != 1)
10154         error ("Dimension of charset %s is not one",
10155                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10156       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10157         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10158
10159       charset_list = XCDR (charset_list);
10160       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10161       if (CHARSET_DIMENSION (charset) != 2)
10162         error ("Dimension of charset %s is not two",
10163                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10164
10165       category = coding_category_big5;
            /* Remember the most recently defined big5 coding system.  */
10166       Vbig5_coding_system = name;
10167     }
10168   else if (EQ (coding_type, Qraw_text))
10169     {
10170       category = coding_category_raw_text;
10171       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10172     }
10173   else if (EQ (coding_type, Qutf_8))
10174     {
10175       Lisp_Object bom;
10176
10177       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10178
10179       if (nargs < coding_arg_utf8_max)
10180         goto short_args;
10181
            /* BOM must be nil, t, or a cons of two coding systems.  */
10182       bom = args[coding_arg_utf8_bom];
10183       if (! NILP (bom) && ! EQ (bom, Qt))
10184         {
10185           CHECK_CONS (bom);
10186           val = XCAR (bom);
10187           CHECK_CODING_SYSTEM (val);
10188           val = XCDR (bom);
10189           CHECK_CODING_SYSTEM (val);
10190         }
10191       ASET (attrs, coding_attr_utf_bom, bom);
10192
10193       category = (CONSP (bom) ? coding_category_utf_8_auto
10194                   : NILP (bom) ? coding_category_utf_8_nosig
10195                   : coding_category_utf_8_sig);
10196     }
10197   else if (EQ (coding_type, Qundecided))
10198     category = coding_category_undecided;
10199   else
10200     error ("Invalid coding system type: %s",
10201            SDATA (SYMBOL_NAME (coding_type)));
10202
        /* Record the category in ATTRS, and put both the category
           symbol and the final ASCII compatibility flag at the front of
           the property list.  */
10203   CODING_ATTR_CATEGORY (attrs) = make_number (category);
10204   CODING_ATTR_PLIST (attrs)
10205     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10206                                 CODING_ATTR_PLIST (attrs)));
10207   CODING_ATTR_PLIST (attrs)
10208     = Fcons (QCascii_compatible_p,
10209              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10210                     CODING_ATTR_PLIST (attrs)));
10211
        /* The eol-type argument must be nil, Qunix, Qdos or Qmac.  */
10212   eol_type = args[coding_arg_eol_type];
10213   if (! NILP (eol_type)
10214       && ! EQ (eol_type, Qunix)
10215       && ! EQ (eol_type, Qdos)
10216       && ! EQ (eol_type, Qmac))
10217     error ("Invalid eol-type");
10218
10219   aliases = Fcons (name, Qnil);
10220
        /* With a nil eol-type, register the three subsidiaries named by
           make_subsidiaries.  Each one shares ATTRS, has itself as its
           only alias and a fixed eol-type (Qunix, Qdos, Qmac in this
           order).  EOL_TYPE of the base then is the vector of their
           names.  */
10221   if (NILP (eol_type))
10222     {
10223       eol_type = make_subsidiaries (name);
10224       for (i = 0; i < 3; i++)
10225         {
10226           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10227
10228           this_name = AREF (eol_type, i);
10229           this_aliases = Fcons (this_name, Qnil);
10230           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10231           this_spec = Fmake_vector (make_number (3), attrs);
10232           ASET (this_spec, 1, this_aliases);
10233           ASET (this_spec, 2, this_eol_type);
10234           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10235           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10236           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10237           if (NILP (val))
10238             Vcoding_system_alist
10239               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10240                        Vcoding_system_alist);
10241         }
10242     }
10243
        /* Register NAME itself; element 0 of SPEC_VEC is ATTRS.  */
10244   spec_vec = Fmake_vector (make_number (3), attrs);
10245   ASET (spec_vec, 1, aliases);
10246   ASET (spec_vec, 2, eol_type);
10247
10248   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10249   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10250   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10251   if (NILP (val))
10252     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10253                                   Vcoding_system_alist);
10254
        /* Set up the category's entry from NAME when the category has
           no coding system yet (negative id), or when NAME already is
           the coding system recorded for it.  */
10255   {
10256     int id = coding_categories[category].id;
10257
10258     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10259       setup_coding_system (name, &coding_categories[category]);
10260   }
10261
10262   return Qnil;
10263
        /* Reached when NARGS is too small for the coding type.  */
10264  short_args:
10265   return Fsignal (Qwrong_number_of_arguments,
10266                   Fcons (intern ("define-coding-system-internal"),
10267                          make_number (nargs)));
10268 }
10269
10270
10271 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10272        3, 3, 0,
10273        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10274   (coding_system, prop, val)
10275      Lisp_Object coding_system, prop, val;
10276 {
10277   Lisp_Object spec, attrs;
10278
10279   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10280   attrs = AREF (spec, 0);
10281   if (EQ (prop, QCmnemonic))
10282     {
10283       if (! STRINGP (val))
10284         CHECK_CHARACTER (val);
10285       CODING_ATTR_MNEMONIC (attrs) = val;
10286     }
10287   else if (EQ (prop, QCdefault_char))
10288     {
10289       if (NILP (val))
10290         val = make_number (' ');
10291       else
10292         CHECK_CHARACTER (val);
10293       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10294     }
10295   else if (EQ (prop, QCdecode_translation_table))
10296     {
10297       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10298         CHECK_SYMBOL (val);
10299       CODING_ATTR_DECODE_TBL (attrs) = val;
10300     }
10301   else if (EQ (prop, QCencode_translation_table))
10302     {
10303       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10304         CHECK_SYMBOL (val);
10305       CODING_ATTR_ENCODE_TBL (attrs) = val;
10306     }
10307   else if (EQ (prop, QCpost_read_conversion))
10308     {
10309       CHECK_SYMBOL (val);
10310       CODING_ATTR_POST_READ (attrs) = val;
10311     }
10312   else if (EQ (prop, QCpre_write_conversion))
10313     {
10314       CHECK_SYMBOL (val);
10315       CODING_ATTR_PRE_WRITE (attrs) = val;
10316     }
10317   else if (EQ (prop, QCascii_compatible_p))
10318     {
10319       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10320     }
10321
10322   CODING_ATTR_PLIST (attrs)
10323     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10324   return val;
10325 }
10326
10327
10328 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10329        Sdefine_coding_system_alias, 2, 2, 0,
10330        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10331      (alias, coding_system)
10332      Lisp_Object alias, coding_system;
10333 {
10334   Lisp_Object spec, aliases, eol_type, val;
10335
10336   CHECK_SYMBOL (alias);
10337   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10338   aliases = AREF (spec, 1);
10339   /* ALIASES should be a list of length more than zero, and the first
10340      element is a base coding system.  Append ALIAS at the tail of the
10341      list.  */
10342   while (!NILP (XCDR (aliases)))
10343     aliases = XCDR (aliases);
10344   XSETCDR (aliases, Fcons (alias, Qnil));
10345
10346   eol_type = AREF (spec, 2);
10347   if (VECTORP (eol_type))
10348     {
10349       Lisp_Object subsidiaries;
10350       int i;
10351
10352       subsidiaries = make_subsidiaries (alias);
10353       for (i = 0; i < 3; i++)
10354         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10355                                      AREF (eol_type, i));
10356     }
10357
10358   Fputhash (alias, spec, Vcoding_system_hash_table);
10359   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10360   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10361   if (NILP (val))
10362     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10363                                   Vcoding_system_alist);
10364
10365   return Qnil;
10366 }
10367
10368 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10369        1, 1, 0,
10370        doc: /* Return the base of CODING-SYSTEM.
10371 Any alias or subsidiary coding system is not a base coding system.  */)
10372   (coding_system)
10373      Lisp_Object coding_system;
10374 {
10375   Lisp_Object spec, attrs;
10376
10377   if (NILP (coding_system))
10378     return (Qno_conversion);
10379   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10380   attrs = AREF (spec, 0);
10381   return CODING_ATTR_BASE_NAME (attrs);
10382 }
10383
10384 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10385        1, 1, 0,
10386        doc: "Return the property list of CODING-SYSTEM.")
10387      (coding_system)
10388      Lisp_Object coding_system;
10389 {
10390   Lisp_Object spec, attrs;
10391
10392   if (NILP (coding_system))
10393     coding_system = Qno_conversion;
10394   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10395   attrs = AREF (spec, 0);
10396   return CODING_ATTR_PLIST (attrs);
10397 }
10398
10399
10400 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10401        1, 1, 0,
10402        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10403      (coding_system)
10404      Lisp_Object coding_system;
10405 {
10406   Lisp_Object spec;
10407
10408   if (NILP (coding_system))
10409     coding_system = Qno_conversion;
10410   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10411   return AREF (spec, 1);
10412 }
10413
10414 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10415        Scoding_system_eol_type, 1, 1, 0,
10416        doc: /* Return eol-type of CODING-SYSTEM.
10417 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10418
10419 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10420 and CR respectively.
10421
10422 A vector value indicates that a format of end-of-line should be
10423 detected automatically.  Nth element of the vector is the subsidiary
10424 coding system whose eol-type is N.  */)
10425      (coding_system)
10426      Lisp_Object coding_system;
10427 {
10428   Lisp_Object spec, eol_type;
10429   int n;
10430
10431   if (NILP (coding_system))
10432     coding_system = Qno_conversion;
10433   if (! CODING_SYSTEM_P (coding_system))
10434     return Qnil;
10435   spec = CODING_SYSTEM_SPEC (coding_system);
10436   eol_type = AREF (spec, 2);
10437   if (VECTORP (eol_type))
10438     return Fcopy_sequence (eol_type);
10439   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10440   return make_number (n);
10441 }
10442
10443 #endif /* emacs */
10444
10445 \f
10446 /*** 12. Postamble ***/
10447
10448 void
10449 init_coding_once ()
10450 {
10451   int i;
10452
10453   for (i = 0; i < coding_category_max; i++)
10454     {
10455       coding_categories[i].id = -1;
10456       coding_priorities[i] = i;
10457     }
10458
10459   /* ISO2022 specific initialize routine.  */
10460   for (i = 0; i < 0x20; i++)
10461     iso_code_class[i] = ISO_control_0;
10462   for (i = 0x21; i < 0x7F; i++)
10463     iso_code_class[i] = ISO_graphic_plane_0;
10464   for (i = 0x80; i < 0xA0; i++)
10465     iso_code_class[i] = ISO_control_1;
10466   for (i = 0xA1; i < 0xFF; i++)
10467     iso_code_class[i] = ISO_graphic_plane_1;
10468   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10469   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10470   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10471   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10472   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10473   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10474   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10475   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10476   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10477
10478   for (i = 0; i < 256; i++)
10479     {
10480       emacs_mule_bytes[i] = 1;
10481     }
10482   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10483   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10484   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10485   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10486 }
10487
10488 #ifdef emacs
10489
10490 void
10491 syms_of_coding ()
10492 {
        /* Set up the Lisp-visible state of this file.  In order, the body
           registers the file's Lisp_Object variables with staticpro, creates
           symbols with DEFSYM and attaches properties to some of them,
           fills Vcoding_category_table, registers the S... objects with
           defsubr, declares variables with DEFVAR_LISP / DEFVAR_BOOL and
           assigns their initial values, and finally defines the coding
           systems `no-conversion' and `undecided'.  Takes no arguments and
           returns nothing.  */
10493   staticpro (&Vcoding_system_hash_table);
        /* Create the table with the two arguments QCtest and Qeq.  */
10494   {
10495     Lisp_Object args[2];
10496     args[0] = QCtest;
10497     args[1] = Qeq;
10498     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10499   }
10500
10501   staticpro (&Vsjis_coding_system);
10502   Vsjis_coding_system = Qnil;
10503
10504   staticpro (&Vbig5_coding_system);
10505   Vbig5_coding_system = Qnil;
10506
10507   staticpro (&Vcode_conversion_reused_workbuf);
10508   Vcode_conversion_reused_workbuf = Qnil;
10509
10510   staticpro (&Vcode_conversion_workbuf_name);
10511   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10512
10513   reused_workbuf_in_use = 0;
10514
10515   DEFSYM (Qcharset, "charset");
10516   DEFSYM (Qtarget_idx, "target-idx");
10517   DEFSYM (Qcoding_system_history, "coding-system-history");
        /* Give `coding-system-history' the value nil.  */
10518   Fset (Qcoding_system_history, Qnil);
10519
        /* The `target-idx' property stored on each operation symbol below is
           the zero-based position of the argument named in the accompanying
           comment (0 = first argument, 2 = third, 3 = fourth).  */
10520   /* Target FILENAME is the first argument.  */
10521   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10522   /* Target FILENAME is the third argument.  */
10523   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10524
10525   DEFSYM (Qcall_process, "call-process");
10526   /* Target PROGRAM is the first argument.  */
10527   Fput (Qcall_process, Qtarget_idx, make_number (0));
10528
10529   DEFSYM (Qcall_process_region, "call-process-region");
10530   /* Target PROGRAM is the third argument.  */
10531   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10532
10533   DEFSYM (Qstart_process, "start-process");
10534   /* Target PROGRAM is the third argument.  */
10535   Fput (Qstart_process, Qtarget_idx, make_number (2));
10536
10537   DEFSYM (Qopen_network_stream, "open-network-stream");
10538   /* Target SERVICE is the fourth argument.  */
10539   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10540
10541   DEFSYM (Qcoding_system, "coding-system");
10542   DEFSYM (Qcoding_aliases, "coding-aliases");
10543
10544   DEFSYM (Qeol_type, "eol-type");
10545   DEFSYM (Qunix, "unix");
10546   DEFSYM (Qdos, "dos");
10547
10548   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10549   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10550   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10551   DEFSYM (Qdefault_char, "default-char");
10552   DEFSYM (Qundecided, "undecided");
10553   DEFSYM (Qno_conversion, "no-conversion");
10554   DEFSYM (Qraw_text, "raw-text");
10555
10556   DEFSYM (Qiso_2022, "iso-2022");
10557
10558   DEFSYM (Qutf_8, "utf-8");
10559   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10560
10561   DEFSYM (Qutf_16, "utf-16");
10562   DEFSYM (Qbig, "big");
10563   DEFSYM (Qlittle, "little");
10564
10565   DEFSYM (Qshift_jis, "shift-jis");
10566   DEFSYM (Qbig5, "big5");
10567
10568   DEFSYM (Qcoding_system_p, "coding-system-p");
10569
10570   DEFSYM (Qcoding_system_error, "coding-system-error");
        /* Attach to `coding-system-error' the Qerror_conditions list
           (coding-system-error error) and the Qerror_message string.  */
10571   Fput (Qcoding_system_error, Qerror_conditions,
10572         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10573   Fput (Qcoding_system_error, Qerror_message,
10574         make_pure_c_string ("Invalid coding system"));
10575
10576   /* Intern this now in case it isn't already done.
10577      Setting this variable twice is harmless.
10578      But don't staticpro it here--that is done in alloc.c.  */
10579   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10580
10581   DEFSYM (Qtranslation_table, "translation-table");
        /* Store 2 under the Qchar_table_extra_slots property of
           `translation-table'.  */
10582   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10583   DEFSYM (Qtranslation_table_id, "translation-table-id");
10584   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10585   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10586
10587   DEFSYM (Qvalid_codes, "valid-codes");
10588
10589   DEFSYM (Qemacs_mule, "emacs-mule");
10590
10591   DEFSYM (QCcategory, ":category");
10592   DEFSYM (QCmnemonic, ":mnemonic");
10593   DEFSYM (QCdefault_char, ":default-char");
10594   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10595   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10596   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10597   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10598   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10599
        /* Vcoding_category_table gets coding_category_max slots, initially
           nil; the ASETs below store one `coding-category-...' symbol per
           coding_category_* index.  */
10600   Vcoding_category_table
10601     = Fmake_vector (make_number (coding_category_max), Qnil);
10602   staticpro (&Vcoding_category_table);
10603   /* Followings are target of code detection.  */
10604   ASET (Vcoding_category_table, coding_category_iso_7,
10605         intern_c_string ("coding-category-iso-7"));
10606   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10607         intern_c_string ("coding-category-iso-7-tight"));
10608   ASET (Vcoding_category_table, coding_category_iso_8_1,
10609         intern_c_string ("coding-category-iso-8-1"));
10610   ASET (Vcoding_category_table, coding_category_iso_8_2,
10611         intern_c_string ("coding-category-iso-8-2"));
10612   ASET (Vcoding_category_table, coding_category_iso_7_else,
10613         intern_c_string ("coding-category-iso-7-else"));
10614   ASET (Vcoding_category_table, coding_category_iso_8_else,
10615         intern_c_string ("coding-category-iso-8-else"));
10616   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10617         intern_c_string ("coding-category-utf-8-auto"));
        /* Note that this slot's symbol is `coding-category-utf-8', with no
           `-nosig' suffix, unlike the two utf-16 nosig slots further down.  */
10618   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10619         intern_c_string ("coding-category-utf-8"));
10620   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10621         intern_c_string ("coding-category-utf-8-sig"));
10622   ASET (Vcoding_category_table, coding_category_utf_16_be,
10623         intern_c_string ("coding-category-utf-16-be"));
10624   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10625         intern_c_string ("coding-category-utf-16-auto"));
10626   ASET (Vcoding_category_table, coding_category_utf_16_le,
10627         intern_c_string ("coding-category-utf-16-le"));
10628   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10629         intern_c_string ("coding-category-utf-16-be-nosig"));
10630   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10631         intern_c_string ("coding-category-utf-16-le-nosig"));
10632   ASET (Vcoding_category_table, coding_category_charset,
10633         intern_c_string ("coding-category-charset"));
10634   ASET (Vcoding_category_table, coding_category_sjis,
10635         intern_c_string ("coding-category-sjis"));
10636   ASET (Vcoding_category_table, coding_category_big5,
10637         intern_c_string ("coding-category-big5"));
10638   ASET (Vcoding_category_table, coding_category_ccl,
10639         intern_c_string ("coding-category-ccl"));
10640   ASET (Vcoding_category_table, coding_category_emacs_mule,
10641         intern_c_string ("coding-category-emacs-mule"));
10642   /* Followings are NOT target of code detection.  */
10643   ASET (Vcoding_category_table, coding_category_raw_text,
10644         intern_c_string ("coding-category-raw-text"));
10645   ASET (Vcoding_category_table, coding_category_undecided,
10646         intern_c_string ("coding-category-undecided"));
10647
        /* The first five symbols are the values listed in the doc string of
           `last-code-conversion-error' below.  */
10648   DEFSYM (Qinsufficient_source, "insufficient-source");
10649   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10650   DEFSYM (Qinvalid_source, "invalid-source");
10651   DEFSYM (Qinterrupted, "interrupted");
10652   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10653   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10654
        /* Register each of the following S... objects with defsubr.  */
10655   defsubr (&Scoding_system_p);
10656   defsubr (&Sread_coding_system);
10657   defsubr (&Sread_non_nil_coding_system);
10658   defsubr (&Scheck_coding_system);
10659   defsubr (&Sdetect_coding_region);
10660   defsubr (&Sdetect_coding_string);
10661   defsubr (&Sfind_coding_systems_region_internal);
10662   defsubr (&Sunencodable_char_position);
10663   defsubr (&Scheck_coding_systems_region);
10664   defsubr (&Sdecode_coding_region);
10665   defsubr (&Sencode_coding_region);
10666   defsubr (&Sdecode_coding_string);
10667   defsubr (&Sencode_coding_string);
10668   defsubr (&Sdecode_sjis_char);
10669   defsubr (&Sencode_sjis_char);
10670   defsubr (&Sdecode_big5_char);
10671   defsubr (&Sencode_big5_char);
10672   defsubr (&Sset_terminal_coding_system_internal);
10673   defsubr (&Sset_safe_terminal_coding_system_internal);
10674   defsubr (&Sterminal_coding_system);
10675   defsubr (&Sset_keyboard_coding_system_internal);
10676   defsubr (&Skeyboard_coding_system);
10677   defsubr (&Sfind_operation_coding_system);
10678   defsubr (&Sset_coding_system_priority);
10679   defsubr (&Sdefine_coding_system_internal);
10680   defsubr (&Sdefine_coding_system_alias);
10681   defsubr (&Scoding_system_put);
10682   defsubr (&Scoding_system_base);
10683   defsubr (&Scoding_system_plist);
10684   defsubr (&Scoding_system_aliases);
10685   defsubr (&Scoding_system_eol_type);
10686   defsubr (&Scoding_system_priority_list);
10687
10688   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10689                doc: /* List of coding systems.
10690
10691 Do not alter the value of this variable manually.  This variable should be
10692 updated by the functions `define-coding-system' and
10693 `define-coding-system-alias'.  */);
10694   Vcoding_system_list = Qnil;
10695
        /* NOTE(review): the doc string below names `make-coding-system',
           whereas the one for `coding-system-list' above names
           `define-coding-system' -- confirm which function is meant.  */
10696   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10697                doc: /* Alist of coding system names.
10698 Each element is one element list of coding system name.
10699 This variable is given to `completing-read' as COLLECTION argument.
10700
10701 Do not alter the value of this variable manually.  This variable should be
10702 updated by the functions `make-coding-system' and
10703 `define-coding-system-alias'.  */);
10704   Vcoding_system_alist = Qnil;
10705
        /* NOTE(review): the doc string below refers to `set-coding-priority',
           while the subr registered above is Sset_coding_system_priority --
           confirm the doc names the intended function.  */
10706   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10707                doc: /* List of coding-categories (symbols) ordered by priority.
10708
10709 On detecting a coding system, Emacs tries code detection algorithms
10710 associated with each coding-category one by one in this order.  When
10711 one algorithm agrees with a byte sequence of source text, the coding
10712 system bound to the corresponding coding-category is selected.
10713
10714 Don't modify this variable directly, but use `set-coding-priority'.  */);
        /* Cons the table entries from the highest index down, so the finished
           list has the same order as Vcoding_category_table.  */
10715   {
10716     int i;
10717
10718     Vcoding_category_list = Qnil;
10719     for (i = coding_category_max - 1; i >= 0; i--)
10720       Vcoding_category_list
10721         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10722                  Vcoding_category_list);
10723   }
10724
10725   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10726                doc: /* Specify the coding system for read operations.
10727 It is useful to bind this variable with `let', but do not set it globally.
10728 If the value is a coding system, it is used for decoding on read operation.
10729 If not, an appropriate element is used from one of the coding system alists.
10730 There are three such tables: `file-coding-system-alist',
10731 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10732   Vcoding_system_for_read = Qnil;
10733
10734   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10735                doc: /* Specify the coding system for write operations.
10736 Programs bind this variable with `let', but you should not set it globally.
10737 If the value is a coding system, it is used for encoding of output,
10738 when writing it to a file and when sending it to a file or subprocess.
10739
10740 If this does not specify a coding system, an appropriate element
10741 is used from one of the coding system alists.
10742 There are three such tables: `file-coding-system-alist',
10743 `process-coding-system-alist', and `network-coding-system-alist'.
10744 For output to files, if the above procedure does not specify a coding system,
10745 the value of `buffer-file-coding-system' is used.  */);
10746   Vcoding_system_for_write = Qnil;
10747
10748   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10749                doc: /*
10750 Coding system used in the latest file or process I/O.  */);
10751   Vlast_coding_system_used = Qnil;
10752
10753   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10754                doc: /*
10755 Error status of the last code conversion.
10756
10757 When an error was detected in the last code conversion, this variable
10758 is set to one of the following symbols.
10759   `insufficient-source'
10760   `inconsistent-eol'
10761   `invalid-source'
10762   `interrupted'
10763   `insufficient-memory'
10764 When no error was detected, the value doesn't change.  So, to check
10765 the error status of a code conversion by this variable, you must
10766 explicitly set this variable to nil before performing code
10767 conversion.  */);
10768   Vlast_code_conversion_error = Qnil;
10769
10770   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10771                doc: /*
10772 *Non-nil means always inhibit code conversion of end-of-line format.
10773 See info node `Coding Systems' and info node `Text and Binary' concerning
10774 such conversion.  */);
10775   inhibit_eol_conversion = 0;
10776
10777   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10778                doc: /*
10779 Non-nil means process buffer inherits coding system of process output.
10780 Bind it to t if the process output is to be treated as if it were a file
10781 read from some filesystem.  */);
10782   inherit_process_coding_system = 0;
10783
10784   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10785                doc: /*
10786 Alist to decide a coding system to use for a file I/O operation.
10787 The format is ((PATTERN . VAL) ...),
10788 where PATTERN is a regular expression matching a file name,
10789 VAL is a coding system, a cons of coding systems, or a function symbol.
10790 If VAL is a coding system, it is used for both decoding and encoding
10791 the file contents.
10792 If VAL is a cons of coding systems, the car part is used for decoding,
10793 and the cdr part is used for encoding.
10794 If VAL is a function symbol, the function must return a coding system
10795 or a cons of coding systems which are used as above.  The function is
10796 called with an argument that is a list of the arguments with which
10797 `find-operation-coding-system' was called.  If the function can't decide
10798 a coding system, it can return `undecided' so that the normal
10799 code-detection is performed.
10800
10801 See also the function `find-operation-coding-system'
10802 and the variable `auto-coding-alist'.  */);
10803   Vfile_coding_system_alist = Qnil;
10804
10805   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10806                doc: /*
10807 Alist to decide a coding system to use for a process I/O operation.
10808 The format is ((PATTERN . VAL) ...),
10809 where PATTERN is a regular expression matching a program name,
10810 VAL is a coding system, a cons of coding systems, or a function symbol.
10811 If VAL is a coding system, it is used for both decoding what received
10812 from the program and encoding what sent to the program.
10813 If VAL is a cons of coding systems, the car part is used for decoding,
10814 and the cdr part is used for encoding.
10815 If VAL is a function symbol, the function must return a coding system
10816 or a cons of coding systems which are used as above.
10817
10818 See also the function `find-operation-coding-system'.  */);
10819   Vprocess_coding_system_alist = Qnil;
10820
10821   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10822                doc: /*
10823 Alist to decide a coding system to use for a network I/O operation.
10824 The format is ((PATTERN . VAL) ...),
10825 where PATTERN is a regular expression matching a network service name
10826 or is a port number to connect to,
10827 VAL is a coding system, a cons of coding systems, or a function symbol.
10828 If VAL is a coding system, it is used for both decoding what received
10829 from the network stream and encoding what sent to the network stream.
10830 If VAL is a cons of coding systems, the car part is used for decoding,
10831 and the cdr part is used for encoding.
10832 If VAL is a function symbol, the function must return a coding system
10833 or a cons of coding systems which are used as above.
10834
10835 See also the function `find-operation-coding-system'.  */);
10836   Vnetwork_coding_system_alist = Qnil;
10837
10838   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10839                doc: /* Coding system to use with system messages.
10840 Also used for decoding keyboard input on X Window system.  */);
10841   Vlocale_coding_system = Qnil;
10842
10843   /* The eol mnemonics are reset in startup.el system-dependently.  */
10844   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10845                doc: /*
10846 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10847   eol_mnemonic_unix = make_pure_c_string (":");
10848
10849   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10850                doc: /*
10851 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10852   eol_mnemonic_dos = make_pure_c_string ("\\");
10853
10854   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10855                doc: /*
10856 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10857   eol_mnemonic_mac = make_pure_c_string ("/");
10858
10859   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10860                doc: /*
10861 *String displayed in mode line when end-of-line format is not yet determined.  */);
10862   eol_mnemonic_undecided = make_pure_c_string (":");
10863
10864   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10865                doc: /*
10866 *Non-nil enables character translation while encoding and decoding.  */);
10867   Venable_character_translation = Qt;
10868
10869   DEFVAR_LISP ("standard-translation-table-for-decode",
10870                &Vstandard_translation_table_for_decode,
10871                doc: /* Table for translating characters while decoding.  */);
10872   Vstandard_translation_table_for_decode = Qnil;
10873
10874   DEFVAR_LISP ("standard-translation-table-for-encode",
10875                &Vstandard_translation_table_for_encode,
10876                doc: /* Table for translating characters while encoding.  */);
10877   Vstandard_translation_table_for_encode = Qnil;
10878
10879   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10880                doc: /* Alist of charsets vs revision numbers.
10881 While encoding, if a charset (car part of an element) is found,
10882 designate it with the escape sequence identifying revision (cdr part
10883 of the element).  */);
10884   Vcharset_revision_table = Qnil;
10885
10886   DEFVAR_LISP ("default-process-coding-system",
10887                &Vdefault_process_coding_system,
10888                doc: /* Cons of coding systems used for process I/O by default.
10889 The car part is used for decoding a process output,
10890 the cdr part is used for encoding a text to be sent to a process.  */);
10891   Vdefault_process_coding_system = Qnil;
10892
10893   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10894                doc: /*
10895 Table of extra Latin codes in the range 128..159 (inclusive).
10896 This is a vector of length 256.
10897 If Nth element is non-nil, the existence of code N in a file
10898 \(or output of subprocess) doesn't prevent it to be detected as
10899 a coding system of ISO 2022 variant which has a flag
10900 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10901 or reading output of a subprocess.
10902 Only 128th through 159th elements have a meaning.  */);
        /* The initial value is a 256-element vector whose elements are all
           nil.  */
10903   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10904
10905   DEFVAR_LISP ("select-safe-coding-system-function",
10906                &Vselect_safe_coding_system_function,
10907                doc: /*
10908 Function to call to select safe coding system for encoding a text.
10909
10910 If set, this function is called to force a user to select a proper
10911 coding system which can encode the text in the case that a default
10912 coding system used in each operation can't encode the text.  The
10913 function should take care that the buffer is not modified while
10914 the coding system is being selected.
10915
10916 The default value is `select-safe-coding-system' (which see).  */);
        /* NOTE(review): the doc string says the default is
           `select-safe-coding-system', but the value assigned here is nil;
           presumably the default is installed elsewhere -- confirm.  */
10917   Vselect_safe_coding_system_function = Qnil;
10918
10919   DEFVAR_BOOL ("coding-system-require-warning",
10920                &coding_system_require_warning,
10921                doc: /* Internal use only.
10922 If non-nil, on writing a file, `select-safe-coding-system-function' is
10923 called even if `coding-system-for-write' is non-nil.  The command
10924 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10925   coding_system_require_warning = 0;
10926
10927
10928   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10929                &inhibit_iso_escape_detection,
10930                doc: /*
10931 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10932
10933 When Emacs reads text, it tries to detect how the text is encoded.
10934 This code detection is sensitive to escape sequences.  If Emacs sees
10935 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10936 of the ISO2022 encodings, and decodes text by the corresponding coding
10937 system (e.g. `iso-2022-7bit').
10938
10939 However, there may be a case that you want to read escape sequences in
10940 a file as is.  In such a case, you can set this variable to non-nil.
10941 Then the code detection will ignore any escape sequences, and no text is
10942 detected as encoded in some ISO-2022 encoding.  The result is that all
10943 escape sequences become visible in a buffer.
10944
10945 The default value is nil, and it is strongly recommended not to change
10946 it.  That is because many Emacs Lisp source files that contain
10947 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10948 in Emacs's distribution, and they won't be decoded correctly on
10949 reading if you suppress escape sequence detection.
10950
10951 The other way to read escape sequences in a file without decoding is
10952 to explicitly specify some coding system that doesn't use ISO-2022
10953 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10954   inhibit_iso_escape_detection = 0;
10955
10956   DEFVAR_BOOL ("inhibit-null-byte-detection",
10957                &inhibit_null_byte_detection,
10958                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10959 By default, Emacs treats it as binary data, and does not attempt to
10960 decode it.  The effect is as if you specified `no-conversion' for
10961 reading that text.
10962
10963 Set this to non-nil when a regular text happens to include null bytes.
10964 Examples are Index nodes of Info files and null-byte delimited output
10965 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10966 decode text as usual.  */);
10967   inhibit_null_byte_detection = 0;
10968
10969   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10970                doc: /* Char table for translating self-inserting characters.
10971 This is applied to the result of input methods, not their input.
10972 See also `keyboard-translate-table'.
10973
10974 Use of this variable for character code unification was rendered
10975 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10976 internal character representation.  */);
10977     Vtranslation_table_for_input = Qnil;
10978
        /* Define the coding systems `no-conversion' and `undecided'.  ARGS
           holds the arguments indexed by coding_arg_*; PLIST holds the same
           settings as keyword/value pairs and is turned into a list that is
           stored in args[coding_arg_plist].  */
10979   {
10980     Lisp_Object args[coding_arg_max];
10981     Lisp_Object plist[16];
10982     int i;
10983
          /* Every argument not assigned explicitly below stays nil.  */
10984     for (i = 0; i < coding_arg_max; i++)
10985       args[i] = Qnil;
10986
10987     plist[0] = intern_c_string (":name");
10988     plist[1] = args[coding_arg_name] = Qno_conversion;
10989     plist[2] = intern_c_string (":mnemonic");
10990     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10991     plist[4] = intern_c_string (":coding-type");
10992     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10993     plist[6] = intern_c_string (":ascii-compatible-p");
10994     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10995     plist[8] = intern_c_string (":default-char");
10996     plist[9] = args[coding_arg_default_char] = make_number (0);
10997     plist[10] = intern_c_string (":for-unibyte");
10998     plist[11] = args[coding_arg_for_unibyte] = Qt;
10999     plist[12] = intern_c_string (":docstring");
11000     plist[13] = make_pure_c_string ("Do no conversion.\n\
11001 \n\
11002 When you visit a file with this coding, the file is read into a\n\
11003 unibyte buffer as is, thus each byte of a file is treated as a\n\
11004 character.");
11005     plist[14] = intern_c_string (":eol-type");
11006     plist[15] = args[coding_arg_eol_type] = Qunix;
11007     args[coding_arg_plist] = Flist (16, plist);
11008     Fdefine_coding_system_internal (coding_arg_max, args);
11009
          /* Reuse both arrays for `undecided', overwriting only the entries
             that differ.  plist[8]/plist[9] change from :default-char to
             :charset-list; args[coding_arg_default_char] is not reassigned
             and so still holds the make_number (0) stored above.  */
11010     plist[1] = args[coding_arg_name] = Qundecided;
11011     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11012     plist[5] = args[coding_arg_coding_type] = Qundecided;
11013     /* This is already set.
11014        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11015     plist[8] = intern_c_string (":charset-list");
11016     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11017     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11018     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11019     plist[15] = args[coding_arg_eol_type] = Qnil;
11020     args[coding_arg_plist] = Flist (16, plist);
11021     Fdefine_coding_system_internal (coding_arg_max, args);
11022   }
11023
        /* Set up safe_terminal_coding from `no-conversion', which was defined
           just above.  */
11024   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11025
        /* Set the value of every symbol stored in Vcoding_category_table to
           `no-conversion'.  */
11026   {
11027     int i;
11028
11029     for (i = 0; i < coding_category_max; i++)
11030       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11031   }
        /* system_eol_type is `dos' when MSDOS or WINDOWSNT is defined and
           `unix' otherwise.  */
11032 #if defined (MSDOS) || defined (WINDOWSNT)
11033   system_eol_type = Qdos;
11034 #else
11035   system_eol_type = Qunix;
11036 #endif
11037   staticpro (&system_eol_type);
11038 }
11039
11040 char *
11041 emacs_strerror (error_number)
11042      int error_number;
11043 {
        /* Return the message that strerror gives for ERROR_NUMBER, after
           calling synchronize_system_messages_locale.  When
           Vlocale_coding_system is non-nil, the message is first converted
           by code_convert_string_norecord using that coding system, and a
           pointer to the converted string's data is returned instead.  */
11044   char *str;
11045
11046   synchronize_system_messages_locale ();
11047   str = strerror (error_number);
11048
11049   if (! NILP (Vlocale_coding_system))
11050     {
            /* The third argument is 0; presumably this selects decoding
               (the result is named DEC) -- confirm against
               code_convert_string_norecord.  */
11051       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11052                                                       Vlocale_coding_system,
11053                                                       0);
            /* NOTE(review): the returned pointer refers to the data of a
               Lisp string that only DEC references -- confirm callers use
               it before that string can go away.  */
11054       str = (char *) SDATA (dec);
11055     }
11056
11057   return str;
11058 }
11059
11060 #endif /* emacs */
11061
11062 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
11063    (do not change this comment) */