src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009, 2010
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  60   the C level, a coding system is represented by a vector of
  61   attributes stored in the hash table Vcoding_system_hash_table.  The
  62   conversion from a coding system symbol to its attributes vector is
  63   done by looking up Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we set up a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
       /* Template only: scan the source of CODING byte by byte.  Return 1
          and merge FOUND into DETECT_INFO->found when the source is
          exhausted without meeting a byte that is invalid for XXX;
          otherwise add CATEGORY_MASK_XXX to DETECT_INFO->rejected and
          return 0.  */
 161   const unsigned char *src = coding->source;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;  /* Other locals, e.g. C and the SRC_BASE used by ONE_MORE_BYTE.  */
 167
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the source is exhausted, jump
 171          to no_more_source:.  */
 172       ONE_MORE_BYTE (c);
 173
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
           /* Record the category only when C is strong evidence for XXX;
              merely conforming bytes leave FOUND untouched.  */
 176       if (__C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source was exhausted successfully.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
 195   CODING->charbuf_size;
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
       /* Template only: decode bytes starting at CODING->source +
          CODING->consumed into CODING->charbuf, then store the consumed
          and produced counts back into CODING.  A real decoder must also
          declare C and CONSUMED_CHARS, which ONE_MORE_BYTE uses.  */
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers the start position in source in each loop.
 212      The loop will be exited when there's not enough source left, or
 213      when there's no room in CHARBUF for a decoded character.  */
 214   const unsigned char *src_base;
 215   /* A buffer to produce decoded characters.  */
 216   int *charbuf = coding->charbuf + coding->charbuf_used;
 217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 218   int multibytep = coding->src_multibyte;
 219
 220   while (1)
 221     {
 222       src_base = src;
 223       if (charbuf >= charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it. */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && coding->mode & CODING_MODE_LAST_BLOCK)
 233     /* If the source ends by partial bytes to construct a character,
 234        treat them as eight-bit raw data.  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  If the
 238      source is multibyte, the bytes and chars are not identical.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
 257   DST_BYTES zero means that source area and destination area are
 258   overlapped, which means that we can produce an encoded text until it
 259   reaches the head of not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
       /* Template only: encode the CODING->charbuf_used characters held in
          CODING->charbuf into CODING->destination, starting after the
          CODING->produced bytes already there, then update the produced
          counts in CODING.  */
 267   int multibytep = coding->dst_multibyte;
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Stop early enough that one more iteration cannot overrun DST_END.  */
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274
 275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 276     {
 277       int c = *charbuf;
 278       /* Encode C into DST, and increment DST.  */
 279     }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292 #include <setjmp.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302 #include "frame.h"
 303 #include "termhooks.h"
 304
 305 Lisp_Object Vcoding_system_hash_table;  /* NOTE(review): presumably maps a coding system symbol to its attributes; confirm.  */
 306
     /* Lisp objects with Q... and QC... names.  No initializers appear in
        this part of the file.  NOTE(review): presumably symbols and
        keywords set up during initialization -- confirm.  */
 307 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 308 Lisp_Object Qunix, Qdos;        /* Values named in the comment on system_eol_type below.  */
 309 extern Lisp_Object Qmac;        /* frame.c */
 310 Lisp_Object Qbuffer_file_coding_system;
 311 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 315 Lisp_Object Qbig, Qlittle;
 316 Lisp_Object Qcoding_system_history;
 317 Lisp_Object Qvalid_codes;
 318 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 319 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 320 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 321 Lisp_Object QCascii_compatible_p;
 322
     /* The two names on the next line are defined in another file
        (declared extern here); the rest of this group is defined here.  */
 323 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 324 Lisp_Object Qcall_process, Qcall_process_region;
 325 Lisp_Object Qstart_process, Qopen_network_stream;
 326 Lisp_Object Qtarget_idx;
 327
     /* NOTE(review): these names parallel the CODING_RESULT_XXX codes
        used by the conversion macros; confirm how they are related.  */
 328 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 329 Lisp_Object Qinterrupted, Qinsufficient_memory;
 330
 331 extern Lisp_Object Qcompletion_ignore_case;
 332
 333 /* If a symbol has this property, evaluate the value to define the
 334    symbol as a coding system.  */
 335 static Lisp_Object Qcoding_system_define_form;
 336
 337 int coding_system_require_warning;  /* NOTE(review): undocumented here; purpose to be confirmed.  */
 338
 339 Lisp_Object Vselect_safe_coding_system_function;
 340
 341 /* Mnemonic string for each format of end-of-line.  */
 342 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 343 /* Mnemonic string to indicate that the format of end-of-line is not
 344    yet decided.  */
 345 Lisp_Object eol_mnemonic_undecided;
 346
 347 /* Format of end-of-line decided by the system.  This is Qunix on
 348    Unix and Mac, Qdos on DOS/Windows.
 349    This has an effect only for external encoding (i.e. for output to
 350    file and process), not for in-buffer or Lisp string encoding.  */
 351 static Lisp_Object system_eol_type;
 352
 353 #ifdef emacs
 354
     /* Everything down to the matching #endif is compiled only when
        `emacs' is defined.  */
 355 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 356
 357 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 358
 359 /* The coding systems emacs-mule and raw-text are for converting only
 360    the end-of-line format.  */
 361 Lisp_Object Qemacs_mule, Qraw_text;
 362 Lisp_Object Qutf_8_emacs;  /* NOTE(review): undocumented here; confirm its role.  */
 363
 364 /* Coding-systems are handed between Emacs Lisp programs and C internal
 365    routines by the following three variables.  */
 366 /* Coding-system for reading files and receiving data from process.  */
 367 Lisp_Object Vcoding_system_for_read;
 368 /* Coding-system for writing files and sending data to process.  */
 369 Lisp_Object Vcoding_system_for_write;
 370 /* Coding-system actually used in the latest I/O.  */
 371 Lisp_Object Vlast_coding_system_used;
 372 /* Set to non-nil when an error is detected during code conversion.  */
 373 Lisp_Object Vlast_code_conversion_error;
 374 /* A vector of length 256 which contains information about special
 375    Latin codes (especially for dealing with Microsoft codes).  */
 376 Lisp_Object Vlatin_extra_code_table;
 377
 378 /* Flag to inhibit code conversion of end-of-line format.  */
 379 int inhibit_eol_conversion;
 380
 381 /* Flag to inhibit ISO2022 escape sequence detection.  */
 382 int inhibit_iso_escape_detection;
 383
 384 /* Flag to inhibit detection of binary files through null bytes.  */
 385 int inhibit_null_byte_detection;
 386
 387 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 388 int inherit_process_coding_system;
 389
 390 /* Coding system to be used to encode text for terminal display when
 391    the terminal coding system is nil.  */
 392 struct coding_system safe_terminal_coding;
 393
     /* NOTE(review): the three alists below are undocumented here; going
        by their names they presumably select coding systems for file,
        process and network I/O respectively -- confirm.  */
 394 Lisp_Object Vfile_coding_system_alist;
 395 Lisp_Object Vprocess_coding_system_alist;
 396 Lisp_Object Vnetwork_coding_system_alist;
 397
 398 Lisp_Object Vlocale_coding_system;
 399
 400 #endif /* emacs */
 401
 402 /* Flag to tell if we look up translation tables on character code
 403    conversion.  */
 404 Lisp_Object Venable_character_translation;
 405 /* Standard translation table to look up on decoding (reading).  */
 406 Lisp_Object Vstandard_translation_table_for_decode;
 407 /* Standard translation table to look up on encoding (writing).  */
 408 Lisp_Object Vstandard_translation_table_for_encode;
 409
     /* Lisp objects related to translation tables.  NOTE(review):
        presumably symbols; their setup is not visible in this part of
        the file.  */
 410 Lisp_Object Qtranslation_table;
 411 Lisp_Object Qtranslation_table_id;
 412 Lisp_Object Qtranslation_table_for_decode;
 413 Lisp_Object Qtranslation_table_for_encode;
 414
 415 /* Alist of charsets vs. revision numbers.  */
 416 static Lisp_Object Vcharset_revision_table;
 417
 418 /* Default coding systems used for process I/O.  */
 419 Lisp_Object Vdefault_process_coding_system;
 420
 421 /* Char table for translating Quail and self-inserting input.  */
 422 Lisp_Object Vtranslation_table_for_input;
 423
 424 /* Two special coding systems.  */
 425 Lisp_Object Vsjis_coding_system;
 426 Lisp_Object Vbig5_coding_system;
 427
 428 /* ISO2022 section */
 429
     /* Integer value of element REG of the `coding_attr_iso_initial'
        attribute of CODING's attribute vector.  */
 430 #define CODING_ISO_INITIAL(coding, reg)                 \
 431   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 432                      coding_attr_iso_initial),          \
 433                reg)))
 434
 435
     /* CODING->safe_charsets[CHARSET_ID], or -1 when CHARSET_ID exceeds
        CODING->max_charset_id or the stored value is 255.  CHARSET_ID is
        evaluated more than once and is not parenthesized inside the
        subscripts, so pass a simple expression.  */
 436 #define CODING_ISO_REQUEST(coding, charset_id)          \
 437   (((charset_id) <= (coding)->max_charset_id            \
 438     ? ((coding)->safe_charsets[charset_id] != 255       \
 439        ? (coding)->safe_charsets[charset_id]            \
 440        : -1)                                            \
 441     : -1))
 442
 443
     /* Accessors for the members of CODING->spec.iso_2022.  All of them
        expand to lvalues except CODING_ISO_CMP_STATUS, which yields the
        address of the `cmp_status' member.  */
 444 #define CODING_ISO_FLAGS(coding)        \
 445   ((coding)->spec.iso_2022.flags)
 446 #define CODING_ISO_DESIGNATION(coding, reg)     \
 447   ((coding)->spec.iso_2022.current_designation[reg])
 448 #define CODING_ISO_INVOCATION(coding, plane)    \
 449   ((coding)->spec.iso_2022.current_invocation[plane])
 450 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 451   ((coding)->spec.iso_2022.single_shifting)
 452 #define CODING_ISO_BOL(coding)  \
 453   ((coding)->spec.iso_2022.bol)
     /* The designation of whichever register is currently invoked to
        PLANE, i.e. current_designation[current_invocation[PLANE]].  */
 454 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 455   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 456 #define CODING_ISO_CMP_STATUS(coding)   \
 457   (&(coding)->spec.iso_2022.cmp_status)
 458 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 459   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 460 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 461   ((coding)->spec.iso_2022.embedded_utf_8)
 462
 463 /* Control characters of ISO2022.  */
 464                         /* code */      /* function */
 465 #define ISO_CODE_LF     0x0A            /* line-feed */
 466 #define ISO_CODE_CR     0x0D            /* carriage-return */
 467 #define ISO_CODE_SO     0x0E            /* shift-out */
 468 #define ISO_CODE_SI     0x0F            /* shift-in */
 469 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 470 #define ISO_CODE_ESC    0x1B            /* escape */
 471 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 472 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 473 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 474
 475 /* Every 1-byte code of ISO2022 is classified into one of the
 476    following classes.  */
 477 enum iso_code_class_type
 478   {
 479     ISO_control_0,              /* Control codes in the range
 480                                    0x00..0x1F and 0x7F, except for the
 481                                    following 4 codes.  */
 482     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 483     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 484     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 485     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 486     ISO_control_1,              /* Control codes in the range
 487                                    0x80..0x9F, except for the
 488                                    following 3 codes.  */
 489     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 490     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 491     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 492     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 493     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 494     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 495     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 496   };
 497
 498 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 499     `iso-flags' attribute of an iso2022 coding system.  */
 500
 501 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 502    instead of the correct short-form sequence (e.g. ESC $ A).  */
 503 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 504
 505 /* If set, reset graphic planes and registers at end-of-line to the
 506    initial state.  */
 507 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 508
 509 /* If set, reset graphic planes and registers before any control
 510    characters to the initial state.  */
 511 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 512
 513 /* If set, encode by 7-bit environment.  */
 514 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 515
 516 /* If set, use locking-shift function.  */
 517 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 518
 519 /* If set, use single-shift function.  Overwrite
 520    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 521 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 522
 523 /* If set, use designation escape sequence.  */
 524 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 525
 526 /* If set, produce revision number sequence.  */
 527 #define CODING_ISO_FLAG_REVISION        0x0080
 528
 529 /* If set, produce ISO6429's direction specifying sequence.  */
 530 #define CODING_ISO_FLAG_DIRECTION       0x0100
 531
 532 /* If set, assume designation states are reset at beginning of line on
 533    output.  */
 534 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 535
 536 /* If set, designation sequence should be placed at beginning of line
 537    on output.  */
 538 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 539
 540 /* If set, do not encode unsafe characters on output.  */
 541 #define CODING_ISO_FLAG_SAFE            0x0800
 542
 543 /* If set, extra latin codes (128..159) are accepted as a valid code
 544    on input.  */
 545 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 546
     /* NOTE(review): the five flags below carry no description in this
        file; document their meaning once confirmed against their users.
        The values jump from 0x10000 to 0x100000, so bits 0x20000 through
        0x80000 are not assigned here.  */
 547 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 548
 549 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 550
 551 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 552
 553 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 554
 555 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 556
 557 /* A character to be produced on output if encoding of the original
 558    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 559 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 560
 561 /* UTF-8 section */
     /* Accessor (lvalue) for CODING->spec.utf_8_bom.  */
 562 #define CODING_UTF_8_BOM(coding)        \
 563   ((coding)->spec.utf_8_bom)
 564
 565 /* UTF-16 section */
     /* Accessors (lvalues) for the `bom', `endian' and `surrogate'
        members of CODING->spec.utf_16.  */
 566 #define CODING_UTF_16_BOM(coding)       \
 567   ((coding)->spec.utf_16.bom)
 568
 569 #define CODING_UTF_16_ENDIAN(coding)    \
 570   ((coding)->spec.utf_16.endian)
 571
 572 #define CODING_UTF_16_SURROGATE(coding) \
 573   ((coding)->spec.utf_16.surrogate)
 574
 575
 576 /* CCL section */
     /* The `coding_attr_ccl_decoder' and `coding_attr_ccl_encoder'
        elements of CODING's attribute vector.  */
 577 #define CODING_CCL_DECODER(coding)      \
 578   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 579 #define CODING_CCL_ENCODER(coding)      \
 580   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
     /* SDATA applied to the `coding_attr_ccl_valids' element of CODING's
        attribute vector.  */
 581 #define CODING_CCL_VALIDS(coding)                                          \
 582   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 583
 584 /* Index for each coding category in `coding_categories'.  Each value
         is also the bit position of the matching CATEGORY_MASK_XXX macro,
         and coding_category_max sizes the per-category arrays.  */
 585
 586 enum coding_category
 587   {
 588     coding_category_iso_7,
 589     coding_category_iso_7_tight,
 590     coding_category_iso_8_1,
 591     coding_category_iso_8_2,
 592     coding_category_iso_7_else,
 593     coding_category_iso_8_else,
 594     coding_category_utf_8_auto,
 595     coding_category_utf_8_nosig,
 596     coding_category_utf_8_sig,
 597     coding_category_utf_16_auto,
 598     coding_category_utf_16_be,
 599     coding_category_utf_16_le,
 600     coding_category_utf_16_be_nosig,
 601     coding_category_utf_16_le_nosig,
 602     coding_category_charset,
 603     coding_category_sjis,
 604     coding_category_big5,
 605     coding_category_ccl,
 606     coding_category_emacs_mule,
 607     /* All above are targets of code detection.  */
 608     coding_category_raw_text,
 609     coding_category_undecided,
 610     coding_category_max         /* Number of categories; not a category itself.  */
 611   };
 612
 613 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
         1 shifted left by the corresponding `enum coding_category' value.  */
 614 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 615 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 616 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 617 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 618 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 619 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 620 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 621 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 622 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 623 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 624 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 625 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 626 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 627 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 628 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 629 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 630 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 631 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 632 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 633 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 634
 635 /* This value is returned if detect_coding_mask () finds nothing other
 636    than ASCII characters.  It is the union of every mask defined above
         except CATEGORY_MASK_RAW_TEXT.  */
 637 #define CATEGORY_MASK_ANY               \
 638   (CATEGORY_MASK_ISO_7                  \
 639    | CATEGORY_MASK_ISO_7_TIGHT          \
 640    | CATEGORY_MASK_ISO_8_1              \
 641    | CATEGORY_MASK_ISO_8_2              \
 642    | CATEGORY_MASK_ISO_7_ELSE           \
 643    | CATEGORY_MASK_ISO_8_ELSE           \
 644    | CATEGORY_MASK_UTF_8_AUTO           \
 645    | CATEGORY_MASK_UTF_8_NOSIG          \
 646    | CATEGORY_MASK_UTF_8_SIG            \
 647    | CATEGORY_MASK_UTF_16_AUTO          \
 648    | CATEGORY_MASK_UTF_16_BE            \
 649    | CATEGORY_MASK_UTF_16_LE            \
 650    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 651    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 652    | CATEGORY_MASK_CHARSET              \
 653    | CATEGORY_MASK_SJIS                 \
 654    | CATEGORY_MASK_BIG5                 \
 655    | CATEGORY_MASK_CCL                  \
 656    | CATEGORY_MASK_EMACS_MULE)
 657
 658
     /* Unions of the ISO masks: the two 7-bit categories, the two 8-bit
        categories, and the two `else' categories.  */
 659 #define CATEGORY_MASK_ISO_7BIT \
 660   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 661
 662 #define CATEGORY_MASK_ISO_8BIT \
 663   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 664
 665 #define CATEGORY_MASK_ISO_ELSE \
 666   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 667
     /* The ISO categories other than ISO_8_1 and ISO_8_2.  NOTE(review):
        presumably those that may use escape sequences -- confirm.  */
 668 #define CATEGORY_MASK_ISO_ESCAPE        \
 669   (CATEGORY_MASK_ISO_7                  \
 670    | CATEGORY_MASK_ISO_7_TIGHT          \
 671    | CATEGORY_MASK_ISO_7_ELSE           \
 672    | CATEGORY_MASK_ISO_8_ELSE)
 673
     /* All six ISO categories.  */
 674 #define CATEGORY_MASK_ISO       \
 675   (  CATEGORY_MASK_ISO_7BIT     \
 676      | CATEGORY_MASK_ISO_8BIT   \
 677      | CATEGORY_MASK_ISO_ELSE)
 678
     /* All five UTF-16 categories.  */
 679 #define CATEGORY_MASK_UTF_16            \
 680   (CATEGORY_MASK_UTF_16_AUTO            \
 681    | CATEGORY_MASK_UTF_16_BE            \
 682    | CATEGORY_MASK_UTF_16_LE            \
 683    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 684    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 685
     /* All three UTF-8 categories.  */
 686 #define CATEGORY_MASK_UTF_8     \
 687   (CATEGORY_MASK_UTF_8_AUTO     \
 688    | CATEGORY_MASK_UTF_8_NOSIG  \
 689    | CATEGORY_MASK_UTF_8_SIG)
 690
 691 /* List of symbols `coding-category-xxx' ordered by priority.  This
 692    variable is exposed to Emacs Lisp.  */
 693 static Lisp_Object Vcoding_category_list;
 694
 695 /* Table of coding categories (Lisp symbols).  This variable is for
 696    internal use only.  */
 697 static Lisp_Object Vcoding_category_table;
 698
 699 /* Table of coding-categories ordered by priority.  It has
         coding_category_max elements.  */
 700 static enum coding_category coding_priorities[coding_category_max];
 701
 702 /* Nth element is a coding context for the coding system bound to the
 703    Nth coding category.  */
 704 static struct coding_system coding_categories[coding_category_max];
 705
 706 /*** Commonly used macros and functions ***/
 707
     /* Fallback definitions of min and max, used only when no macro of
        that name is already defined.  Each argument may be evaluated
        twice, so do not pass expressions with side effects.  */
 708 #ifndef min
 709 #define min(a, b) ((a) < (b) ? (a) : (b))
 710 #endif
 711 #ifndef max
 712 #define max(a, b) ((a) > (b) ? (a) : (b))
 713 #endif
 714
     /* Set ATTRS to the attribute vector of CODING (looked up by
        CODING->id) and CHARSET_LIST to the charset list taken from ATTRS.
        ATTRS and CHARSET_LIST must be assignable; ATTRS is evaluated
        twice.  */
 715 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 716   do {                                                  \
 717     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 718     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 719   } while (0)
 720
 721
 722 /* Safely get one byte from the source text pointed by SRC which ends
 723    at SRC_END, and set C to that byte.  If there are not enough bytes
 724    in the source, it jumps to `no_more_source'; before jumping, if SRC
 725    has advanced past SRC_BASE, it records
 726    CODING_RESULT_INSUFFICIENT_SRC in CODING.  If multibytep is
 727    nonzero and the byte has its high bit set, one of two things
 728         happens:
          - if the byte is 0xC0 or 0xC1, one more byte is read (without a
            check against SRC_END) and C becomes ((C & 1) << 6) | that
            byte;
          - otherwise SRC is moved back to the byte, C is set to the
            negative value of the character code returned by string_char
            (which is handed &SRC, presumably to advance it past the
            character), and CODING_RESULT_INVALID_SRC is recorded.
        Whenever a byte or character is obtained, CONSUMED_CHARS is
        incremented by one.  The caller should declare and set these
        variables appropriately in advance:
             c, src, src_base, src_end, multibytep, consumed_chars, coding
        and must provide the label `no_more_source'.  */
 729
 730 #define ONE_MORE_BYTE(c)                                \
 731   do {                                                  \
 732     if (src == src_end)                                 \
 733       {                                                 \
 734         if (src_base < src)                             \
 735           record_conversion_result                      \
 736             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 737         goto no_more_source;                            \
 738       }                                                 \
 739     c = *src++;                                         \
 740     if (multibytep && (c & 0x80))                       \
 741       {                                                 \
 742         if ((c & 0xFE) == 0xC0)                         \
 743           c = ((c & 1) << 6) | *src++;                  \
 744         else                                            \
 745           {                                             \
 746             src--;                                      \
 747             c = - string_char (src, &src, NULL);        \
 748             record_conversion_result                    \
 749               (coding, CODING_RESULT_INVALID_SRC);      \
 750           }                                             \
 751       }                                                 \
 752     consumed_chars++;                                   \
 753   } while (0)
 754
 755 /* Safely get two bytes from the source text pointed by SRC which ends
 756    at SRC_END, and set C1 and C2 to those bytes while skipping the
 757    heading multibyte characters.  If there are not enough bytes in the
 758    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 759    a multibyte character is found for C2, set C2 to -1; SRC is then
 760    left just after that character's first byte.  A leading byte 0xC0
 761    or 0xC1 is combined with the byte after it, for C1 and C2 alike,
 762         as ((C & 1) << 6) | next byte.  Unlike ONE_MORE_BYTE, this macro
        neither records a conversion result nor counts consumed
        characters.  The caller should declare and set these variables
        appropriately in advance:
             src, src_end, multibytep
        and must provide the label `no_more_source'.
 763    It is intended that this macro is used in detect_coding_utf_16.  */
 764
 765 #define TWO_MORE_BYTES(c1, c2)                          \
 766   do {                                                  \
 767     do {                                                \
 768       if (src == src_end)                               \
 769         goto no_more_source;                            \
 770       c1 = *src++;                                      \
 771       if (multibytep && (c1 & 0x80))                    \
 772         {                                               \
 773           if ((c1 & 0xFE) == 0xC0)                      \
 774             c1 = ((c1 & 1) << 6) | *src++;              \
 775           else                                          \
 776             {                                           \
 777               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 778               c1 = -1;                                  \
 779             }                                           \
 780         }                                               \
 781     } while (c1 < 0);                                   \
 782     if (src == src_end)                                 \
 783       goto no_more_source;                              \
 784     c2 = *src++;                                        \
 785     if (multibytep && (c2 & 0x80))                      \
 786       {                                                 \
 787         if ((c2 & 0xFE) == 0xC0)                        \
 788           c2 = ((c2 & 1) << 6) | *src++;                \
 789         else                                            \
 790           c2 = -1;                                      \
 791       }                                                 \
 792   } while (0)
 793
 794
 795 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 796   do {                                                  \
 797     c = *src++;                                         \
 798     if (multibytep && (c & 0x80))                       \
 799       {                                                 \
 800         if ((c & 0xFE) == 0xC0)                         \
 801           c = ((c & 1) << 6) | *src++;                  \
 802         else                                            \
 803           {                                             \
 804             src--;                                      \
 805             c = - string_char (src, &src, NULL);        \
 806             record_conversion_result                    \
 807               (coding, CODING_RESULT_INVALID_SRC);      \
 808           }                                             \
 809       }                                                 \
 810     consumed_chars++;                                   \
 811   } while (0)
 812
 813
 814 /* Store a byte C in the place pointed by DST and increment DST to the
 815    next free point, and increment PRODUCED_CHARS.  The caller should
 816    assure that C is 0..127, and declare and set the variable `dst'
 817    appropriately in advance.
 818 */
 819
 820
 821 #define EMIT_ONE_ASCII_BYTE(c)  \
 822   do {                          \
 823     produced_chars++;           \
 824     *dst++ = (c);               \
 825   } while (0)
 826
 827
 828 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 829
 830 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 831   do {                                  \
 832     produced_chars += 2;                \
 833     *dst++ = (c1), *dst++ = (c2);       \
 834   } while (0)
 835
 836
 837 /* Store a byte C in the place pointed by DST and increment DST to the
 838    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 839    nonzero, store in an appropriate multibyte from.  The caller should
 840    declare and set the variables `dst' and `multibytep' appropriately
 841    in advance.  */
 842
 843 #define EMIT_ONE_BYTE(c)                \
 844   do {                                  \
 845     produced_chars++;                   \
 846     if (multibytep)                     \
 847       {                                 \
 848         int ch = (c);                   \
 849         if (ch >= 0x80)                 \
 850           ch = BYTE8_TO_CHAR (ch);      \
 851         CHAR_STRING_ADVANCE (ch, dst);  \
 852       }                                 \
 853     else                                \
 854       *dst++ = (c);                     \
 855   } while (0)
 856
 857
 858 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 859
 860 #define EMIT_TWO_BYTES(c1, c2)          \
 861   do {                                  \
 862     produced_chars += 2;                \
 863     if (multibytep)                     \
 864       {                                 \
 865         int ch;                         \
 866                                         \
 867         ch = (c1);                      \
 868         if (ch >= 0x80)                 \
 869           ch = BYTE8_TO_CHAR (ch);      \
 870         CHAR_STRING_ADVANCE (ch, dst);  \
 871         ch = (c2);                      \
 872         if (ch >= 0x80)                 \
 873           ch = BYTE8_TO_CHAR (ch);      \
 874         CHAR_STRING_ADVANCE (ch, dst);  \
 875       }                                 \
 876     else                                \
 877       {                                 \
 878         *dst++ = (c1);                  \
 879         *dst++ = (c2);                  \
 880       }                                 \
 881   } while (0)
 882
 883
 884 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 885   do {                                  \
 886     EMIT_ONE_BYTE (c1);                 \
 887     EMIT_TWO_BYTES (c2, c3);            \
 888   } while (0)
 889
 890
 891 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 892   do {                                          \
 893     EMIT_TWO_BYTES (c1, c2);                    \
 894     EMIT_TWO_BYTES (c3, c4);                    \
 895   } while (0)
 896
 897
/* Prototypes for static functions.  They are wrapped in P_ (( ))
   like the rest of the declarations in this file.  */

/* Recording of the result of a conversion.  */
static void record_conversion_result P_ ((struct coding_system *coding,
					  enum coding_result_code result));

/* UTF-8.  */
static int detect_coding_utf_8 P_ ((struct coding_system *,
				    struct coding_detection_info *info));
static void decode_coding_utf_8 P_ ((struct coding_system *));
static int encode_coding_utf_8 P_ ((struct coding_system *));

/* UTF-16.  */
static int detect_coding_utf_16 P_ ((struct coding_system *,
				     struct coding_detection_info *info));
static void decode_coding_utf_16 P_ ((struct coding_system *));
static int encode_coding_utf_16 P_ ((struct coding_system *));

/* ISO 2022.  */
static int detect_coding_iso_2022 P_ ((struct coding_system *,
				       struct coding_detection_info *info));
static void decode_coding_iso_2022 P_ ((struct coding_system *));
static int encode_coding_iso_2022 P_ ((struct coding_system *));

/* emacs-mule.  */
static int detect_coding_emacs_mule P_ ((struct coding_system *,
					 struct coding_detection_info *info));
static void decode_coding_emacs_mule P_ ((struct coding_system *));
static int encode_coding_emacs_mule P_ ((struct coding_system *));

/* Shift-JIS.  */
static int detect_coding_sjis P_ ((struct coding_system *,
				   struct coding_detection_info *info));
static void decode_coding_sjis P_ ((struct coding_system *));
static int encode_coding_sjis P_ ((struct coding_system *));

/* BIG5.  */
static int detect_coding_big5 P_ ((struct coding_system *,
				   struct coding_detection_info *info));
static void decode_coding_big5 P_ ((struct coding_system *));
static int encode_coding_big5 P_ ((struct coding_system *));

/* CCL.  */
static int detect_coding_ccl P_ ((struct coding_system *,
				  struct coding_detection_info *info));
static void decode_coding_ccl P_ ((struct coding_system *));
static int encode_coding_ccl P_ ((struct coding_system *));

/* Raw text.  */
static void decode_coding_raw_text P_ ((struct coding_system *));
static int encode_coding_raw_text P_ ((struct coding_system *));

/* Source/destination bookkeeping and the remaining helpers.  */
static void coding_set_source P_ ((struct coding_system *));
static void coding_set_destination P_ ((struct coding_system *));
static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
static void coding_alloc_by_making_gap P_ ((struct coding_system *,
					    EMACS_INT, EMACS_INT));
static unsigned char *alloc_destination P_ ((struct coding_system *,
					     EMACS_INT, unsigned char *));
static void setup_iso_safe_charsets P_ ((Lisp_Object));
static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
						     int *, int *,
						     unsigned char *));
static int detect_eol P_ ((const unsigned char *,
			   EMACS_INT, enum coding_category));
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
					EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
						      struct coding_system *,
						      int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
						  struct coding_system *,
						  int *, EMACS_INT *));
static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
static int encode_coding P_ ((struct coding_system *));
static Lisp_Object make_conversion_work_buffer P_ ((int));
static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
static INLINE int char_encodable_p P_ ((int, Lisp_Object));
static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 973
 974 static void
 975 record_conversion_result (struct coding_system *coding,
 976                           enum coding_result_code result)
 977 {
 978   coding->result = result;
 979   switch (result)
 980     {
 981     case CODING_RESULT_INSUFFICIENT_SRC:
 982       Vlast_code_conversion_error = Qinsufficient_source;
 983       break;
 984     case CODING_RESULT_INCONSISTENT_EOL:
 985       Vlast_code_conversion_error = Qinconsistent_eol;
 986       break;
 987     case CODING_RESULT_INVALID_SRC:
 988       Vlast_code_conversion_error = Qinvalid_source;
 989       break;
 990     case CODING_RESULT_INTERRUPT:
 991       Vlast_code_conversion_error = Qinterrupted;
 992       break;
 993     case CODING_RESULT_INSUFFICIENT_MEM:
 994       Vlast_code_conversion_error = Qinsufficient_memory;
 995       break;
 996     case CODING_RESULT_INSUFFICIENT_DST:
 997       /* Don't record this error in Vlast_code_conversion_error
 998          because it happens just temporarily and is resolved when the
 999          whole conversion is finished.  */
1000       break;
1001     case CODING_RESULT_SUCCESS:
1002       break;
1003     default:
1004       Vlast_code_conversion_error = intern ("Unknown error");
1005     }
1006 }
1007
/* Decode CODE of CHARSET into C with DECODE_CHAR while keeping
   pointers into the source text valid.  DECODE_CHAR may load a
   charset map; loading one allocates large structures and can
   therefore relocate buffer text.  When charset_map_loaded reports
   that this happened, CODING->source is refreshed with
   coding_set_source and SRC, SRC_BASE and SRC_END are shifted by
   the same amount.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {									     \
    charset_map_loaded = 0;						     \
    c = DECODE_CHAR (charset, code);					     \
    if (! charset_map_loaded)						     \
      break;								     \
    {									     \
      const unsigned char *orig = coding->source;			     \
      EMACS_INT offset;							     \
									     \
      coding_set_source (coding);					     \
      offset = coding->source - orig;					     \
      src += offset;							     \
      src_base += offset;						     \
      src_end += offset;						     \
    }									     \
  } while (0)
1028
1029
/* Unless more than BYTES bytes of room remain between dst and
   dst_end, enlarge coding->destination with alloc_destination and
   update dst and dst_end.  The amount requested is BYTES plus the
   number of elements still pending in charbuf.  We don't have to
   take care of coding->source which will be relocated.  It is
   handled by calling coding_set_source in encode_coding.  */

#define ASSURE_DESTINATION(bytes)				\
  do {								\
    if (dst_end <= dst + (bytes))				\
      {								\
	int more_bytes = charbuf_end - charbuf + (bytes);	\
								\
	dst = alloc_destination (coding, more_bytes, dst);	\
	dst_end = coding->destination + coding->dst_bytes;	\
      }								\
  } while (0)
1045
1046
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   C is evaluated several times, so it must not have side effects.
   Every use of C is parenthesized so that an expression argument
   (e.g. `hi | lo') is encoded as a whole in all branches.  Characters
   above MAX_5_BYTE_CHAR are stored with BYTE8_STRING after
   subtracting 0x3FFF80.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)		\
  do {							\
    if ((c) <= MAX_1_BYTE_CHAR)				\
      *(p)++ = (c);					\
    else if ((c) <= MAX_2_BYTE_CHAR)			\
      *(p)++ = (0xC0 | ((c) >> 6)),			\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else if ((c) <= MAX_3_BYTE_CHAR)			\
      *(p)++ = (0xE0 | ((c) >> 12)),			\
	*(p)++ = (0x80 | (((c) >> 6) & 0x3F)),		\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else if ((c) <= MAX_4_BYTE_CHAR)			\
      *(p)++ = (0xF0 | ((c) >> 18)),			\
	*(p)++ = (0x80 | (((c) >> 12) & 0x3F)),		\
	*(p)++ = (0x80 | (((c) >> 6) & 0x3F)),		\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else if ((c) <= MAX_5_BYTE_CHAR)			\
      *(p)++ = 0xF8,					\
	*(p)++ = (0x80 | (((c) >> 18) & 0x0F)),		\
	*(p)++ = (0x80 | (((c) >> 12) & 0x3F)),		\
	*(p)++ = (0x80 | (((c) >> 6) & 0x3F)),		\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else						\
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);		\
  } while (0)
1076
1077
/* Return the character code of the character whose multibyte form
   starts at P, and advance P past that form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length (1 to 5 bytes) is determined from the leading byte.  A
   two-byte form whose leading byte is below 0xC2 yields a value with
   0x3FFF80 or'ed in.  P is evaluated several times.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)				\
  (((p)[0] & 0x80) == 0						\
   ? *(p)++							\
   : ((p)[0] & 0x20) == 0					\
   ? ((p) += 2,							\
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)	\
       | (((p)[-2] & 0x1F) << 6)				\
       | ((p)[-1] & 0x3F)))					\
   : ((p)[0] & 0x10) == 0					\
   ? ((p) += 3,							\
      (((p)[-1] & 0x3F)						\
       | (((p)[-2] & 0x3F) << 6)				\
       | (((p)[-3] & 0x0F) << 12)))				\
   : ((p)[0] & 0x08) == 0					\
   ? ((p) += 4,							\
      (((p)[-1] & 0x3F)						\
       | (((p)[-2] & 0x3F) << 6)				\
       | (((p)[-3] & 0x3F) << 12)				\
       | (((p)[-4] & 0xF) << 18)))				\
   : ((p) += 5,							\
      (((p)[-1] & 0x3F)						\
       | (((p)[-2] & 0x3F) << 6)				\
       | (((p)[-3] & 0x3F) << 12)				\
       | (((p)[-4] & 0x3F) << 18))))
1106
1107
1108 static void
1109 coding_set_source (coding)
1110      struct coding_system *coding;
1111 {
1112   if (BUFFERP (coding->src_object))
1113     {
1114       struct buffer *buf = XBUFFER (coding->src_object);
1115
1116       if (coding->src_pos < 0)
1117         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1118       else
1119         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1120     }
1121   else if (STRINGP (coding->src_object))
1122     {
1123       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1124     }
1125   else
1126     /* Otherwise, the source is C string and is never relocated
1127        automatically.  Thus we don't have to update anything.  */
1128     ;
1129 }
1130
1131 static void
1132 coding_set_destination (coding)
1133      struct coding_system *coding;
1134 {
1135   if (BUFFERP (coding->dst_object))
1136     {
1137       if (coding->src_pos < 0)
1138         {
1139           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1140           coding->dst_bytes = (GAP_END_ADDR
1141                                - (coding->src_bytes - coding->consumed)
1142                                - coding->destination);
1143         }
1144       else
1145         {
1146           /* We are sure that coding->dst_pos_byte is before the gap
1147              of the buffer. */
1148           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1149                                  + coding->dst_pos_byte - BEG_BYTE);
1150           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1151                                - coding->destination);
1152         }
1153     }
1154   else
1155     /* Otherwise, the destination is C string and is never relocated
1156        automatically.  Thus we don't have to update anything.  */
1157     ;
1158 }
1159
1160
1161 static void
1162 coding_alloc_by_realloc (coding, bytes)
1163      struct coding_system *coding;
1164      EMACS_INT bytes;
1165 {
1166   coding->destination = (unsigned char *) xrealloc (coding->destination,
1167                                                     coding->dst_bytes + bytes);
1168   coding->dst_bytes += bytes;
1169 }
1170
/* Enlarge the gap of the destination buffer of CODING by BYTES
   bytes using make_gap.  GAP_HEAD_USED is the number of bytes at the
   head of the gap that already hold produced data; it is only used
   when the source and the destination are the same object.  */

static void
coding_alloc_by_making_gap (coding, gap_head_used, bytes)
     struct coding_system *coding;
     EMACS_INT gap_head_used, bytes;
{
  if (EQ (coding->src_object, coding->dst_object))
    {
      /* The gap may contain the produced data at the head and not-yet
	 consumed data at the tail.  To preserve those data, we at
	 first make the gap size to zero, then increase the gap
	 size.  */
      EMACS_INT add = GAP_SIZE;

      /* Move the gap position past the produced data, and count the
	 old gap as part of the text while make_gap runs.  */
      GPT += gap_head_used, GPT_BYTE += gap_head_used;
      GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
      make_gap (bytes);
      /* Undo the temporary adjustments in the reverse order.  */
      GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
      GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
    }
  else
    {
      Lisp_Object this_buffer;

      /* make_gap is called with the destination buffer made
	 current; the previously current buffer is restored
	 afterwards.  */
      this_buffer = Fcurrent_buffer ();
      set_buffer_internal (XBUFFER (coding->dst_object));
      make_gap (bytes);
      set_buffer_internal (XBUFFER (this_buffer));
    }
}
1200
1201
1202 static unsigned char *
1203 alloc_destination (coding, nbytes, dst)
1204      struct coding_system *coding;
1205      EMACS_INT nbytes;
1206      unsigned char *dst;
1207 {
1208   EMACS_INT offset = dst - coding->destination;
1209
1210   if (BUFFERP (coding->dst_object))
1211     {
1212       struct buffer *buf = XBUFFER (coding->dst_object);
1213
1214       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1215     }
1216   else
1217     coding_alloc_by_realloc (coding, nbytes);
1218   coding_set_destination (coding);
1219   dst = coding->destination + offset;
1220   return dst;
1221 }
1222
1223 /** Macros for annotations.  */
1224
/* An annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]

   NBYTES is the number of bytes specified in the header part of
   old-style emacs-mule encoding, or 0 for the other kind of
   composition.

   METHOD is one of enum composition_method.

   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.

   If ANNOTATION_MASK is 0, this annotation is just a space holder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1253
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Append the three-element annotation header [ -LEN MASK NCHARS ]
   at BUF, advance BUF past it, and flag CODING as annotated.  The
   caller must have `coding' in scope.

   The expansion deliberately does not end in a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an `if' that has an `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)	\
  do {							\
    *(buf)++ = -(len);					\
    *(buf)++ = (mask);					\
    *(buf)++ = (nchars);				\
    coding->annotated = 1;				\
  } while (0)
1264
/* Append a composition annotation at BUF and advance BUF past it:
   the common header with length 5 and
   CODING_ANNOTATE_COMPOSITION_MASK, followed by NBYTES and METHOD.
   All arguments are parenthesized so that expression arguments
   (including a non-trivial lvalue for BUF) expand correctly.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)		    \
  do {									    \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);						    \
    *(buf)++ = (method);						    \
  } while (0)
1271
1272
/* Append a charset annotation at BUF and advance BUF past it: the
   common header with length 4 and CODING_ANNOTATE_CHARSET_MASK,
   followed by the charset ID.  All arguments are parenthesized so
   that expression arguments (including a non-trivial lvalue for BUF)
   expand correctly.  */
#define ADD_CHARSET_DATA(buf, nchars, id)				\
  do {									\
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars);	\
    *(buf)++ = (id);							\
  } while (0)
1278
1279 \f
1280 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1281
1282
1283
1284 \f
1285 /*** 3. UTF-8 ***/
1286
1287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1288    Check if a text is encoded in UTF-8.  If it is, return 1, else
1289    return 0.  */
1290
/* Classification of a UTF-8 octet C by its high bits.  */

/* Nonzero if C is less than 0x80 (a single-octet character).  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* Nonzero if C has the form 10xxxxxx (a trailing octet).  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* Nonzero if C has the form 110xxxxx.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* Nonzero if C has the form 1110xxxx.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* Nonzero if C has the form 11110xxx.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* Nonzero if C has the form 111110xx.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* Character code of the byte order mark, and the three octets that
   are compared against the start of the text to recognize it in
   UTF-8.  */
#define UTF_BOM 0xFEFF
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1302
/* Check whether the source text of CODING can be UTF-8 and record
   the outcome in DETECT_INFO.

   Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   if a malformed sequence is seen or if the last block ends in the
   middle of a sequence.  Otherwise return 1; in that case the SIG
   and NOSIG categories are marked found or rejected depending on
   whether the text starts with the three BOM octets and whether any
   multi-octet sequence was seen.  */

static int
detect_coding_utf_8 (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  /* Updated by ONE_MORE_BYTE.  */
  int consumed_chars = 0;
  int bom_found = 0;
  /* Set once at least one well-formed multi-octet sequence is seen.  */
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* SRC_BASE marks the start of the sequence being examined.
	 ONE_MORE_BYTE jumps to `no_more_source' when the source is
	 exhausted.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
	continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
	break;
      if (UTF_8_2_OCTET_LEADING_P (c))
	{
	  found = 1;
	  continue;
	}
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
	break;
      if (UTF_8_3_OCTET_LEADING_P (c))
	{
	  found = 1;
	  /* Only a sequence at the very start of the source can be
	     a BOM.  */
	  if (src_base == coding->source
	      && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
	    bom_found = 1;
	  continue;
	}
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
	break;
      if (UTF_8_4_OCTET_LEADING_P (c))
	{
	  found = 1;
	  continue;
	}
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
	break;
      if (UTF_8_5_OCTET_LEADING_P (c))
	{
	  found = 1;
	  continue;
	}
      break;
    }
  /* Reached only through `break': a malformed sequence was seen.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* A sequence cut short at the end of the last block is invalid.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF (UTF_BOM) doesn't necessarily mean a BOM,
	 so the no-signature category stays a candidate as well.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
	detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1386
1387
1388 static void
1389 decode_coding_utf_8 (coding)
1390      struct coding_system *coding;
1391 {
1392   const unsigned char *src = coding->source + coding->consumed;
1393   const unsigned char *src_end = coding->source + coding->src_bytes;
1394   const unsigned char *src_base;
1395   int *charbuf = coding->charbuf + coding->charbuf_used;
1396   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1397   int consumed_chars = 0, consumed_chars_base = 0;
1398   int multibytep = coding->src_multibyte;
1399   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1400   Lisp_Object attr, charset_list;
1401   int eol_crlf =
1402     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1403   int byte_after_cr = -1;
1404
1405   CODING_GET_INFO (coding, attr, charset_list);
1406
1407   if (bom != utf_without_bom)
1408     {
1409       int c1, c2, c3;
1410
1411       src_base = src;
1412       ONE_MORE_BYTE (c1);
1413       if (! UTF_8_3_OCTET_LEADING_P (c1))
1414         src = src_base;
1415       else
1416         {
1417           ONE_MORE_BYTE (c2);
1418           if (! UTF_8_EXTRA_OCTET_P (c2))
1419             src = src_base;
1420           else
1421             {
1422               ONE_MORE_BYTE (c3);
1423               if (! UTF_8_EXTRA_OCTET_P (c3))
1424                 src = src_base;
1425               else
1426                 {
1427                   if ((c1 != UTF_8_BOM_1)
1428                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1429                     src = src_base;
1430                   else
1431                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1432                 }
1433             }
1434         }
1435     }
1436   CODING_UTF_8_BOM (coding) = utf_without_bom;
1437
1438
1439
1440   while (1)
1441     {
1442       int c, c1, c2, c3, c4, c5;
1443
1444       src_base = src;
1445       consumed_chars_base = consumed_chars;
1446
1447       if (charbuf >= charbuf_end)
1448         {
1449           if (byte_after_cr >= 0)
1450             src_base--;
1451           break;
1452         }
1453
1454       if (byte_after_cr >= 0)
1455         c1 = byte_after_cr, byte_after_cr = -1;
1456       else
1457         ONE_MORE_BYTE (c1);
1458       if (c1 < 0)
1459         {
1460           c = - c1;
1461         }
1462       else if (UTF_8_1_OCTET_P(c1))
1463         {
1464           if (eol_crlf && c1 == '\r')
1465             ONE_MORE_BYTE (byte_after_cr);
1466           c = c1;
1467         }
1468       else
1469         {
1470           ONE_MORE_BYTE (c2);
1471           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1472             goto invalid_code;
1473           if (UTF_8_2_OCTET_LEADING_P (c1))
1474             {
1475               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1476               /* Reject overlong sequences here and below.  Encoders
1477                  producing them are incorrect, they can be misleading,
1478                  and they mess up read/write invariance.  */
1479               if (c < 128)
1480                 goto invalid_code;
1481             }
1482           else
1483             {
1484               ONE_MORE_BYTE (c3);
1485               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1486                 goto invalid_code;
1487               if (UTF_8_3_OCTET_LEADING_P (c1))
1488                 {
1489                   c = (((c1 & 0xF) << 12)
1490                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1491                   if (c < 0x800
1492                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1493                     goto invalid_code;
1494                 }
1495               else
1496                 {
1497                   ONE_MORE_BYTE (c4);
1498                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1499                     goto invalid_code;
1500                   if (UTF_8_4_OCTET_LEADING_P (c1))
1501                     {
1502                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1503                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1504                     if (c < 0x10000)
1505                       goto invalid_code;
1506                     }
1507                   else
1508                     {
1509                       ONE_MORE_BYTE (c5);
1510                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1511                         goto invalid_code;
1512                       if (UTF_8_5_OCTET_LEADING_P (c1))
1513                         {
1514                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1515                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1516                                | (c5 & 0x3F));
1517                           if ((c > MAX_CHAR) || (c < 0x200000))
1518                             goto invalid_code;
1519                         }
1520                       else
1521                         goto invalid_code;
1522                     }
1523                 }
1524             }
1525         }
1526
1527       *charbuf++ = c;
1528       continue;
1529
1530     invalid_code:
1531       src = src_base;
1532       consumed_chars = consumed_chars_base;
1533       ONE_MORE_BYTE (c);
1534       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1535       coding->errors++;
1536     }
1537
1538  no_more_source:
1539   coding->consumed_char += consumed_chars_base;
1540   coding->consumed = src_base - coding->source;
1541   coding->charbuf_used = charbuf - coding->charbuf;
1542 }
1543
1544
/* Encode the code points held in CODING->charbuf (CODING->charbuf_used
   entries) and append the bytes to CODING->destination, starting at
   offset CODING->produced.

   When the coding system is marked utf_with_bom, the three BOM bytes
   are emitted first and the mark is reset to utf_without_bom, so the
   BOM is written only once per CODING.

   A character for which CHAR_BYTE8_P holds is written as the single
   byte returned by CHAR_TO_BYTE8; every other character is written
   with CHAR_STRING_ADVANCE_NO_UNIFY.

   CODING->produced and CODING->produced_char are updated, the result
   is recorded as CODING_RESULT_SUCCESS, and 0 is returned.  */
1545 static int
1546 encode_coding_utf_8 (coding)
1547      struct coding_system *coding;
1548 {
1549   int multibytep = coding->dst_multibyte;
1550   int *charbuf = coding->charbuf;
1551   int *charbuf_end = charbuf + coding->charbuf_used;
1552   unsigned char *dst = coding->destination + coding->produced;
1553   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1554   int produced_chars = 0;
1555   int c;
1556
1557   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1558     {
1559       ASSURE_DESTINATION (3);
1560       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1561       CODING_UTF_8_BOM (coding) = utf_without_bom;
1562     }
1563
1564   if (multibytep)
1565     {
       /* Multibyte destination: build each sequence in STR and hand
          it over one byte at a time through EMIT_ONE_BYTE.  */
1566       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1567
1568       while (charbuf < charbuf_end)
1569         {
1570           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1571
1572           ASSURE_DESTINATION (safe_room);
1573           c = *charbuf++;
1574           if (CHAR_BYTE8_P (c))
1575             {
1576               c = CHAR_TO_BYTE8 (c);
1577               EMIT_ONE_BYTE (c);
1578             }
1579           else
1580             {
1581               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1582               for (p = str; p < pend; p++)
1583                 EMIT_ONE_BYTE (*p);
1584             }
1585         }
1586     }
1587   else
1588     {
       /* Unibyte destination: store the bytes directly through DST.  */
1589       int safe_room = MAX_MULTIBYTE_LENGTH;
1590
1591       while (charbuf < charbuf_end)
1592         {
1593           ASSURE_DESTINATION (safe_room);
1594           c = *charbuf++;
1595           if (CHAR_BYTE8_P (c))
1596             *dst++ = CHAR_TO_BYTE8 (c);
1597           else
1598             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1599         }
       /* Count the bytes written since entry (BOM included) rather
          than the number of source characters: one source character
          may store several bytes here, and the multibyte branch above
          also accounts per emitted byte.  The current
          CODING->destination is used as the base because
          ASSURE_DESTINATION is handed DST and may move the buffer
          (NOTE(review): confirm against the macro definition).  */
1600       produced_chars = dst - (coding->destination + coding->produced);
1601     }
1602   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1603   coding->produced_char += produced_chars;
1604   coding->produced = dst - coding->destination;
1605   return 0;
1606 }
1607
1608
1609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1610    Check if a text is encoded in one of UTF-16 based coding systems.
1611    If it is, return 1, else return 0.  */
1612
/* Nonzero if VAL, masked with 0xFC00, equals 0xD800, i.e. bits 10..15
   of VAL are 110110 (a UTF-16 high surrogate, 0xD800..0xDBFF).  Bits
   above bit 15 are not examined.  */
1613 #define UTF_16_HIGH_SURROGATE_P(val) \
1614   (((val) & 0xFC00) == 0xD800)
1615
/* Likewise for a UTF-16 low surrogate (0xDC00..0xDFFF).  */
1616 #define UTF_16_LOW_SURROGATE_P(val) \
1617   (((val) & 0xFC00) == 0xDC00)
1618
/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate as tested by
   UTF_16_LOW_SURROGATE_P.  VAL may be evaluated more than once.  */
1619 #define UTF_16_INVALID_P(val)   \
1620   (((val) == 0xFFFE)            \
1621    || ((val) == 0xFFFF)         \
1622    || UTF_16_LOW_SURROGATE_P (val))
1623
1624
/* Check whether the source of CODING looks like UTF-16 and record the
   verdict in DETECT_INFO.  CATEGORY_MASK_UTF_16 is always added to
   DETECT_INFO->checked.

   - If this is the last block and CODING->src_chars is odd, every
     UTF-16 category is rejected.
   - If the first two bytes are FF FE or FE FF, the matching signature
     categories are marked as found and the other ones are rejected.
   - If the second byte is reported as negative, every UTF-16 category
     is rejected.
   - Otherwise the AUTO, BE and LE categories are rejected and the
     following byte pairs are scanned: once 128 distinct values have
     been seen in the first (resp. second) position of a pair, the
     BE_NOSIG (resp. LE_NOSIG) category is rejected as well.

   Return 1 when a byte order mark was recognized or when the
   `no_more_source' label is reached (NOTE(review): presumably
   TWO_MORE_BYTES jumps there when the input runs out -- confirm
   against the macro); return 0 in every other case.  */
1625 static int
1626 detect_coding_utf_16 (coding, detect_info)
1627      struct coding_system *coding;
1628      struct coding_detection_info *detect_info;
1629 {
1630   const unsigned char *src = coding->source, *src_base = src;
1631   const unsigned char *src_end = coding->source + coding->src_bytes;
1632   int multibytep = coding->src_multibyte;
1633   int consumed_chars = 0;
1634   int c1, c2;
1635
1636   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* An odd count in the final block rules out every UTF-16
          category.  */
1637   if (coding->mode & CODING_MODE_LAST_BLOCK
1638       && (coding->src_chars & 1))
1639     {
1640       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1641       return 0;
1642     }
1643
       /* Look at the first two bytes for a byte order mark.  */
1644   TWO_MORE_BYTES (c1, c2);
1645   if ((c1 == 0xFF) && (c2 == 0xFE))
1646     {
1647       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1648                              | CATEGORY_MASK_UTF_16_AUTO);
1649       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1650                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1651                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1652     }
1653   else if ((c1 == 0xFE) && (c2 == 0xFF))
1654     {
1655       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1656                              | CATEGORY_MASK_UTF_16_AUTO);
1657       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1658                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1659                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1660     }
1661   else if (c2 < 0)
1662     {
1663       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1664       return 0;
1665     }
1666   else
1667     {
1668       /* We check the dispersion of Eth and Oth bytes where E is even and
1669          O is odd.  If both are high, we assume binary data.*/
           /* E and O flag the byte values already seen in the first and
              second position of a pair; E_NUM and O_NUM count them.  The
              pair read above is entered here, hence the initial 1.

              NOTE(review): only C2 is tested for a negative value before
              C1 is used as an index into E (here and in the loop) --
              confirm that TWO_MORE_BYTES cannot yield C1 < 0 together
              with C2 >= 0.  */
1670       unsigned char e[256], o[256];
1671       unsigned e_num = 1, o_num = 1;
1672
1673       memset (e, 0, 256);
1674       memset (o, 0, 256);
1675       e[c1] = 1;
1676       o[c2] = 1;
1677
           /* No signature was found; presumably these are the categories
              that require one.  */
1678       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1679                                 |CATEGORY_MASK_UTF_16_BE
1680                                 | CATEGORY_MASK_UTF_16_LE);
1681
           /* Scan until every UTF-16 category has been rejected, a
              negative second byte is seen, or TWO_MORE_BYTES leaves the
              loop.  */
1682       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1683              != CATEGORY_MASK_UTF_16)
1684         {
1685           TWO_MORE_BYTES (c1, c2);
1686           if (c2 < 0)
1687             break;
1688           if (! e[c1])
1689             {
1690               e[c1] = 1;
1691               e_num++;
1692               if (e_num >= 128)
1693                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1694             }
1695           if (! o[c2])
1696             {
1697               o[c2] = 1;
1698               o_num++;
1699               if (o_num >= 128)
1700                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1701             }
1702         }
1703       return 0;
1704     }
1705
1706  no_more_source:
1707   return 1;
1708 }
1709
/* Decode UTF-16 bytes from CODING->source, starting at offset
   CODING->consumed, and append the resulting values to
   CODING->charbuf after the CODING->charbuf_used entries already
   there.

   If the coding system is marked utf_with_bom, the first two bytes
   are compared with the byte order mark of the configured byte order;
   when they do not match they are re-read as ordinary data and an
   error is counted.  In both that case and the utf_detect_bom case
   the mark is reset to utf_without_bom.

   A high surrogate is kept in CODING_UTF_16_SURROGATE (CODING) until
   the next code unit arrives, so a pair may span two calls.

   Decoding stops when fewer than three free slots remain in the
   charbuf or when the `no_more_source' label is reached
   (NOTE(review): presumably ONE_MORE_BYTE jumps there when the input
   runs out -- confirm against the macro).  On exit
   CODING->consumed_char, CODING->consumed and CODING->charbuf_used
   are updated from the position saved at the top of the last loop
   iteration, so a partly read code unit is not counted.  */
1710 static void
1711 decode_coding_utf_16 (coding)
1712      struct coding_system *coding;
1713 {
1714   const unsigned char *src = coding->source + coding->consumed;
1715   const unsigned char *src_end = coding->source + coding->src_bytes;
1716   const unsigned char *src_base;
1717   int *charbuf = coding->charbuf + coding->charbuf_used;
1718   /* We may produce at most 3 chars in one loop.  */
1719   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1720   int consumed_chars = 0, consumed_chars_base = 0;
1721   int multibytep = coding->src_multibyte;
1722   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1723   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1724   int surrogate = CODING_UTF_16_SURROGATE (coding);
1725   Lisp_Object attr, charset_list;
1726   int eol_crlf =
1727     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes read ahead after a CR, or -1 when there are
          none.  */
1728   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1729
1730   CODING_GET_INFO (coding, attr, charset_list);
1731
1732   if (bom == utf_with_bom)
1733     {
           /* A BOM is expected: read two bytes and compare them, combined
              first byte high, with the value for the configured byte
              order.  */
1734       int c, c1, c2;
1735
1736       src_base = src;
1737       ONE_MORE_BYTE (c1);
1738       ONE_MORE_BYTE (c2);
1739       c = (c1 << 8) | c2;
1740
1741       if (endian == utf_16_big_endian
1742           ? c != 0xFEFF : c != 0xFFFE)
1743         {
1744           /* The first two bytes are not BOM.  Treat them as bytes
1745              for a normal character.  */
1746           src = src_base;
1747           coding->errors++;
1748         }
1749       CODING_UTF_16_BOM (coding) = utf_without_bom;
1750     }
1751   else if (bom == utf_detect_bom)
1752     {
1753       /* We have already tried to detect BOM and failed in
1754          detect_coding.  */
1755       CODING_UTF_16_BOM (coding) = utf_without_bom;
1756     }
1757
1758   while (1)
1759     {
1760       int c, c1, c2;
1761
           /* Remember where this code unit starts; these values are what
              gets reported as consumed on exit.  */
1762       src_base = src;
1763       consumed_chars_base = consumed_chars;
1764
1765       if (charbuf >= charbuf_end)
1766         {
               /* Output is full.  If a pair was read ahead after a CR,
                  step back over it so it is read again on the next
                  call.  */
1767           if (byte_after_cr1 >= 0)
1768             src_base -= 2;
1769           break;
1770         }
1771
1772       if (byte_after_cr1 >= 0)
1773         c1 = byte_after_cr1, byte_after_cr1 = -1;
1774       else
1775         ONE_MORE_BYTE (c1);
           /* NOTE(review): a negative value is presumably how
              ONE_MORE_BYTE reports a multibyte character found in the
              source; it is stored with its sign flipped -- confirm
              against the macro.  */
1776       if (c1 < 0)
1777         {
1778           *charbuf++ = -c1;
1779           continue;
1780         }
1781       if (byte_after_cr2 >= 0)
1782         c2 = byte_after_cr2, byte_after_cr2 = -1;
1783       else
1784         ONE_MORE_BYTE (c2);
1785       if (c2 < 0)
1786         {
               /* C1 has no partner byte: store it on its own, followed by
                  the sign-flipped C2.  */
1787           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1788           *charbuf++ = -c2;
1789           continue;
1790         }
1791       c = (endian == utf_16_big_endian
1792            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1793
1794       if (surrogate)
1795         {
               /* A high surrogate is pending from an earlier code
                  unit.  */
1796           if (! UTF_16_LOW_SURROGATE_P (c))
1797             {
                   /* It is not followed by a low surrogate: store the two
                      bytes of the pending surrogate in source order and
                      count an error.  Then either keep C as the new
                      pending high surrogate or store it.  */
1798               if (endian == utf_16_big_endian)
1799                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1800               else
1801                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1802               *charbuf++ = c1;
1803               *charbuf++ = c2;
1804               coding->errors++;
1805               if (UTF_16_HIGH_SURROGATE_P (c))
1806                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1807               else
1808                 *charbuf++ = c;
1809             }
1810           else
1811             {
                   /* Combine the pair into a code point at or above
                      0x10000 and clear the pending surrogate.  */
1812               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1813               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1814               *charbuf++ = 0x10000 + c;
1815             }
1816         }
1817       else
1818         {
1819           if (UTF_16_HIGH_SURROGATE_P (c))
1820             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1821           else
1822             {
                   /* With DOS end-of-line conversion, read the two bytes
                      after a CR ahead of time; they become C1 and C2 of
                      the next iteration.  */
1823               if (eol_crlf && c == '\r')
1824                 {
1825                   ONE_MORE_BYTE (byte_after_cr1);
1826                   ONE_MORE_BYTE (byte_after_cr2);
1827                 }
1828               *charbuf++ = c;
1829             }
1830         }
1831     }
1832
1833  no_more_source:
1834   coding->consumed_char += consumed_chars_base;
1835   coding->consumed = src_base - coding->source;
1836   coding->charbuf_used = charbuf - coding->charbuf;
1837 }
1838
/* Encode the code points held in CODING->charbuf (CODING->charbuf_used
   entries) as UTF-16 in the byte order configured for CODING and
   append the bytes to CODING->destination at offset CODING->produced.

   Unless the coding system is marked utf_without_bom, a byte order
   mark is emitted first and the mark is reset to utf_without_bom, so
   it is written only once per CODING.

   A code point above MAX_UNICODE_CHAR is replaced by
   CODING->default_char.  Values below 0x10000 are written as one
   16-bit unit, larger ones as a surrogate pair.

   CODING->produced and CODING->produced_char are updated, the result
   is recorded as CODING_RESULT_SUCCESS, and 0 is returned.  */
1839 static int
1840 encode_coding_utf_16 (coding)
1841      struct coding_system *coding;
1842 {
1843   int multibytep = coding->dst_multibyte;
1844   int *charbuf = coding->charbuf;
1845   int *charbuf_end = charbuf + coding->charbuf_used;
1846   unsigned char *dst = coding->destination + coding->produced;
1847   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1848   int safe_room = 8;
1849   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1850   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
       /* Not modified directly in this function; presumably advanced by
          the EMIT_* macros -- confirm against their definitions.  */
1851   int produced_chars = 0;
1852   Lisp_Object attrs, charset_list;
1853   int c;
1854
1855   CODING_GET_INFO (coding, attrs, charset_list);
1856
1857   if (bom != utf_without_bom)
1858     {
1859       ASSURE_DESTINATION (safe_room);
1860       if (big_endian)
1861         EMIT_TWO_BYTES (0xFE, 0xFF);
1862       else
1863         EMIT_TWO_BYTES (0xFF, 0xFE);
1864       CODING_UTF_16_BOM (coding) = utf_without_bom;
1865     }
1866
1867   while (charbuf < charbuf_end)
1868     {
1869       ASSURE_DESTINATION (safe_room);
1870       c = *charbuf++;
           /* Anything beyond MAX_UNICODE_CHAR is replaced by the default
              character of the coding system.  */
1871       if (c > MAX_UNICODE_CHAR)
1872         c = coding->default_char;
1873
1874       if (c < 0x10000)
1875         {
1876           if (big_endian)
1877             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1878           else
1879             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1880         }
1881       else
1882         {
               /* Split into a surrogate pair: C1 carries the upper bits
                  on top of 0xD800, C2 the low ten bits on top of
                  0xDC00.  */
1883           int c1, c2;
1884
1885           c -= 0x10000;
1886           c1 = (c >> 10) + 0xD800;
1887           c2 = (c & 0x3FF) + 0xDC00;
1888           if (big_endian)
1889             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1890           else
1891             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1892         }
1893     }
1894   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1895   coding->produced = dst - coding->destination;
1896   coding->produced_char += produced_chars;
1897   return 0;
1898 }
1899
1900 \f
1901 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1902
1903 /* Emacs' internal format for representation of multiple character
1904    sets is a kind of multi-byte encoding, i.e. characters are
1905    represented by variable-length sequences of one-byte codes.
1906
1907    ASCII characters and control characters (e.g. `tab', `newline') are
1908    represented by one-byte sequences which are their ASCII codes, in
1909    the range 0x00 through 0x7F.
1910
1911    8-bit characters of the range 0x80..0x9F are represented by
1912    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1913    code + 0x20).
1914
1915    8-bit characters of the range 0xA0..0xFF are represented by
1916    one-byte sequences which are their 8-bit code.
1917
1918    The other characters are represented by a sequence of `base
1919    leading-code', optional `extended leading-code', and one or two
1920    `position-code's.  The length of the sequence is determined by the
1921    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1922    whereas extended leading-code and position-code take the range 0xA0
1923    through 0xFF.  See `charset.h' for more details about leading-code
1924    and position-code.
1925
1926    --- CODE RANGE of Emacs' internal format ---
1927    character set        range
1928    -------------        -----
1929    ascii                0x00..0x7F
1930    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1931    eight-bit-graphic    0xA0..0xBF
1932    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1933    ---------------------------------------------
1934
1935    As this is the internal character representation, the format is
1936    usually not used externally (i.e. in a file or in a data sent to a
1937    process).  But, it is possible to have a text externally in this
1938    format (i.e. by encoding by the coding system `emacs-mule').
1939
1940    In that case, a sequence of one-byte codes has a slightly different
1941    form.
1942
1943    At first, all characters in eight-bit-control are represented by
1944    one-byte sequences which are their 8-bit code.
1945
1946    Next, character composition data are represented by the byte
1947    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1948    where,
1949         METHOD is 0xF2 plus one of composition method (enum
1950         composition_method),
1951
1952         BYTES is 0xA0 plus a byte length of this composition data,
1953
1954         CHARS is 0xA0 plus a number of characters composed by this
1955         data,
1956
1957         COMPONENTs are characters in multibyte form or composition
1958         rules encoded by two bytes of ASCII codes.
1959
1960    In addition, for backward compatibility, the following formats are
1961    also recognized as composition data on decoding.
1962
1963    0x80 MSEQ ...
1964    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1965
1966    Here,
1967         MSEQ is a multibyte form but in these special format:
1968           ASCII: 0xA0 ASCII_CODE+0x80,
1969           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1970         RULE is a one byte code of the range 0xA0..0xF0 that
1971         represents a composition rule.
1972   */
1973
/* Table indexed by a byte value.  The emacs-mule detector and parser
   below read emacs_mule_bytes[B] as the length in bytes (1 to 4) of
   the sequence that starts with byte B.  The table is not filled in
   in this part of the file.  */
1974 char emacs_mule_bytes[256];
1975
1976
1977 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1978    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1979    else return 0.

        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
        Scanning starts after the first CODING->head_ascii bytes.  The
        category is rejected (and 0 returned) when
          - a run following 0x80 ends within 4 bytes,
          - ESC, SI or SO is seen,
          - a leading byte is not followed by the number of bytes >= 0xA0
            that emacs_mule_bytes asks for, or
          - the input ends in the middle of a unit while
            CODING_MODE_LAST_BLOCK is set.
        Otherwise, when the `no_more_source' label is reached, the
        category is added to DETECT_INFO->found if at least one multibyte
        sequence or composition was seen, and 1 is returned.  */
1980
1981 static int
1982 detect_coding_emacs_mule (coding, detect_info)
1983      struct coding_system *coding;
1984      struct coding_detection_info *detect_info;
1985 {
1986   const unsigned char *src = coding->source, *src_base;
1987   const unsigned char *src_end = coding->source + coding->src_bytes;
1988   int multibytep = coding->src_multibyte;
1989   int consumed_chars = 0;
1990   int c;
1991   int found = 0;
1992
1993   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1994   /* A coding system of this category is always ASCII compatible.  */
1995   src += coding->head_ascii;
1996
1997   while (1)
1998     {
1999       src_base = src;
2000       ONE_MORE_BYTE (c);
           /* NOTE(review): a negative C is presumably a multibyte
              character delivered by ONE_MORE_BYTE; it is skipped --
              confirm against the macro.  */
2001       if (c < 0)
2002         continue;
2003       if (c == 0x80)
2004         {
2005           /* Perhaps the start of composite character.  We simply skip
2006              it because analyzing it is too heavy for detecting.  But,
2007              at least, we check that the composite character
2008              constitutes of more than 4 bytes.  */
               /* This inner SRC_BASE shadows the outer one, so the outer
                  value tested at `no_more_source' still points at the
                  first 0x80 of this unit.  */
2009           const unsigned char *src_base;
2010
2011         repeat:
2012           src_base = src;
2013           do
2014             {
2015               ONE_MORE_BYTE (c);
2016             }
2017           while (c >= 0xA0);
2018
               /* The count includes the byte that ended the run.  */
2019           if (src - src_base <= 4)
2020             break;
2021           found = CATEGORY_MASK_EMACS_MULE;
2022           if (c == 0x80)
2023             goto repeat;
               /* Otherwise C, the byte that ended the run, is examined
                  by the code below.  */
2024         }
2025
2026       if (c < 0x80)
2027         {
2028           if (c < 0x20
2029               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2030             break;
2031         }
2032       else
2033         {
               /* C is taken as a leading byte: emacs_mule_bytes[C] - 1
                  further bytes, each 0xA0 or above, must follow.  */
2034           int more_bytes = emacs_mule_bytes[c] - 1;
2035
2036           while (more_bytes > 0)
2037             {
2038               ONE_MORE_BYTE (c);
2039               if (c < 0xA0)
2040                 {
2041                   src--;        /* Unread the last byte.  */
2042                   break;
2043                 }
2044               more_bytes--;
2045             }
2046           if (more_bytes != 0)
2047             break;
2048           found = CATEGORY_MASK_EMACS_MULE;
2049         }
2050     }
       /* Reached only through a `break' above.  */
2051   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2052   return 0;
2053
2054  no_more_source:
       /* SRC_BASE < SRC means some bytes of the current unit had already
          been read when the input ran out.  */
2055   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2056     {
2057       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2058       return 0;
2059     }
2060   detect_info->found |= found;
2061   return 1;
2062 }
2063
2064
2065 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2066    character.  If CMP_STATUS indicates that we must expect MSEQ or
2067    RULE described above, decode it and return the negative value of
2068    the decoded character or rule.  If an invalid byte is found, return
2069    -1.  If SRC is too short, return -2.

        On success the number of bytes read is stored in *NBYTES, the
        value of the local CONSUMED_CHARS counter in *NCHARS
        (NOTE(review): presumably maintained by ONE_MORE_BYTE -- confirm
        against the macro), and, if ID is non-null, the id of the charset
        of the character in *ID.  Nothing is stored when -1 or -2 is
        returned.

        For a RULE (a byte >= 0xA0 met in old-form composition while the
        state is not COMPOSING_CHAR) the value returned is the negated
        byte itself, and *ID is left untouched.

        Aborts if emacs_mule_bytes gives a length outside 1..4 for the
        leading byte.  */
2070
2071 int
2072 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
2073      struct coding_system *coding;
2074      const unsigned char *src;
2075      int *nbytes, *nchars, *id;
2076      struct composition_status *cmp_status;
2077 {
2078   const unsigned char *src_end = coding->source + coding->src_bytes;
2079   const unsigned char *src_base = src;
2080   int multibytep = coding->src_multibyte;
2081   struct charset *charset;
2082   unsigned code;
2083   int c;
2084   int consumed_chars = 0;
2085   int mseq_found = 0;
2086
2087   ONE_MORE_BYTE (c);
2088   if (c < 0)
2089     {
           /* NOTE(review): a negative value is presumably a character
              that ONE_MORE_BYTE already decoded from a multibyte source;
              it is returned with its sign flipped and charset slot 0 --
              confirm against the macro.  */
2090       c = -c;
2091       charset = emacs_mule_charset[0];
2092     }
2093   else
2094     {
2095       if (c >= 0xA0)
2096         {
               /* A byte this high can start a unit only inside an
                  old-form composition.  */
2097           if (cmp_status->state != COMPOSING_NO
2098               && cmp_status->old_form)
2099             {
2100               if (cmp_status->state == COMPOSING_CHAR)
2101                 {
                       /* MSEQ: 0xA0 announces an ASCII code stored with
                          0x80 added; any other byte is a leading code
                          stored with 0x20 added.  */
2102                   if (c == 0xA0)
2103                     {
2104                       ONE_MORE_BYTE (c);
2105                       c -= 0x80;
2106                       if (c < 0)
2107                         goto invalid_code;
2108                     }
2109                   else
2110                     c -= 0x20;
2111                   mseq_found = 1;
2112                 }
2113               else
2114                 {
                       /* RULE: hand the byte back negated, undecoded.  */
2115                   *nbytes = src - src_base;
2116                   *nchars = consumed_chars;
2117                   return -c;
2118                 }
2119             }
2120           else
2121             goto invalid_code;
2122         }
2123
           /* C is now the leading byte; the table gives the total length
              of the sequence.  Every following byte must be 0xA0 or
              above and contributes its low 7 bits to CODE.  */
2124       switch (emacs_mule_bytes[c])
2125         {
2126         case 2:
2127           if (! (charset = emacs_mule_charset[c]))
2128             goto invalid_code;
2129           ONE_MORE_BYTE (c);
2130           if (c < 0xA0)
2131             goto invalid_code;
2132           code = c & 0x7F;
2133           break;
2134
2135         case 3:
               /* After one of these two private leading codes, the
                  second byte selects the charset and one code byte
                  follows; otherwise the leading byte selects the charset
                  and two code bytes follow.  */
2136           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2137               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2138             {
2139               ONE_MORE_BYTE (c);
2140               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
2141                 goto invalid_code;
2142               ONE_MORE_BYTE (c);
2143               if (c < 0xA0)
2144                 goto invalid_code;
2145               code = c & 0x7F;
2146             }
2147           else
2148             {
2149               if (! (charset = emacs_mule_charset[c]))
2150                 goto invalid_code;
2151               ONE_MORE_BYTE (c);
2152               if (c < 0xA0)
2153                 goto invalid_code;
2154               code = (c & 0x7F) << 8;
2155               ONE_MORE_BYTE (c);
2156               if (c < 0xA0)
2157                 goto invalid_code;
2158               code |= c & 0x7F;
2159             }
2160           break;
2161
2162         case 4:
               /* The second byte selects the charset; two code bytes
                  follow.  */
2163           ONE_MORE_BYTE (c);
2164           if (c < 0 || ! (charset = emacs_mule_charset[c]))
2165             goto invalid_code;
2166           ONE_MORE_BYTE (c);
2167           if (c < 0xA0)
2168             goto invalid_code;
2169           code = (c & 0x7F) << 8;
2170           ONE_MORE_BYTE (c);
2171           if (c < 0xA0)
2172             goto invalid_code;
2173           code |= c & 0x7F;
2174           break;
2175
2176         case 1:
               /* A single byte stands for itself, in the ASCII or the
                  eight-bit charset.  */
2177           code = c;
2178           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2179                                      ? charset_ascii : charset_eight_bit);
2180           break;
2181
2182         default:
2183           abort ();
2184         }
2185       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, code, c);
2186       if (c < 0)
2187         goto invalid_code;
2188     }
2189   *nbytes = src - src_base;
2190   *nchars = consumed_chars;
2191   if (id)
2192     *id = charset->id;
2193   return (mseq_found ? -c : c);
2194
2195  no_more_source:
2196   return -2;
2197
2198  invalid_code:
2199   return -1;
2200 }
2201
2202
2203 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2204
2205 /* Handle these composition sequences ('|': the end of header elements,
2206    BYTES and CHARS >= 0xA0):
2207
2208    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2209    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2210    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2211
2212    and these old form:
2213
2214    (4) relative composition: 0x80 | MSEQ ... MSEQ
2215    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2216
2217    When the starter 0x80 and the following header elements are found,
2218    this annotation header is produced.
2219
2220         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2221
2222    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2223    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2224
2225    Then, upon reading the following elements, these codes are produced
2226    until the composition end is found:
2227
2228    (1) CHAR ... CHAR
2229    (2) ALT ... ALT CHAR ... CHAR
2230    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2231    (4) CHAR ... CHAR
2232    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2233
2234    When the composition end is found, LENGTH and NCHARS in the
2235    annotation header are updated as below:
2236
2237    (1) LENGTH: unchanged, NCHARS: unchanged
2238    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2239    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2240    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2241    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2242
2243    If an error is found while composing, the annotation header is
2244    changed to the original composition header (plus filler -1s) as
2245    below:
2246
2247    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2248    (5)          [ 0x80 0xFF -1 -1 -1 ]
2249
2250    and the sequence [ -2 DECODED-RULE ] is changed to the original
2251    byte sequence as below:
2252         o the original byte sequence is B: [ B -1 ]
2253         o the original byte sequence is B1 B2: [ B1 B2 ]
2254
2255    Most of the routines are implemented by macros because many
2256    variables and labels in the caller decode_coding_emacs_mule must be
2257    accessible, and they are usually called just once (thus doesn't
2258    increase the size of compiled object).  */
2259
2260 /* Decode a composition rule represented by C as a component of
2261    composition sequence of Emacs 20 style.  Set RULE to the decoded
2262    rule.

        C is modified in place: 0xA0 is subtracted from it.  The result
        must lie in 0..80, otherwise control jumps to the `invalid_code'
        label of the enclosing function.  The value is split into C / 9
        and C % 9, a 4 in either part is replaced by 10, and the two
        parts are combined with COMPOSITION_ENCODE_RULE.  */
2263
2264 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2265   do {                                                  \
2266     int gref, nref;                                     \
2267                                                         \
2268     c -= 0xA0;                                          \
2269     if (c < 0 || c >= 81)                               \
2270       goto invalid_code;                                \
2271     gref = c / 9, nref = c % 9;                         \
2272     if (gref == 4) gref = 10;                           \
2273     if (nref == 4) nref = 10;                           \
2274     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2275   } while (0)
2276
2277
2278 /* Decode a composition rule represented by C and the following byte
2279    at SRC as a component of composition sequence of Emacs 21 style.
2280    Set RULE to the decoded rule.

        C minus 0x20 and the next byte (read into C with ONE_MORE_BYTE)
        minus 0x20 must each lie in 0..80, otherwise control jumps to the
        `invalid_code' label of the enclosing function.  The two values
        are combined with COMPOSITION_ENCODE_RULE.  On exit C holds the
        second byte.  */
2281
2282 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2283   do {                                                  \
2284     int gref, nref;                                     \
2285                                                         \
2286     gref = c - 0x20;                                    \
2287     if (gref < 0 || gref >= 81)                         \
2288       goto invalid_code;                                \
2289     ONE_MORE_BYTE (c);                                  \
2290     nref = c - 0x20;                                    \
2291     if (nref < 0 || nref >= 81)                         \
2292       goto invalid_code;                                \
2293     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2294   } while (0)
2295
2296
2297 /* Start of Emacs 21 style format.  C already holds the first header
2298    byte (0xF2 + METHOD); the next two bytes at SRC are 0xA0 + BYTES and
2299    0xA0 + CHARS, where BYTES is the byte length of this composition
2300    information and CHARS is the number of characters composed by it.

        Control jumps to the `invalid_code' label of the enclosing
        function when the BYTES byte is negative, when BYTES is below 3,
        when METHOD is COMPOSITION_RELATIVE and BYTES is not 4, or when
        CHARS is not in 1..MAX_COMPOSITION_COMPONENTS - 1.  Otherwise
        CMP_STATUS is set up for a new-form composition (state
        COMPOSING_CHAR for a relative composition, else
        COMPOSING_COMPONENT_CHAR; ncomps = BYTES - 4) and
        ADD_COMPOSITION_DATA is invoked on CHARBUF.

        Uses C, SRC, CHARBUF and CMP_STATUS of the enclosing function.  */
2301
2302 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2303   do {                                                                  \
2304     enum composition_method method = c - 0xF2;                          \
2305     int *charbuf_base = charbuf;                                        \
2306     int nbytes, nchars;                                                 \
2307                                                                         \
2308     ONE_MORE_BYTE (c);                                                  \
2309     if (c < 0)                                                          \
2310       goto invalid_code;                                                \
2311     nbytes = c - 0xA0;                                                  \
2312     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2313       goto invalid_code;                                                \
2314     ONE_MORE_BYTE (c);                                                  \
2315     nchars = c - 0xA0;                                                  \
2316     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2317       goto invalid_code;                                                \
2318     cmp_status->old_form = 0;                                           \
2319     cmp_status->method = method;                                        \
2320     if (method == COMPOSITION_RELATIVE)                                 \
2321       cmp_status->state = COMPOSING_CHAR;                               \
2322     else                                                                \
2323       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2324     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2325     cmp_status->nchars = nchars;                                        \
2326     cmp_status->ncomps = nbytes - 4;                                    \
2327     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2328   } while (0)
2329
2330
2331 /* Start of Emacs 20 style format for relative composition.

        Marks CMP_STATUS as an old-form COMPOSITION_RELATIVE composition
        in state COMPOSING_CHAR with zero characters and components so
        far, and invokes ADD_COMPOSITION_DATA on CHARBUF with 0 for both
        counts.  Uses CHARBUF and CMP_STATUS of the enclosing function.  */
2332
2333 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2334   do {                                                          \
2335     cmp_status->old_form = 1;                                   \
2336     cmp_status->method = COMPOSITION_RELATIVE;                  \
2337     cmp_status->state = COMPOSING_CHAR;                         \
2338     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2339     cmp_status->nchars = cmp_status->ncomps = 0;                \
2340     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2341   } while (0)
2342
2343
2344 /* Start of Emacs 20 style format for rule-base composition.

        Same as DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that the
        method recorded in CMP_STATUS is COMPOSITION_WITH_RULE.  */
2345
2346 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2347   do {                                                          \
2348     cmp_status->old_form = 1;                                   \
2349     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2350     cmp_status->state = COMPOSING_CHAR;                         \
2351     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2352     cmp_status->nchars = cmp_status->ncomps = 0;                \
2353     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2354   } while (0)
2355
2356
/* Read the byte at SRC into C and start the kind of composition it
   announces:

     0xF2 + a method from COMPOSITION_RELATIVE through
       COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header;
     0xA0..0xBF: Emacs 20 style relative composition; SRC is moved
       back so that the byte is read again as a component;
     0xFF: Emacs 20 style rule-base composition.

   A negative byte or any other value makes control jump to the
   `invalid_code' label of the enclosing function.  Uses C and SRC of
   the enclosing function.  */
2357 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2358   do {                                                  \
2359     const unsigned char *current_src = src;             \
2360                                                         \
2361     ONE_MORE_BYTE (c);                                  \
2362     if (c < 0)                                          \
2363       goto invalid_code;                                \
2364     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2365         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2366       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2367     else if (c < 0xA0)                                  \
2368       goto invalid_code;                                \
2369     else if (c < 0xC0)                                  \
2370       {                                                 \
2371         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2372         /* Re-read C as a composition component.  */    \
2373         src = current_src;                              \
2374       }                                                 \
2375     else if (c == 0xFF)                                 \
2376       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2377     else                                                \
2378       goto invalid_code;                                \
2379   } while (0)
2380
     /* Close the composition currently being decoded.  IDX is minus
        cmp_status->length, so charbuf[idx] is the element `length'
        entries before the caller's current `charbuf' position
        (presumably the first element of the composition annotation
        written by ADD_COMPOSITION_DATA -- confirm there).

        For an old-form composition the element at idx + 2 receives the
        number of characters counted in cmp_status->nchars.  Otherwise,
        for methods greater than COMPOSITION_RELATIVE, the element at idx
        is set to charbuf[idx + 2] minus cmp_status->length.  In every
        case the state is reset to COMPOSING_NO.

        Statement macro; uses the caller's `cmp_status' and `charbuf'.  */
2381 #define EMACS_MULE_COMPOSITION_END()                            \
2382   do {                                                          \
2383     int idx = - cmp_status->length;                             \
2384                                                                 \
2385     if (cmp_status->old_form)                                   \
2386       charbuf[idx + 2] = cmp_status->nchars;                    \
2387     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2388       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2389     cmp_status->state = COMPOSING_NO;                           \
2390   } while (0)
2391
2392
     /* Terminate the composition described by CMP_STATUS before it was
        completed normally.  CHARBUF points just past the elements stored
        for that composition so far; its first element is
        cmp_status->length entries before CHARBUF.

        If the composition is old form and already holds at least one
        character, it is kept: the character count is stored at offset 2
        from its start, and a trailing rule that turned out to be invalid
        is replaced by a raw byte.  Otherwise the composition is dropped
        and the elements at its start are overwritten with raw-byte
        characters.

        The state is reset to COMPOSING_NO.  The return value (0, 1 or 4)
        is what the caller adds to its running character offset.  */
2393 static int
2394 emacs_mule_finish_composition (charbuf, cmp_status)
2395      int *charbuf;
2396      struct composition_status *cmp_status;
2397 {
2398   int idx = - cmp_status->length;
2399   int new_chars;
2400
2401   if (cmp_status->old_form && cmp_status->nchars > 0)
2402     {
2403       charbuf[idx + 2] = cmp_status->nchars;
2404       new_chars = 0;
2405       if (cmp_status->method == COMPOSITION_WITH_RULE
2406           && cmp_status->state == COMPOSING_CHAR)
2407         {
2408           /* The last rule was invalid.  */
               /* Rebuild a byte value from the last stored element by
                  adding 0xA0, and replace the last two elements with
                  that byte as a BYTE8 character followed by -1.  */
2409           int rule = charbuf[-1] + 0xA0;
2410
2411           charbuf[-2] = BYTE8_TO_CHAR (rule);
2412           charbuf[-1] = -1;
2413           new_chars = 1;
2414         }
2415     }
2416   else
2417     {
           /* The first element becomes the raw byte 0x80.  */
2418       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2419
2420       if (cmp_status->method == COMPOSITION_WITH_RULE)
2421         {
               /* NOTE(review): the -3 and 0 stored after the raw 0xFF
                  look like filler for the remaining slots; confirm how
                  the consumer of charbuf interprets them.  */
2422           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2423           charbuf[idx++] = -3;
2424           charbuf[idx++] = 0;
2425           new_chars = 1;
2426         }
2427       else
2428         {
               /* IDX has already moved past the 0x80 element, so these
                  read the two elements after the next one before the
                  assignments below overwrite them.  */
2429           int nchars = charbuf[idx + 1] + 0xA0;
2430           int nbytes = charbuf[idx + 2] + 0xA0;
2431
2432           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2433           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2434           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2435           charbuf[idx++] = -1;
2436           new_chars = 4;
2437         }
2438     }
2439   cmp_status->state = COMPOSING_NO;
2440   return new_chars;
2441 }
2442
     /* If a composition is in progress (state other than COMPOSING_NO),
        terminate it with emacs_mule_finish_composition and add the value
        that function returns to the caller's `char_offset'.  Statement
        macro; uses the caller's `cmp_status', `charbuf' and
        `char_offset'.  */
2443 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2444   do {                                                                    \
2445     if (cmp_status->state != COMPOSING_NO)                                \
2446       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2447   } while (0)
2448
2449
     /* Decode emacs-mule encoded bytes for CODING.

        Input is read from coding->source starting at coding->consumed, up
        to coding->src_bytes.  Decoded characters and annotations are
        appended to coding->charbuf starting at coding->charbuf_used.  On
        return coding->consumed, coding->consumed_char and
        coding->charbuf_used are updated.

        The state of a composition that is still open when the function
        returns lives in coding->spec.emacs_mule.cmp_status; unless this
        is the last block, the elements already produced for it are saved
        in its `carryover' array and restored on the next call.

        A byte sequence that cannot be decoded is passed through one byte
        at a time (ASCII unchanged, other bytes as BYTE8 characters) and
        counted in coding->errors.

        NOTE(review): `multibytep', `consumed_chars', `src' and `src_end'
        are presumably used by ONE_MORE_BYTE, which presumably also jumps
        to `no_more_source' when the input runs out; the macro is defined
        outside this section.  */
2450 static void
2451 decode_coding_emacs_mule (coding)
2452      struct coding_system *coding;
2453 {
2454   const unsigned char *src = coding->source + coding->consumed;
2455   const unsigned char *src_end = coding->source + coding->src_bytes;
2456   const unsigned char *src_base;
2457   int *charbuf = coding->charbuf + coding->charbuf_used;
2458   /* We may produce two annotations (charset and composition) in one
2459      loop and one more charset annotation at the end.  */
2460   int *charbuf_end
2461     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2462   int consumed_chars = 0, consumed_chars_base;
2463   int multibytep = coding->src_multibyte;
2464   Lisp_Object attrs, charset_list;
2465   int char_offset = coding->produced_char;
2466   int last_offset = char_offset;
2467   int last_id = charset_ascii;
2468   int eol_crlf =
2469     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2470   int byte_after_cr = -1;
2471   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2472
2473   CODING_GET_INFO (coding, attrs, charset_list);
2474
       /* A composition was left open by the previous call: put its saved
          elements back at the front of the output.  */
2475   if (cmp_status->state != COMPOSING_NO)
2476     {
2477       int i;
2478
2479       for (i = 0; i < cmp_status->length; i++)
2480         *charbuf++ = cmp_status->carryover[i];
2481       coding->annotated = 1;
2482     }
2483
2484   while (1)
2485     {
2486       int c, id;
2487
           /* Remember where this unit starts, so that the invalid_code
              and retry paths can rewind to it and so that the final
              consumed counts exclude a partially read unit.  */
2488       src_base = src;
2489       consumed_chars_base = consumed_chars;
2490
2491       if (charbuf >= charbuf_end)
2492         {
               /* Out of room.  A byte pre-read after a CR has not been
                  processed yet, so step back over it before stopping.  */
2493           if (byte_after_cr >= 0)
2494             src_base--;
2495           break;
2496         }
2497
2498       if (byte_after_cr >= 0)
2499         c = byte_after_cr, byte_after_cr = -1;
2500       else
2501         ONE_MORE_BYTE (c);
2502
           /* Either a negative value from ONE_MORE_BYTE, which is stored
              negated as one character, or 0x80, which introduces a
              composition.  Both first end any composition in progress.  */
2503       if (c < 0 || c == 0x80)
2504         {
2505           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2506           if (c < 0)
2507             {
2508               *charbuf++ = -c;
2509               char_offset++;
2510             }
2511           else
2512             DECODE_EMACS_MULE_COMPOSITION_START ();
2513           continue;
2514         }
2515
2516       if (c < 0x80)
2517         {
               /* With DOS end-of-line decoding, pre-read the byte after
                  a CR; it is consumed at the top of the next
                  iteration.  */
2518           if (eol_crlf && c == '\r')
2519             ONE_MORE_BYTE (byte_after_cr);
2520           id = charset_ascii;
2521           if (cmp_status->state != COMPOSING_NO)
2522             {
2523               if (cmp_status->old_form)
2524                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2525               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2526                 cmp_status->ncomps--;
2527             }
2528         }
2529       else
2530         {
2531           int nchars, nbytes;
2532           /* emacs_mule_char can load a charset map from a file, which
2533              allocates a large structure and might cause buffer text
2534              to be relocated as result.  Thus, we need to remember the
2535              original pointer to buffer text, and fixup all related
2536              pointers after the call.  */
2537           const unsigned char *orig = coding->source;
2538           EMACS_INT offset;
2539
2540           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2541                                cmp_status);
2542           offset = coding->source - orig;
2543           if (offset)
2544             {
2545               src += offset;
2546               src_base += offset;
2547               src_end += offset;
2548             }
               /* -1 reports an invalid sequence.  -2 stops decoding here
                  (presumably the sequence is incomplete -- confirm in
                  emacs_mule_char).  Other negative values fall through
                  and are handled by the composition states below.  */
2549           if (c < 0)
2550             {
2551               if (c == -1)
2552                 goto invalid_code;
2553               if (c == -2)
2554                 break;
2555             }
2556           src = src_base + nbytes;
2557           consumed_chars = consumed_chars_base + nchars;
2558           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2559             cmp_status->ncomps -= nchars;
2560         }
2561
2562       /* Now if C >= 0, we found a normally encoded character, if C <
2563          0, we found an old-style composition component character or
2564          rule.  */
2565
2566       if (cmp_status->state == COMPOSING_NO)
2567         {
               /* The charset changed: emit a charset annotation for the
                  run that just ended (unless it was ASCII) and start a
                  new run at the current character offset.  */
2568           if (last_id != id)
2569             {
2570               if (last_id != charset_ascii)
2571                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2572                                   last_id);
2573               last_id = id;
2574               last_offset = char_offset;
2575             }
2576           *charbuf++ = c;
2577           char_offset++;
2578         }
2579       else if (cmp_status->state == COMPOSING_CHAR)
2580         {
2581           if (cmp_status->old_form)
2582             {
                   /* In an old-form composition a normally encoded
                      character ends the composition; a negative C is a
                      component and is stored negated.  */
2583               if (c >= 0)
2584                 {
2585                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2586                   *charbuf++ = c;
2587                   char_offset++;
2588                 }
2589               else
2590                 {
2591                   *charbuf++ = -c;
2592                   cmp_status->nchars++;
2593                   cmp_status->length++;
2594                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2595                     EMACS_MULE_COMPOSITION_END ();
2596                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2597                     cmp_status->state = COMPOSING_RULE;
2598                 }
2599             }
2600           else
2601             {
                   /* New form: nchars counts down the characters still
                      expected; the composition ends when it reaches
                      zero.  */
2602               *charbuf++ = c;
2603               cmp_status->length++;
2604               cmp_status->nchars--;
2605               if (cmp_status->nchars == 0)
2606                 EMACS_MULE_COMPOSITION_END ();
2607             }
2608         }
2609       else if (cmp_status->state == COMPOSING_RULE)
2610         {
2611           int rule;
2612
2613           if (c >= 0)
2614             {
2615               EMACS_MULE_COMPOSITION_END ();
2616               *charbuf++ = c;
2617               char_offset++;
2618             }
2619           else
2620             {
                   /* A rule is stored as the pair -2, RULE and switches
                      the state back to expecting a character.  */
2621               c = -c;
2622               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2623               if (rule < 0)
2624                 goto invalid_code;
2625               *charbuf++ = -2;
2626               *charbuf++ = rule;
2627               cmp_status->length += 2;
2628               cmp_status->state = COMPOSING_CHAR;
2629             }
2630         }
2631       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2632         {
               /* ncomps was already decremented above for this
                  character.  Zero means the components are complete; a
                  negative count abandons the composition.  */
2633           *charbuf++ = c;
2634           cmp_status->length++;
2635           if (cmp_status->ncomps == 0)
2636             cmp_status->state = COMPOSING_CHAR;
2637           else if (cmp_status->ncomps > 0)
2638             {
2639               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2640                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2641             }
2642           else
2643             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2644         }
2645       else                      /* COMPOSING_COMPONENT_RULE */
2646         {
2647           int rule;
2648
2649           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2650           if (rule < 0)
2651             goto invalid_code;
2652           *charbuf++ = -2;
2653           *charbuf++ = rule;
2654           cmp_status->length += 2;
2655           cmp_status->ncomps--;
2656           if (cmp_status->ncomps > 0)
2657             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2658           else
2659             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2660         }
2661       continue;
2662
2663     retry:
2664       src = src_base;
2665       consumed_chars = consumed_chars_base;
2666       continue;
2667
2668     invalid_code:
           /* End any composition, rewind to the start of the offending
              unit, pass its first byte through unchanged (ASCII) or as a
              BYTE8 character, and count the error.  Decoding resumes at
              the following byte.  */
2669       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2670       src = src_base;
2671       consumed_chars = consumed_chars_base;
2672       ONE_MORE_BYTE (c);
2673       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2674       char_offset++;
2675       coding->errors++;
2676     }
2677
2678  no_more_source:
       /* On the last block an open composition is finished here.
          Otherwise its elements are taken back out of charbuf and saved
          in the carryover array for the next call.  */
2679   if (cmp_status->state != COMPOSING_NO)
2680     {
2681       if (coding->mode & CODING_MODE_LAST_BLOCK)
2682         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2683       else
2684         {
2685           int i;
2686
2687           charbuf -= cmp_status->length;
2688           for (i = 0; i < cmp_status->length; i++)
2689             cmp_status->carryover[i] = charbuf[i];
2690         }
2691     }
2692   if (last_id != charset_ascii)
2693     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2694   coding->consumed_char += consumed_chars_base;
2695   coding->consumed = src_base - coding->source;
2696   coding->charbuf_used = charbuf - coding->charbuf;
2697 }
2698
2699
/* Store in CODES (an array of at least two elements) the leading
   code(s) used in the emacs-mule format for the charset whose
   emacs-mule id is ID:

     ID <  0xA0         -> ID itself, no second code (CODES[1] = 0)
     0xA0 <= ID < 0xE0  -> 0x9A followed by ID
     0xE0 <= ID < 0xF0  -> 0x9B followed by ID
     0xF0 <= ID < 0xF5  -> 0x9C followed by ID
     otherwise          -> 0x9D followed by ID

   ID and CODES are evaluated more than once, so pass expressions
   without side effects.  The expansion deliberately has no trailing
   semicolon: the caller supplies it, which keeps the macro usable as
   the body of an `if' that has an `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2713
2714
     /* Encode the characters in coding->charbuf (coding->charbuf_used
        elements) into the emacs-mule format, writing bytes at
        coding->destination + coding->produced.

        Negative elements of charbuf introduce annotations and produce no
        output.  A charset annotation selects a preferred charset for the
        characters that follow, provided that charset is in the coding
        system's charset list.  ASCII characters and BYTE8 characters are
        written as a single byte.  Every other character is written as
        the leading code(s) of its charset followed by one or two code
        bytes with the high bit set.

        On completion the result CODING_RESULT_SUCCESS is recorded,
        coding->produced_char and coding->produced are updated, and 0 is
        returned.

        NOTE(review): `multibytep', `dst', `dst_end' and `produced_chars'
        are presumably used by ASSURE_DESTINATION and the EMIT_* macros,
        which are defined outside this section.  */
2715 static int
2716 encode_coding_emacs_mule (coding)
2717      struct coding_system *coding;
2718 {
2719   int multibytep = coding->dst_multibyte;
2720   int *charbuf = coding->charbuf;
2721   int *charbuf_end = charbuf + coding->charbuf_used;
2722   unsigned char *dst = coding->destination + coding->produced;
2723   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2724   int safe_room = 8;
2725   int produced_chars = 0;
2726   Lisp_Object attrs, charset_list;
2727   int c;
2728   int preferred_charset_id = -1;
2729
2730   CODING_GET_INFO (coding, attrs, charset_list);
       /* Force the coding system's charset list to be
          Vemacs_mule_charset_list.  */
2731   if (! EQ (charset_list, Vemacs_mule_charset_list))
2732     {
2733       CODING_ATTR_CHARSET_LIST (attrs)
2734         = charset_list = Vemacs_mule_charset_list;
2735     }
2736
2737   while (charbuf < charbuf_end)
2738     {
2739       ASSURE_DESTINATION (safe_room);
2740       c = *charbuf++;
2741
2742       if (c < 0)
2743         {
2744           /* Handle an annotation.  */
               /* -C is the annotation length including the element just
                  read; the element now under charbuf selects its kind.
                  NOTE(review): the charset id is read from charbuf[3],
                  counted after the length element was consumed; confirm
                  that this matches the layout ADD_CHARSET_DATA
                  writes.  */
2745           switch (*charbuf)
2746             {
2747             case CODING_ANNOTATE_COMPOSITION_MASK:
2748               /* Not yet implemented.  */
2749               break;
2750             case CODING_ANNOTATE_CHARSET_MASK:
2751               preferred_charset_id = charbuf[3];
2752               if (preferred_charset_id >= 0
2753                   && NILP (Fmemq (make_number (preferred_charset_id),
2754                                   charset_list)))
2755                 preferred_charset_id = -1;
2756               break;
2757             default:
2758               abort ();
2759             }
               /* Skip the remaining elements of the annotation.  */
2760           charbuf += -c - 1;
2761           continue;
2762         }
2763
2764       if (ASCII_CHAR_P (c))
2765         EMIT_ONE_ASCII_BYTE (c);
2766       else if (CHAR_BYTE8_P (c))
2767         {
2768           c = CHAR_TO_BYTE8 (c);
2769           EMIT_ONE_BYTE (c);
2770         }
2771       else
2772         {
2773           struct charset *charset;
2774           unsigned code;
2775           int dimension;
2776           int emacs_mule_id;
2777           unsigned char leading_codes[2];
2778
               /* Try the preferred charset first; fall back to searching
                  the charset list when it does not contain C.  */
2779           if (preferred_charset_id >= 0)
2780             {
2781               charset = CHARSET_FROM_ID (preferred_charset_id);
2782               if (CHAR_CHARSET_P (c, charset))
2783                 code = ENCODE_CHAR (charset, c);
2784               else
2785                 charset = char_charset (c, charset_list, &code);
2786             }
2787           else
2788             charset = char_charset (c, charset_list, &code);
               /* No charset can encode C: substitute the coding system's
                  default character.
                  NOTE(review): `charset' is not checked again after the
                  second char_charset call; this presumably relies on the
                  default character always being encodable -- confirm.  */
2789           if (! charset)
2790             {
2791               c = coding->default_char;
2792               if (ASCII_CHAR_P (c))
2793                 {
2794                   EMIT_ONE_ASCII_BYTE (c);
2795                   continue;
2796                 }
2797               charset = char_charset (c, charset_list, &code);
2798             }
2799           dimension = CHARSET_DIMENSION (charset);
2800           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2801           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2802           EMIT_ONE_BYTE (leading_codes[0]);
2803           if (leading_codes[1])
2804             EMIT_ONE_BYTE (leading_codes[1]);
               /* One code byte for a dimension-1 charset, otherwise two
                  (high byte first); each has bit 0x80 set.  */
2805           if (dimension == 1)
2806             EMIT_ONE_BYTE (code | 0x80);
2807           else
2808             {
2809               code |= 0x8080;
2810               EMIT_ONE_BYTE (code >> 8);
2811               EMIT_ONE_BYTE (code & 0xFF);
2812             }
2813         }
2814     }
2815   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2816   coding->produced_char += produced_chars;
2817   coding->produced = dst - coding->destination;
2818   return 0;
2819 }
2820
2821 \f
2822 /*** 7. ISO2022 handlers ***/
2823
2824 /* The following note describes the coding system ISO2022 briefly.
2825    Since the intention of this note is to help understand the
2826    functions in this file, some parts are NOT ACCURATE or are OVERLY
2827    SIMPLIFIED.  For thorough understanding, please refer to the
2828    original document of ISO2022.  This is equivalent to the standard
2829    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2830
2831    ISO2022 provides many mechanisms to encode several character sets
2832    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2833    is encoded using bytes less than 128.  This may make the encoded
2834    text a little bit longer, but the text passes more easily through
2835    several types of gateway, some of which strip off the MSB (Most
2836    Significant Bit).
2837
2838    There are two kinds of character sets: control character sets and
2839    graphic character sets.  The former contain control characters such
2840    as `newline' and `escape' to provide control functions (control
2841    functions are also provided by escape sequences).  The latter
2842    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2843    two control character sets and many graphic character sets.
2844
2845    Graphic character sets are classified into one of the following
2846    four classes, according to the number of bytes (DIMENSION) and
2847    number of characters in one dimension (CHARS) of the set:
2848    - DIMENSION1_CHARS94
2849    - DIMENSION1_CHARS96
2850    - DIMENSION2_CHARS94
2851    - DIMENSION2_CHARS96
2852
2853    In addition, each character set is assigned an identification tag,
2854    unique for each set, called the "final character" (denoted as <F>
2855    hereafter).  The <F> of each character set is decided by ECMA(*)
2856    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2857    (0x30..0x3F are for private use only).
2858
2859    Note (*): ECMA = European Computer Manufacturers Association
2860
2861    Here are examples of graphic character sets [NAME(<F>)]:
2862         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2863         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2864         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2865         o DIMENSION2_CHARS96 -- none for the moment
2866
2867    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2868         C0 [0x00..0x1F] -- control character plane 0
2869         GL [0x20..0x7F] -- graphic character plane 0
2870         C1 [0x80..0x9F] -- control character plane 1
2871         GR [0xA0..0xFF] -- graphic character plane 1
2872
2873    A control character set is directly designated and invoked to C0 or
2874    C1 by an escape sequence.  The most common case is that:
2875    - ISO646's  control character set is designated/invoked to C0, and
2876    - ISO6429's control character set is designated/invoked to C1,
2877    and usually these designations/invocations are omitted in encoded
2878    text.  In a 7-bit environment, only C0 can be used, and a control
2879    character for C1 is encoded by an appropriate escape sequence to
2880    fit into the environment.  All control characters for C1 are
2881    defined to have corresponding escape sequences.
2882
2883    A graphic character set is at first designated to one of four
2884    graphic registers (G0 through G3), then these graphic registers are
2885    invoked to GL or GR.  These designations and invocations can be
2886    done independently.  The most common case is that G0 is invoked to
2887    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2888    these invocations and designations are omitted in encoded text.
2889    In a 7-bit environment, only GL can be used.
2890
2891    When a graphic character set of CHARS94 is invoked to GL, codes
2892    0x20 and 0x7F of the GL area work as control characters SPACE and
2893    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2894    be used.
2895
2896    There are two ways of invocation: locking-shift and single-shift.
2897    With locking-shift, the invocation lasts until the next different
2898    invocation, whereas with single-shift, the invocation affects the
2899    following character only and doesn't affect the locking-shift
2900    state.  Invocations are done by the following control characters or
2901    escape sequences:
2902
2903    ----------------------------------------------------------------------
2904    abbrev  function                  cntrl escape seq   description
2905    ----------------------------------------------------------------------
2906    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2907    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2908    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2909    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2910    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2911    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2912    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2913    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2914    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2915    ----------------------------------------------------------------------
2916    (*) These are not used by any known coding system.
2917
2918    Control characters for these functions are defined by macros
2919    ISO_CODE_XXX in `coding.h'.
2920
2921    Designations are done by the following escape sequences:
2922    ----------------------------------------------------------------------
2923    escape sequence      description
2924    ----------------------------------------------------------------------
2925    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2926    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2927    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2928    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2929    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2930    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2931    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2932    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2933    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2934    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2935    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2936    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2937    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2938    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2939    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2940    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2941    ----------------------------------------------------------------------
2942
2943    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2944    of dimension 1, chars 94, and final character <F>, etc...
2945
2946    Note (*): Although these designations are not allowed in ISO2022,
2947    Emacs accepts them on decoding, and produces them on encoding
2948    CHARS96 character sets in a coding system which is characterized as
2949    7-bit environment, non-locking-shift, and non-single-shift.
2950
2951    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2952    '(' must be omitted.  We refer to this as "short-form" hereafter.
2953
2954    Now you may notice that there are a lot of ways of encoding the
2955    same multilingual text in ISO2022.  Actually, there exist many
2956    coding systems such as Compound Text (used in X11's inter client
2957    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2958    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2959    localized platforms), and all of these are variants of ISO2022.
2960
2961    In addition to the above, Emacs handles two more kinds of escape
2962    sequences: ISO6429's direction specification and Emacs' private
2963    sequence for specifying character composition.
2964
2965    ISO6429's direction specification takes the following form:
2966         o CSI ']'      -- end of the current direction
2967         o CSI '0' ']'  -- end of the current direction
2968         o CSI '1' ']'  -- start of left-to-right text
2969         o CSI '2' ']'  -- start of right-to-left text
2970    The control character CSI (0x9B: control sequence introducer) is
2971    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2972
2973    Character composition specification takes the following form:
2974         o ESC '0' -- start relative composition
2975         o ESC '1' -- end composition
2976         o ESC '2' -- start rule-base composition (*)
2977         o ESC '3' -- start relative composition with alternate chars  (**)
2978         o ESC '4' -- start rule-base composition with alternate chars  (**)
2979   Since these are not standard escape sequences of any ISO standard,
2980   the use of them with these meanings is restricted to Emacs only.
2981
2982   (*) This form is used only in Emacs 20.7 and older versions,
2983   but newer versions can safely decode it.
2984   (**) This form is used only in Emacs 21.1 and newer versions,
2985   and older versions can't decode it.
2986
2987   Here's a list of example usages of these composition escape
2988   sequences (categorized by `enum composition_method').
2989
2990   COMPOSITION_RELATIVE:
2991         ESC 0 CHAR [ CHAR ] ESC 1
2992   COMPOSITION_WITH_RULE:
2993         ESC 2 CHAR [ RULE CHAR ] ESC 1
2994   COMPOSITION_WITH_ALTCHARS:
2995         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2996   COMPOSITION_WITH_RULE_ALTCHARS:
2997         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2998
     /* Table of 256 `enum iso_code_class_type' values.
        NOTE(review): presumably indexed by byte value to classify each
        byte for the ISO 2022 handlers, and filled in elsewhere -- neither
        is visible in this section; confirm.  */
2999 enum iso_code_class_type iso_code_class[256];
3000
     /* Nonzero if the charset whose id is ID has a usable entry in
        CODING's safe_charsets table: ID does not exceed
        CODING->max_charset_id and the entry for ID is not 255.
        setup_iso_safe_charsets fills the table with 255 and then stores
        a register number for each charset it accepts.  ID is evaluated
        twice and is not checked for being negative.  */
3001 #define SAFE_CHARSET_P(coding, id)      \
3002   ((id) <= (coding)->max_charset_id     \
3003    && (coding)->safe_charsets[id] != 255)
3004
3005
     /* Nonzero if CODING_ISO_INITIAL, applied to the coding system
        registered for CATEGORY in coding_categories and to register
        number 1, is not negative.
        NOTE(review): presumably this means a charset is initially
        designated to G1, so shift-out has something to invoke --
        confirm against the definition of CODING_ISO_INITIAL.  */
3006 #define SHIFT_OUT_OK(category)  \
3007   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
3008
     /* Build the safe-charsets table for the ISO 2022 coding system whose
        attribute vector is ATTRS and store it in the
        coding_attr_safe_charsets slot.

        The table is a Lisp string indexed by charset id, one byte per id
        from 0 to the largest id in the charset list.  A byte of 255
        means no register was assigned to that charset; any other value
        is the register number stored for it.

        If the coding system has the CODING_ISO_FLAG_FULL_SUPPORT flag
        and its charset list is not Viso_2022_charset_list, the list is
        replaced by Viso_2022_charset_list and the table is rebuilt.
        Otherwise, if a table (a string) is already present, nothing is
        done.  */
3009 static void
3010 setup_iso_safe_charsets (attrs)
3011      Lisp_Object attrs;
3012 {
3013   Lisp_Object charset_list, safe_charsets;
3014   Lisp_Object request;
3015   Lisp_Object reg_usage;
3016   Lisp_Object tail;
3017   int reg94, reg96;
3018   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
3019   int max_charset_id;
3020
3021   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3022   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
3023       && ! EQ (charset_list, Viso_2022_charset_list))
3024     {
           /* Switch to the current full list and drop the table that was
              built for the old one.  */
3025       CODING_ATTR_CHARSET_LIST (attrs)
3026         = charset_list = Viso_2022_charset_list;
3027       ASET (attrs, coding_attr_safe_charsets, Qnil);
3028     }
3029
3030   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3031     return;
3032
       /* The table needs one byte for every id up to the largest one in
          the list.  */
3033   max_charset_id = 0;
3034   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3035     {
3036       int id = XINT (XCAR (tail));
3037       if (max_charset_id < id)
3038         max_charset_id = id;
3039     }
3040
3041   safe_charsets = make_uninit_string (max_charset_id + 1);
3042   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3043   request = AREF (attrs, coding_attr_iso_request);
       /* The usage attribute is a cons: its car is the register taken
          here for charsets not marked iso_chars_96, its cdr the register
          for those that are.  */
3044   reg_usage = AREF (attrs, coding_attr_iso_usage);
3045   reg94 = XINT (XCAR (reg_usage));
3046   reg96 = XINT (XCDR (reg_usage));
3047
3048   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3049     {
3050       Lisp_Object id;
3051       Lisp_Object reg;
3052       struct charset *charset;
3053
3054       id = XCAR (tail);
3055       charset = CHARSET_FROM_ID (XINT (id));
           /* A register listed for this charset in the request alist
              takes precedence.  Otherwise the default register for the
              charset's kind is used, but only if it is below 4; if not,
              the entry stays 255.  */
3056       reg = Fcdr (Fassq (id, request));
3057       if (! NILP (reg))
3058         SSET (safe_charsets, XINT (id), XINT (reg));
3059       else if (charset->iso_chars_96)
3060         {
3061           if (reg96 < 4)
3062             SSET (safe_charsets, XINT (id), reg96);
3063         }
3064       else
3065         {
3066           if (reg94 < 4)
3067             SSET (safe_charsets, XINT (id), reg94);
3068         }
3069     }
3070   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3071 }
3072
3073
3074 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3075    Check if a text is encoded in one of ISO-2022 based coding systems.
3076    If it is, return 1, else return 0.  */
3077
3078 static int
3079 detect_coding_iso_2022 (coding, detect_info)
3080      struct coding_system *coding;
3081      struct coding_detection_info *detect_info;
3082 {
3083   const unsigned char *src = coding->source, *src_base = src;
3084   const unsigned char *src_end = coding->source + coding->src_bytes;
3085   int multibytep = coding->src_multibyte;
3086   int single_shifting = 0;
3087   int id;
3088   int c, c1;
3089   int consumed_chars = 0;
3090   int i;
3091   int rejected = 0;
3092   int found = 0;
3093   int composition_count = -1;
3094
3095   detect_info->checked |= CATEGORY_MASK_ISO;
3096
3097   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3098     {
3099       struct coding_system *this = &(coding_categories[i]);
3100       Lisp_Object attrs, val;
3101
3102       if (this->id < 0)
3103         continue;
3104       attrs = CODING_ID_ATTRS (this->id);
3105       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3106           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3107         setup_iso_safe_charsets (attrs);
3108       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3109       this->max_charset_id = SCHARS (val) - 1;
3110       this->safe_charsets = SDATA (val);
3111     }
3112
3113   /* A coding system of this category is always ASCII compatible.  */
3114   src += coding->head_ascii;
3115
3116   while (rejected != CATEGORY_MASK_ISO)
3117     {
3118       src_base = src;
3119       ONE_MORE_BYTE (c);
3120       switch (c)
3121         {
3122         case ISO_CODE_ESC:
3123           if (inhibit_iso_escape_detection)
3124             break;
3125           single_shifting = 0;
3126           ONE_MORE_BYTE (c);
3127           if (c >= '(' && c <= '/')
3128             {
3129               /* Designation sequence for a charset of dimension 1.  */
3130               ONE_MORE_BYTE (c1);
3131               if (c1 < ' ' || c1 >= 0x80
3132                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3133                 /* Invalid designation sequence.  Just ignore.  */
3134                 break;
3135             }
3136           else if (c == '$')
3137             {
3138               /* Designation sequence for a charset of dimension 2.  */
3139               ONE_MORE_BYTE (c);
3140               if (c >= '@' && c <= 'B')
3141                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3142                 id = iso_charset_table[1][0][c];
3143               else if (c >= '(' && c <= '/')
3144                 {
3145                   ONE_MORE_BYTE (c1);
3146                   if (c1 < ' ' || c1 >= 0x80
3147                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3148                     /* Invalid designation sequence.  Just ignore.  */
3149                     break;
3150                 }
3151               else
3152                 /* Invalid designation sequence.  Just ignore it.  */
3153                 break;
3154             }
3155           else if (c == 'N' || c == 'O')
3156             {
3157               /* ESC <Fe> for SS2 or SS3.  */
3158               single_shifting = 1;
3159               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3160               break;
3161             }
3162           else if (c == '1')
3163             {
3164               /* End of composition.  */
3165               if (composition_count < 0
3166                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3167                 /* Invalid */
3168                 break;
3169               composition_count = -1;
3170               found |= CATEGORY_MASK_ISO;
3171             }
3172           else if (c >= '0' && c <= '4')
3173             {
3174               /* ESC <Fp> for start/end composition.  */
3175               composition_count = 0;
3176               break;
3177             }
3178           else
3179             {
3180               /* Invalid escape sequence.  Just ignore it.  */
3181               break;
3182             }
3183
3184           /* We found a valid designation sequence for CHARSET.  */
3185           rejected |= CATEGORY_MASK_ISO_8BIT;
3186           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3187                               id))
3188             found |= CATEGORY_MASK_ISO_7;
3189           else
3190             rejected |= CATEGORY_MASK_ISO_7;
3191           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3192                               id))
3193             found |= CATEGORY_MASK_ISO_7_TIGHT;
3194           else
3195             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3196           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3197                               id))
3198             found |= CATEGORY_MASK_ISO_7_ELSE;
3199           else
3200             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3201           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3202                               id))
3203             found |= CATEGORY_MASK_ISO_8_ELSE;
3204           else
3205             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3206           break;
3207
3208         case ISO_CODE_SO:
3209         case ISO_CODE_SI:
3210           /* Locking shift out/in.  */
3211           if (inhibit_iso_escape_detection)
3212             break;
3213           single_shifting = 0;
3214           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3215           break;
3216
3217         case ISO_CODE_CSI:
3218           /* Control sequence introducer.  */
3219           single_shifting = 0;
3220           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3221           found |= CATEGORY_MASK_ISO_8_ELSE;
3222           goto check_extra_latin;
3223
3224         case ISO_CODE_SS2:
3225         case ISO_CODE_SS3:
3226           /* Single shift.   */
3227           if (inhibit_iso_escape_detection)
3228             break;
3229           single_shifting = 0;
3230           rejected |= CATEGORY_MASK_ISO_7BIT;
3231           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3232               & CODING_ISO_FLAG_SINGLE_SHIFT)
3233             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3234           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3235               & CODING_ISO_FLAG_SINGLE_SHIFT)
3236             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3237           if (single_shifting)
3238             break;
3239           goto check_extra_latin;
3240
3241         default:
3242           if (c < 0)
3243             continue;
3244           if (c < 0x80)
3245             {
3246               if (composition_count >= 0)
3247                 composition_count++;
3248               single_shifting = 0;
3249               break;
3250             }
3251           if (c >= 0xA0)
3252             {
3253               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3254               found |= CATEGORY_MASK_ISO_8_1;
3255               /* Check the length of succeeding codes of the range
3256                  0xA0..0FF.  If the byte length is even, we include
3257                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3258                  only when we are not single shifting.  */
3259               if (! single_shifting
3260                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3261                 {
3262                   int i = 1;
3263                   while (src < src_end)
3264                     {
3265                       src_base = src;
3266                       ONE_MORE_BYTE (c);
3267                       if (c < 0xA0)
3268                         {
3269                           src = src_base;
3270                           break;
3271                         }
3272                       i++;
3273                     }
3274
3275                   if (i & 1 && src < src_end)
3276                     {
3277                       rejected |= CATEGORY_MASK_ISO_8_2;
3278                       if (composition_count >= 0)
3279                         composition_count += i;
3280                     }
3281                   else
3282                     {
3283                       found |= CATEGORY_MASK_ISO_8_2;
3284                       if (composition_count >= 0)
3285                         composition_count += i / 2;
3286                     }
3287                 }
3288               break;
3289             }
3290         check_extra_latin:
3291           single_shifting = 0;
3292           if (! VECTORP (Vlatin_extra_code_table)
3293               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3294             {
3295               rejected = CATEGORY_MASK_ISO;
3296               break;
3297             }
3298           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3299               & CODING_ISO_FLAG_LATIN_EXTRA)
3300             found |= CATEGORY_MASK_ISO_8_1;
3301           else
3302             rejected |= CATEGORY_MASK_ISO_8_1;
3303           rejected |= CATEGORY_MASK_ISO_8_2;
3304         }
3305     }
3306   detect_info->rejected |= CATEGORY_MASK_ISO;
3307   return 0;
3308
3309  no_more_source:
3310   detect_info->rejected |= rejected;
3311   detect_info->found |= (found & ~rejected);
3312   return 1;
3313 }
3314
3315
/* Set designation state into CODING: record that the charset selected
   by DIM, CHARS_96 and the final byte FINAL is designated to graphic
   register REG.  `coding' is taken from the enclosing scope.

   The designation is treated as invalid when FINAL is below '0' or
   not below 128, when ISO_CHARSET_TABLE has no charset for the
   combination, or when the charset is not a safe charset of CODING.
   In that case the designation of REG is set to -2 and CHARS_96 is
   set to -1, which tells the caller that the escape sequence should
   be kept.  CHARS_96 is also set to -1 when ASCII is designated to a
   register whose previous designation was invalid.

   CHARS_96 must be a modifiable lvalue.  FINAL is expanded more than
   once, so it should be free of side effects.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id, prev;                                                       \
                                                                        \
    if ((final) < '0' || (final) >= 128                                 \
        || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
        || !SAFE_CHARSET_P (coding, id))                                \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
        break;                                                          \
      }                                                                 \
    prev = CODING_ISO_DESIGNATION (coding, reg);                        \
    /* Substitute the charset when the corresponding flag of CODING    \
       asks for it.  */                                                 \
    if (id == charset_jisx0201_roman)                                   \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          id = charset_ascii;                                           \
      }                                                                 \
    else if (id == charset_jisx0208_1978)                               \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
          id = charset_jisx0208;                                        \
      }                                                                 \
    CODING_ISO_DESIGNATION (coding, reg) = id;                          \
    /* If there was an invalid designation to REG previously, and this  \
       designation is ASCII to REG, we should keep this designation     \
       sequence.  */                                                    \
    if (prev == -2 && id == charset_ascii)                              \
      chars_96 = -1;                                                    \
  } while (0)
3348
3349
/* Handle these composition sequences (ALT: alternate char):
3351
3352    (1) relative composition: ESC 0 CHAR ... ESC 1
3353    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3354    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3355    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3356
3357    When the start sequence (ESC 0/2/3/4) is found, this annotation
3358    header is produced.
3359
3360         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3361
3362    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3363    produced until the end sequence (ESC 1) is found:
3364
3365    (1) CHAR ... CHAR
3366    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3367    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3368    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3369
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3372
3373    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3374    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3375    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3376    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3377
3378    If an error is found while composing, the annotation header is
3379    changed to:
3380
        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3382
3383    and the sequence [ -2 DECODED-RULE ] is changed to the original
3384    byte sequence as below:
3385         o the original byte sequence is B: [ B -1 ]
3386         o the original byte sequence is B1 B2: [ B1 B2 ]
3387    and the sequence [ -1 -1 ] is changed to the original byte
3388    sequence:
3389         [ ESC '0' ]
3390 */
3391
/* Decode the composition rule whose first byte is C1 (a variable of
   the enclosing scope), fetching one more byte from the source when
   the rule is in the two-byte format.  Set RULE to the encoded
   composition rule and NBYTES to the number of bytes the rule
   occupies.  If the rule is invalid, RULE is left with a negative
   value; NBYTES is not set at all when C1 is below 32.  */

#define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int b2;                                                         \
                                                                        \
        ONE_MORE_BYTE (b2);                                             \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, b2 - 32);            \
        if (rule >= 0)                                                  \
          rule += 0x100;   /* to distinguish it from the old format */  \
        nbytes = 2;                                                     \
      }                                                                 \
    else if (rule >= 0)         /* old format (before ver.21) */        \
      {                                                                 \
        int quot = (rule) / 9, rem = (rule) % 9;                        \
                                                                        \
        /* The old format uses 4 where the encoded rule uses 10.  */    \
        rule = COMPOSITION_ENCODE_RULE ((quot == 4 ? 10 : quot),        \
                                        (rem == 4 ? 10 : rem));         \
        nbytes = 1;                                                     \
      }                                                                 \
  } while (0)
3422
/* Turn the encoded composition rule RULE back into the bytes it was
   decoded from, storing them at charbuf[idx] and charbuf[idx + 1]:
     o old (one-byte) format, RULE < 0x100: [ B -1 ], and new_chars
       is incremented by 1;
     o new (two-byte) format: [ B1 B2 ], and new_chars is incremented
       by 2.
   `charbuf', `idx' and `new_chars' are taken from the enclosing
   scope.  RULE is expanded more than once, so it should be free of
   side effects; it is fully read before charbuf is written, so
   passing charbuf[idx + 1] is fine.  */
#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int gref = ((rule) % 0x100) / 12, nref = ((rule) % 0x100) % 12;     \
                                                                        \
    if ((rule) < 0x100)         /* old format */                        \
      {                                                                 \
        /* The old format uses 4 where the encoded rule uses 10.  */    \
        if (gref == 10) gref = 4;                                       \
        if (nref == 10) nref = 4;                                       \
        charbuf[idx] = 32 + gref * 9 + nref;                            \
        charbuf[idx + 1] = -1;                                          \
        new_chars++;                                                    \
      }                                                                 \
    else                        /* new format */                        \
      {                                                                 \
        charbuf[idx] = 32 + 81 + gref;                                  \
        charbuf[idx + 1] = 32 + nref;                                   \
        new_chars += 2;                                                 \
      }                                                                 \
  } while (0)
3442
3443 /* Finish the current composition as invalid.  */
3444
3445 static int finish_composition P_ ((int *, struct composition_status *));
3446
3447 static int
3448 finish_composition (charbuf, cmp_status)
3449      int *charbuf;
3450      struct composition_status *cmp_status;
3451 {
3452   int idx = - cmp_status->length;
3453   int new_chars;
3454
3455   /* Recover the original ESC sequence */
3456   charbuf[idx++] = ISO_CODE_ESC;
3457   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3458                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3459                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3460                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3461                     : '4');
3462   charbuf[idx++] = -2;
3463   charbuf[idx++] = 0;
3464   charbuf[idx++] = -1;
3465   new_chars = cmp_status->nchars;
3466   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3467     for (; idx < 0; idx++)
3468       {
3469         int elt = charbuf[idx];
3470
3471         if (elt == -2)
3472           {
3473             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3474             idx++;
3475           }
3476         else if (elt == -1)
3477           {
3478             charbuf[idx++] = ISO_CODE_ESC;
3479             charbuf[idx] = '0';
3480             new_chars += 2;
3481           }
3482       }
3483   cmp_status->state = COMPOSING_NO;
3484   return new_chars;
3485 }
3486
/* If characters are under composition, finish the composition as
   invalid and add the value finish_composition returns to
   char_offset.  `cmp_status', `charbuf' and `char_offset' are taken
   from the enclosing scope.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3493
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte following ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   When ESC 0 arrives while the alternate chars of an ESC 3 or ESC 4
   composition are being read, it only separates the ALTs from the
   CHARs: [ -1 -1 ] is produced and the state becomes COMPOSING_CHAR.

   Otherwise any composition in progress is finished as invalid, a
   new one is started, and this annotation sequence is produced now:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   `cmp_status', `charbuf', `coding' and `char_offset' are taken from
   the enclosing scope.  C1 is expanded more than once, so it should
   be free of side effects.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if ((c1) == '0'                                                        \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        *charbuf++ = -1;                                                   \
        *charbuf++ = -1;                                                   \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        cmp_status->method = ((c1) == '0' ? COMPOSITION_RELATIVE           \
                              : (c1) == '2' ? COMPOSITION_WITH_RULE        \
                              : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS    \
                              : COMPOSITION_WITH_RULE_ALTCHARS);           \
        /* ESC 3 and ESC 4 begin with the alternate chars.  */             \
        cmp_status->state                                                  \
          = ((c1) <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);     \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3534
3535
/* Handle composition end sequence ESC 1.

   The sequence is invalid when no CHAR has been read, or when the
   composition stops at the wrong kind of element: in state
   COMPOSING_CHAR for COMPOSITION_WITH_RULE, or in any other state
   for the remaining methods.  In that case the composition is
   finished as invalid and control jumps to `invalid_code'.

   Otherwise LENGTH and NCHARS of the annotation header, which starts
   cmp_status->length elements before charbuf, are updated,
   char_offset is advanced by the number of CHARs, and the state
   becomes COMPOSING_NO.  `cmp_status', `charbuf' and `char_offset'
   are taken from the enclosing scope.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int in_char_state = cmp_status->state == COMPOSING_CHAR;            \
    int with_rule = cmp_status->method == COMPOSITION_WITH_RULE;        \
    int *cmp_head;                                                      \
                                                                        \
    if (cmp_status->nchars == 0 || in_char_state == with_rule)          \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    cmp_head = charbuf - cmp_status->length;                            \
    /* LENGTH is stored negated, so growing it means subtracting.  */   \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_ALTCHARS:                                   \
        cmp_head[0] -= cmp_status->ncomps + 2;                          \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        cmp_head[0] -= cmp_status->ncomps * 3;                          \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
    cmp_head[2] = cmp_status->nchars;                                   \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3555
/* Store a composition rule RULE in charbuf as the pair [ -2 RULE ],
   add 2 to cmp_status->length and step cmp_status->state back by
   one.  `charbuf' and `cmp_status' are taken from the enclosing
   scope.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = rule;                  \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state--;                \
  } while (0)
3565
/* Store a composed char or a component char C in charbuf, and update
   cmp_status: the length grows by one, and C is counted in nchars
   when the state is COMPOSING_CHAR, in ncomps otherwise.  When a
   rule has to follow the character (rulebase composition, or the
   component part of an alt&rule composition), the state is advanced
   by one.  `charbuf' and `cmp_status' are taken from the enclosing
   scope.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_RULE:                                       \
        cmp_status->state++;                                            \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        if (cmp_status->state == COMPOSING_COMPONENT_CHAR)              \
          cmp_status->state++;                                          \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
  } while (0)
3582
3583
3584 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3585
3586 static void
3587 decode_coding_iso_2022 (coding)
3588      struct coding_system *coding;
3589 {
3590   const unsigned char *src = coding->source + coding->consumed;
3591   const unsigned char *src_end = coding->source + coding->src_bytes;
3592   const unsigned char *src_base;
3593   int *charbuf = coding->charbuf + coding->charbuf_used;
3594   /* We may produce two annocations (charset and composition) in one
3595      loop and one more charset annocation at the end.  */
3596   int *charbuf_end
3597     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3598   int consumed_chars = 0, consumed_chars_base;
3599   int multibytep = coding->src_multibyte;
3600   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3601   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3602   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3603   int charset_id_2, charset_id_3;
3604   struct charset *charset;
3605   int c;
3606   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3607   Lisp_Object attrs, charset_list;
3608   int char_offset = coding->produced_char;
3609   int last_offset = char_offset;
3610   int last_id = charset_ascii;
3611   int eol_crlf =
3612     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3613   int byte_after_cr = -1;
3614   int i;
3615
3616   CODING_GET_INFO (coding, attrs, charset_list);
3617   setup_iso_safe_charsets (attrs);
3618   /* Charset list may have been changed.  */
3619   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3620   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3621
3622   if (cmp_status->state != COMPOSING_NO)
3623     {
3624       for (i = 0; i < cmp_status->length; i++)
3625         *charbuf++ = cmp_status->carryover[i];
3626       coding->annotated = 1;
3627     }
3628
3629   while (1)
3630     {
3631       int c1, c2, c3;
3632
3633       src_base = src;
3634       consumed_chars_base = consumed_chars;
3635
3636       if (charbuf >= charbuf_end)
3637         {
3638           if (byte_after_cr >= 0)
3639             src_base--;
3640           break;
3641         }
3642
3643       if (byte_after_cr >= 0)
3644         c1 = byte_after_cr, byte_after_cr = -1;
3645       else
3646         ONE_MORE_BYTE (c1);
3647       if (c1 < 0)
3648         goto invalid_code;
3649
3650       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3651         {
3652           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3653           char_offset++;
3654           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3655           continue;
3656         }
3657
3658       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3659         {
3660           if (c1 == ISO_CODE_ESC)
3661             {
3662               if (src + 1 >= src_end)
3663                 goto no_more_source;
3664               *charbuf++ = ISO_CODE_ESC;
3665               char_offset++;
3666               if (src[0] == '%' && src[1] == '@')
3667                 {
3668                   src += 2;
3669                   consumed_chars += 2;
3670                   char_offset += 2;
3671                   /* We are sure charbuf can contain two more chars. */
3672                   *charbuf++ = '%';
3673                   *charbuf++ = '@';
3674                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3675                 }
3676             }
3677           else
3678             {
3679               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3680               char_offset++;
3681             }
3682           continue;
3683         }
3684
3685       if ((cmp_status->state == COMPOSING_RULE
3686            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3687           && c1 != ISO_CODE_ESC)
3688         {
3689           int rule, nbytes;
3690
3691           DECODE_COMPOSITION_RULE (rule, nbytes);
3692           if (rule < 0)
3693             goto invalid_code;
3694           STORE_COMPOSITION_RULE (rule);
3695           continue;
3696         }
3697
3698       /* We produce at most one character.  */
3699       switch (iso_code_class [c1])
3700         {
3701         case ISO_0x20_or_0x7F:
3702           if (charset_id_0 < 0
3703               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3704             /* This is SPACE or DEL.  */
3705             charset = CHARSET_FROM_ID (charset_ascii);
3706           else
3707             charset = CHARSET_FROM_ID (charset_id_0);
3708           break;
3709
3710         case ISO_graphic_plane_0:
3711           if (charset_id_0 < 0)
3712             charset = CHARSET_FROM_ID (charset_ascii);
3713           else
3714             charset = CHARSET_FROM_ID (charset_id_0);
3715           break;
3716
3717         case ISO_0xA0_or_0xFF:
3718           if (charset_id_1 < 0
3719               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3720               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3721             goto invalid_code;
3722           /* This is a graphic character, we fall down ... */
3723
3724         case ISO_graphic_plane_1:
3725           if (charset_id_1 < 0)
3726             goto invalid_code;
3727           charset = CHARSET_FROM_ID (charset_id_1);
3728           break;
3729
3730         case ISO_control_0:
3731           if (eol_crlf && c1 == '\r')
3732             ONE_MORE_BYTE (byte_after_cr);
3733           MAYBE_FINISH_COMPOSITION ();
3734           charset = CHARSET_FROM_ID (charset_ascii);
3735           break;
3736
3737         case ISO_control_1:
3738           goto invalid_code;
3739
3740         case ISO_shift_out:
3741           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3742               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3743             goto invalid_code;
3744           CODING_ISO_INVOCATION (coding, 0) = 1;
3745           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3746           continue;
3747
3748         case ISO_shift_in:
3749           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3750             goto invalid_code;
3751           CODING_ISO_INVOCATION (coding, 0) = 0;
3752           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3753           continue;
3754
3755         case ISO_single_shift_2_7:
3756           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3757             goto invalid_code;
3758         case ISO_single_shift_2:
3759           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3760             goto invalid_code;
3761           /* SS2 is handled as an escape sequence of ESC 'N' */
3762           c1 = 'N';
3763           goto label_escape_sequence;
3764
3765         case ISO_single_shift_3:
3766           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3767             goto invalid_code;
3768           /* SS2 is handled as an escape sequence of ESC 'O' */
3769           c1 = 'O';
3770           goto label_escape_sequence;
3771
3772         case ISO_control_sequence_introducer:
3773           /* CSI is handled as an escape sequence of ESC '[' ...  */
3774           c1 = '[';
3775           goto label_escape_sequence;
3776
3777         case ISO_escape:
3778           ONE_MORE_BYTE (c1);
3779         label_escape_sequence:
3780           /* Escape sequences handled here are invocation,
3781              designation, direction specification, and character
3782              composition specification.  */
3783           switch (c1)
3784             {
3785             case '&':           /* revision of following character set */
3786               ONE_MORE_BYTE (c1);
3787               if (!(c1 >= '@' && c1 <= '~'))
3788                 goto invalid_code;
3789               ONE_MORE_BYTE (c1);
3790               if (c1 != ISO_CODE_ESC)
3791                 goto invalid_code;
3792               ONE_MORE_BYTE (c1);
3793               goto label_escape_sequence;
3794
3795             case '$':           /* designation of 2-byte character set */
3796               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3797                 goto invalid_code;
3798               {
3799                 int reg, chars96;
3800
3801                 ONE_MORE_BYTE (c1);
3802                 if (c1 >= '@' && c1 <= 'B')
3803                   {     /* designation of JISX0208.1978, GB2312.1980,
3804                            or JISX0208.1980 */
3805                     reg = 0, chars96 = 0;
3806                   }
3807                 else if (c1 >= 0x28 && c1 <= 0x2B)
3808                   { /* designation of DIMENSION2_CHARS94 character set */
3809                     reg = c1 - 0x28, chars96 = 0;
3810                     ONE_MORE_BYTE (c1);
3811                   }
3812                 else if (c1 >= 0x2C && c1 <= 0x2F)
3813                   { /* designation of DIMENSION2_CHARS96 character set */
3814                     reg = c1 - 0x2C, chars96 = 1;
3815                     ONE_MORE_BYTE (c1);
3816                   }
3817                 else
3818                   goto invalid_code;
3819                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3820                 /* We must update these variables now.  */
3821                 if (reg == 0)
3822                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3823                 else if (reg == 1)
3824                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3825                 if (chars96 < 0)
3826                   goto invalid_code;
3827               }
3828               continue;
3829
3830             case 'n':           /* invocation of locking-shift-2 */
3831               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3832                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3833                 goto invalid_code;
3834               CODING_ISO_INVOCATION (coding, 0) = 2;
3835               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3836               continue;
3837
3838             case 'o':           /* invocation of locking-shift-3 */
3839               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3840                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3841                 goto invalid_code;
3842               CODING_ISO_INVOCATION (coding, 0) = 3;
3843               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3844               continue;
3845
3846             case 'N':           /* invocation of single-shift-2 */
3847               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3848                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3849                 goto invalid_code;
3850               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3851               if (charset_id_2 < 0)
3852                 charset = CHARSET_FROM_ID (charset_ascii);
3853               else
3854                 charset = CHARSET_FROM_ID (charset_id_2);
3855               ONE_MORE_BYTE (c1);
3856               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3857                 goto invalid_code;
3858               break;
3859
3860             case 'O':           /* invocation of single-shift-3 */
3861               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3862                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3863                 goto invalid_code;
3864               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3865               if (charset_id_3 < 0)
3866                 charset = CHARSET_FROM_ID (charset_ascii);
3867               else
3868                 charset = CHARSET_FROM_ID (charset_id_3);
3869               ONE_MORE_BYTE (c1);
3870               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3871                 goto invalid_code;
3872               break;
3873
3874             case '0': case '2': case '3': case '4': /* start composition */
3875               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3876                 goto invalid_code;
3877               if (last_id != charset_ascii)
3878                 {
3879                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3880                   last_id = charset_ascii;
3881                   last_offset = char_offset;
3882                 }
3883               DECODE_COMPOSITION_START (c1);
3884               continue;
3885
3886             case '1':           /* end composition */
3887               if (cmp_status->state == COMPOSING_NO)
3888                 goto invalid_code;
3889               DECODE_COMPOSITION_END ();
3890               continue;
3891
3892             case '[':           /* specification of direction */
3893               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3894                 goto invalid_code;
3895               /* For the moment, nested direction is not supported.
3896                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3897                  left-to-right, and nozero means right-to-left.  */
3898               ONE_MORE_BYTE (c1);
3899               switch (c1)
3900                 {
3901                 case ']':       /* end of the current direction */
3902                   coding->mode &= ~CODING_MODE_DIRECTION;
3903
3904                 case '0':       /* end of the current direction */
3905                 case '1':       /* start of left-to-right direction */
3906                   ONE_MORE_BYTE (c1);
3907                   if (c1 == ']')
3908                     coding->mode &= ~CODING_MODE_DIRECTION;
3909                   else
3910                     goto invalid_code;
3911                   break;
3912
3913                 case '2':       /* start of right-to-left direction */
3914                   ONE_MORE_BYTE (c1);
3915                   if (c1 == ']')
3916                     coding->mode |= CODING_MODE_DIRECTION;
3917                   else
3918                     goto invalid_code;
3919                   break;
3920
3921                 default:
3922                   goto invalid_code;
3923                 }
3924               continue;
3925
3926             case '%':
3927               ONE_MORE_BYTE (c1);
3928               if (c1 == '/')
3929                 {
3930                   /* CTEXT extended segment:
3931                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3932                      We keep these bytes as is for the moment.
3933                      They may be decoded by post-read-conversion.  */
3934                   int dim, M, L;
3935                   int size;
3936
3937                   ONE_MORE_BYTE (dim);
3938                   if (dim < '0' || dim > '4')
3939                     goto invalid_code;
3940                   ONE_MORE_BYTE (M);
3941                   if (M < 128)
3942                     goto invalid_code;
3943                   ONE_MORE_BYTE (L);
3944                   if (L < 128)
3945                     goto invalid_code;
3946                   size = ((M - 128) * 128) + (L - 128);
3947                   if (charbuf + 6 > charbuf_end)
3948                     goto break_loop;
3949                   *charbuf++ = ISO_CODE_ESC;
3950                   *charbuf++ = '%';
3951                   *charbuf++ = '/';
3952                   *charbuf++ = dim;
3953                   *charbuf++ = BYTE8_TO_CHAR (M);
3954                   *charbuf++ = BYTE8_TO_CHAR (L);
3955                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3956                 }
3957               else if (c1 == 'G')
3958                 {
3959                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3960                      ESC % G --UTF-8-BYTES-- ESC % @
3961                      We keep these bytes as is for the moment.
3962                      They may be decoded by post-read-conversion.  */
3963                   if (charbuf + 3 > charbuf_end)
3964                     goto break_loop;
3965                   *charbuf++ = ISO_CODE_ESC;
3966                   *charbuf++ = '%';
3967                   *charbuf++ = 'G';
3968                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3969                 }
3970               else
3971                 goto invalid_code;
3972               continue;
3973               break;
3974
3975             default:
3976               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3977                 goto invalid_code;
3978               {
3979                 int reg, chars96;
3980
3981                 if (c1 >= 0x28 && c1 <= 0x2B)
3982                   { /* designation of DIMENSION1_CHARS94 character set */
3983                     reg = c1 - 0x28, chars96 = 0;
3984                     ONE_MORE_BYTE (c1);
3985                   }
3986                 else if (c1 >= 0x2C && c1 <= 0x2F)
3987                   { /* designation of DIMENSION1_CHARS96 character set */
3988                     reg = c1 - 0x2C, chars96 = 1;
3989                     ONE_MORE_BYTE (c1);
3990                   }
3991                 else
3992                   goto invalid_code;
3993                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3994                 /* We must update these variables now.  */
3995                 if (reg == 0)
3996                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3997                 else if (reg == 1)
3998                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3999                 if (chars96 < 0)
4000                   goto invalid_code;
4001               }
4002               continue;
4003             }
4004         }
4005
4006       if (cmp_status->state == COMPOSING_NO
4007           && charset->id != charset_ascii
4008           && last_id != charset->id)
4009         {
4010           if (last_id != charset_ascii)
4011             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4012           last_id = charset->id;
4013           last_offset = char_offset;
4014         }
4015
4016       /* Now we know CHARSET and 1st position code C1 of a character.
4017          Produce a decoded character while getting 2nd and 3rd
4018          position codes C2, C3 if necessary.  */
4019       if (CHARSET_DIMENSION (charset) > 1)
4020         {
4021           ONE_MORE_BYTE (c2);
4022           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
4023               || ((c1 & 0x80) != (c2 & 0x80)))
4024             /* C2 is not in a valid range.  */
4025             goto invalid_code;
4026           if (CHARSET_DIMENSION (charset) == 2)
4027             c1 = (c1 << 8) | c2;
4028           else
4029             {
4030               ONE_MORE_BYTE (c3);
4031               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
4032                   || ((c1 & 0x80) != (c3 & 0x80)))
4033                 /* C3 is not in a valid range.  */
4034                 goto invalid_code;
4035               c1 = (c1 << 16) | (c2 << 8) | c2;
4036             }
4037         }
4038       c1 &= 0x7F7F7F;
4039       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4040       if (c < 0)
4041         {
4042           MAYBE_FINISH_COMPOSITION ();
4043           for (; src_base < src; src_base++, char_offset++)
4044             {
4045               if (ASCII_BYTE_P (*src_base))
4046                 *charbuf++ = *src_base;
4047               else
4048                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4049             }
4050         }
4051       else if (cmp_status->state == COMPOSING_NO)
4052         {
4053           *charbuf++ = c;
4054           char_offset++;
4055         }
4056       else if ((cmp_status->state == COMPOSING_CHAR
4057                 ? cmp_status->nchars
4058                 : cmp_status->ncomps)
4059                >= MAX_COMPOSITION_COMPONENTS)
4060         {
4061           /* Too long composition.  */
4062           MAYBE_FINISH_COMPOSITION ();
4063           *charbuf++ = c;
4064           char_offset++;
4065         }
4066       else
4067         STORE_COMPOSITION_CHAR (c);
4068       continue;
4069
4070     invalid_code:
4071       MAYBE_FINISH_COMPOSITION ();
4072       src = src_base;
4073       consumed_chars = consumed_chars_base;
4074       ONE_MORE_BYTE (c);
4075       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4076       char_offset++;
4077       coding->errors++;
4078       continue;
4079
4080     break_loop:
4081       break;
4082     }
4083
4084  no_more_source:
4085   if (cmp_status->state != COMPOSING_NO)
4086     {
4087       if (coding->mode & CODING_MODE_LAST_BLOCK)
4088         MAYBE_FINISH_COMPOSITION ();
4089       else
4090         {
4091           charbuf -= cmp_status->length;
4092           for (i = 0; i < cmp_status->length; i++)
4093             cmp_status->carryover[i] = charbuf[i];
4094         }
4095     }
4096   else if (last_id != charset_ascii)
4097     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4098   coding->consumed_char += consumed_chars_base;
4099   coding->consumed = src_base - coding->source;
4100   coding->charbuf_used = charbuf - coding->charbuf;
4101 }
4102
4103
4104 /* ISO2022 encoding stuff.  */
4105
4106 /*
4107    It is not enough to say just "ISO2022" on encoding, we have to
4108    specify more details.  In Emacs, each coding system of ISO2022
4109    variant has the following specifications:
4110         1. Initial designation to G0 thru G3.
4111         2. Allows short-form designation?
4112         3. ASCII should be designated to G0 before control characters?
4113         4. ASCII should be designated to G0 at end of line?
4114         5. 7-bit environment or 8-bit environment?
4115         6. Use locking-shift?
4116         7. Use Single-shift?
4117    And the following two are only for Japanese:
4118         8. Use ASCII in place of JIS0201-1976-Roman?
4119         9. Use JISX0208-1983 in place of JISX0208-1978?
4120    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4121    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4122    details.
4123 */
4124
/* Emit the ISO 2022 escape sequence that designates CHARSET to graphic
   register REG, then record that designation in CODING.

   The bytes produced are, in order:
     ESC '&' <'@' + revision>   only when CODING has
                                CODING_ISO_FLAG_REVISION set and the
                                revision of CHARSET is non-negative;
     ESC;
     '$'                        only when CHARSET is not of dimension 1;
     <intermediate>             "()*+"[REG] for a 94-char set,
                                ",-./"[REG] for a 96-char set;
     <final-char of CHARSET>.
   For a multi-dimension 94-char set the intermediate byte is omitted
   (short form) when REG is 0, the final char is '@', 'A', or 'B', and
   CODING_ISO_FLAG_LONG_FORM is not set.

   REG is used as an index into a four-character table.  CHARSET, REG
   and CODING are evaluated several times.  All output goes through
   the EMIT_* macros.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    const char *intermediates                                           \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      {                                                                 \
        int revision = CHARSET_ISO_REVISION (charset);                  \
                                                                        \
        if (revision >= 0)                                              \
          {                                                             \
            EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                   \
            EMIT_ONE_BYTE ('@' + revision);                             \
          }                                                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Only a 94-char set designated to register 0 with final      \
           char '@'..'B' may drop the intermediate byte.  */            \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || reg != 0                                                 \
            || final_char < '@' || final_char > 'B')                    \
          EMIT_ONE_ASCII_BYTE (intermediates[reg]);                     \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (intermediates[reg]);                         \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4172
4173
/* Emit the ISO 2022 single-shift-2 function and flag CODING as being
   in the middle of a single shift.  When CODING has
   CODING_ISO_FLAG_SEVEN_BITS set the two-byte escape sequence ESC 'N'
   is produced; otherwise the single control byte ISO_CODE_SS2 is.
   The companion macro ENCODE_SINGLE_SHIFT_3 does the same for
   single-shift-3.  Uses the variable `coding' of the expansion
   site.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4186
4187
/* Emit the ISO 2022 single-shift-3 function and flag CODING as being
   in the middle of a single shift.  When CODING has
   CODING_ISO_FLAG_SEVEN_BITS set the two-byte escape sequence ESC 'O'
   is produced; otherwise the single control byte ISO_CODE_SS3 is.
   Uses the variable `coding' of the expansion site.  */

#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4196
4197
/* Locking-shift macros.  Each one emits the code for an ISO 2022
   locking-shift function and then records, in the variable `coding'
   of the expansion site, which graphic register is now invoked to
   graphic plane 0.  ENCODE_LOCKING_SHIFT_3 completes the set.  */

/* Shift-in: emit ISO_CODE_SI and mark register 0 as invoked to
   plane 0.  */
#define ENCODE_SHIFT_IN                                                 \
  do {                                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;                              \
  } while (0)


/* Shift-out: emit ISO_CODE_SO and mark register 1 as invoked to
   plane 0.  */
#define ENCODE_SHIFT_OUT                                                \
  do {                                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;                              \
  } while (0)


/* Locking-shift-2: emit ESC 'n' and mark register 2 as invoked to
   plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                                          \
  do {                                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                           \
    CODING_ISO_INVOCATION (coding, 0) = 2;                              \
  } while (0)
4221
4222
/* Locking-shift-3: emit ESC 'o' and mark register 3 as invoked to
   graphic plane 0 in the variable `coding' of the expansion site.

   The final byte must be 'o': ESC 'n' is locking-shift-2, and the
   ISO 2022 decoder in this file treats ESC 'n' as an invocation of
   register 2 and ESC 'o' as an invocation of register 3.  Emitting
   'n' here would leave the encoder believing G3 is invoked while any
   reader of the output invokes G2.  */
#define ENCODE_LOCKING_SHIFT_3                                          \
  do {                                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');                           \
    CODING_ISO_INVOCATION (coding, 0) = 3;                              \
  } while (0)
4228
4229
/* Produce the code for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are produced first if necessary.

   CHARSET must be an lvalue: when the variable `coding' of the
   expansion site has CODING_ISO_FLAG_USE_ROMAN set and CHARSET is
   ASCII, CHARSET is replaced by JISX0201-Roman.

   The body is a loop.  On each pass:
     - if a single shift is pending, C1 is emitted with the high bit
       cleared (7-bit coding) or set (otherwise), and the pending
       state is cleared;
     - else if CHARSET is invoked to graphic plane 0, C1 is emitted
       with the high bit cleared;
     - else if CHARSET is invoked to graphic plane 1, C1 is emitted
       with the high bit set;
     - else encode_invocation_designation is called to designate
       and/or invoke CHARSET, and the loop repeats.

   C1 is parenthesized wherever it is used, so an expression argument
   (e.g. `a | b' or `x ? y : z') is masked as a whole.  CHARSET and C1
   may be evaluated more than once.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4272
4273
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are needed.

   CHARSET must be an lvalue: when the variable `coding' of the
   expansion site has CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is
   JISX0208, CHARSET is replaced by JISX0208-1978.

   The body loops until one of the emitting cases applies:
     - a single shift is pending: emit both codes with the high bit
       cleared (7-bit coding) or set (otherwise) and clear the pending
       state;
     - CHARSET is invoked to graphic plane 0: emit both codes with the
       high bit cleared;
     - CHARSET is invoked to graphic plane 1: emit both codes with the
       high bit set.
   When none applies, encode_invocation_designation is called and the
   checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
        && id == charset_jisx0208)                                      \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* A pending single shift takes precedence over the planes.  */     \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is on neither plane yet: designate and/or invoke it,    \
       then go round again to emit the character itself.  */            \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4316
4317
/* Encode character C of CHARSET.  The code point of C in CHARSET is
   obtained with ENCODE_CHAR.  For a charset of dimension 1 it is
   handed whole to ENCODE_ISO_CHARACTER_DIMENSION1; for any other
   dimension it is split into (code >> 8) and the low byte, which are
   handed to ENCODE_ISO_CHARACTER_DIMENSION2.  CHARSET is passed on to
   those macros and so must satisfy their requirements too.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4327
4328
4329 /* Produce designation and invocation codes at a place pointed by DST
4330    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4331    Return new DST.  */
4332
4333 unsigned char *
4334 encode_invocation_designation (charset, coding, dst, p_nchars)
4335      struct charset *charset;
4336      struct coding_system *coding;
4337      unsigned char *dst;
4338      int *p_nchars;
4339 {
4340   int multibytep = coding->dst_multibyte;
4341   int produced_chars = *p_nchars;
4342   int reg;                      /* graphic register number */
4343   int id = CHARSET_ID (charset);
4344
4345   /* At first, check designations.  */
4346   for (reg = 0; reg < 4; reg++)
4347     if (id == CODING_ISO_DESIGNATION (coding, reg))
4348       break;
4349
4350   if (reg >= 4)
4351     {
4352       /* CHARSET is not yet designated to any graphic registers.  */
4353       /* At first check the requested designation.  */
4354       reg = CODING_ISO_REQUEST (coding, id);
4355       if (reg < 0)
4356         /* Since CHARSET requests no special designation, designate it
4357            to graphic register 0.  */
4358         reg = 0;
4359
4360       ENCODE_DESIGNATION (charset, reg, coding);
4361     }
4362
4363   if (CODING_ISO_INVOCATION (coding, 0) != reg
4364       && CODING_ISO_INVOCATION (coding, 1) != reg)
4365     {
4366       /* Since the graphic register REG is not invoked to any graphic
4367          planes, invoke it to graphic plane 0.  */
4368       switch (reg)
4369         {
4370         case 0:                 /* graphic register 0 */
4371           ENCODE_SHIFT_IN;
4372           break;
4373
4374         case 1:                 /* graphic register 1 */
4375           ENCODE_SHIFT_OUT;
4376           break;
4377
4378         case 2:                 /* graphic register 2 */
4379           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4380             ENCODE_SINGLE_SHIFT_2;
4381           else
4382             ENCODE_LOCKING_SHIFT_2;
4383           break;
4384
4385         case 3:                 /* graphic register 3 */
4386           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4387             ENCODE_SINGLE_SHIFT_3;
4388           else
4389             ENCODE_LOCKING_SHIFT_3;
4390           break;
4391         }
4392     }
4393
4394   *p_nchars = produced_chars;
4395   return dst;
4396 }
4397
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the control sequence
   introducer: the two bytes ESC '[' when the variable `coding' of the
   expansion site has CODING_ISO_FLAG_SEVEN_BITS set, the single byte
   ISO_CODE_CSI otherwise.  It is an object-like macro and takes no
   argument list.

   The flag is tested with `&', as every other flag test in this
   file is.  Comparing the whole flag word with `==' would select the
   7-bit form only for a coding system whose sole flag is
   CODING_ISO_FLAG_SEVEN_BITS.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
4407
4408
/* Emit the sequence that starts right-to-left text: the control
   sequence introducer followed by '2' ']'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro whose
   expansion is a complete `do ... while (0)'; it must be written
   without an argument list, otherwise the expansion is followed by a
   stray parenthesized expression and does not compile.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
4414
4415
/* Emit the sequence that ends the current direction (returning to
   left-to-right text): the control sequence introducer followed by
   '0' ']'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro whose
   expansion is a complete `do ... while (0)'; it must be written
   without an argument list, otherwise the expansion is followed by a
   stray parenthesized expression and does not compile.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4421
4422
/* Bring the graphic planes and registers of the variable `coding' of
   the expansion site back to their initial state, emitting the codes
   that do so:
     - a shift-in, if graphic plane 0 does not currently have register
       0 invoked;
     - for each of the four registers whose initial charset is
       non-negative and differs from the charset currently designated
       to it, a designation of that initial charset.
   Registers with a negative initial charset are left untouched.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reg);              \
        struct charset *charset;                                        \
                                                                        \
        if (initial_id < 0                                              \
            || CODING_ISO_DESIGNATION (coding, reg) == initial_id)      \
          continue;                                                     \
        charset = CHARSET_FROM_ID (initial_id);                         \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4441
4442
4443 /* Produce designation sequences of charsets in the line started from
4444    SRC to a place pointed by DST, and return updated DST.
4445
4446    If the current block ends before any end-of-line, we may fail to
4447    find all the necessary designations.  */
4448
4449 static unsigned char *
4450 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4451      struct coding_system *coding;
4452      int *charbuf, *charbuf_end;
4453      unsigned char *dst;
4454 {
4455   struct charset *charset;
4456   /* Table of charsets to be designated to each graphic register.  */
4457   int r[4];
4458   int c, found = 0, reg;
4459   int produced_chars = 0;
4460   int multibytep = coding->dst_multibyte;
4461   Lisp_Object attrs;
4462   Lisp_Object charset_list;
4463
4464   attrs = CODING_ID_ATTRS (coding->id);
4465   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4466   if (EQ (charset_list, Qiso_2022))
4467     charset_list = Viso_2022_charset_list;
4468
4469   for (reg = 0; reg < 4; reg++)
4470     r[reg] = -1;
4471
4472   while (found < 4)
4473     {
4474       int id;
4475
4476       c = *charbuf++;
4477       if (c == '\n')
4478         break;
4479       charset = char_charset (c, charset_list, NULL);
4480       id = CHARSET_ID (charset);
4481       reg = CODING_ISO_REQUEST (coding, id);
4482       if (reg >= 0 && r[reg] < 0)
4483         {
4484           found++;
4485           r[reg] = id;
4486         }
4487     }
4488
4489   if (found)
4490     {
4491       for (reg = 0; reg < 4; reg++)
4492         if (r[reg] >= 0
4493             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4494           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4495     }
4496
4497   return dst;
4498 }
4499
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the characters in CODING->charbuf as ISO 2022 and append the
   resulting bytes to CODING->destination, updating CODING->produced
   and CODING->produced_char.  Returns 0.  */

static int
encode_coding_iso_2022 (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  int safe_room = 16;
  /* Nonzero while designation sequences still have to be produced
     for the line that is about to be encoded.  */
  int bol_designation
    = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
       && CODING_ISO_BOL (coding));
  int produced_chars = 0;
  Lisp_Object attrs, eol_type, charset_list;
  int ascii_compatible;
  int c;
  /* Charset ID taken from the most recent charset annotation, or -1
     if there is no usable preference.  */
  int preferred_charset_id = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  setup_iso_safe_charsets (attrs);
  /* Charset list may have been changed.  */
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));

  /* ASCII characters may be emitted as-is only when the coding system
     is marked ASCII compatible and uses neither designation nor
     locking shift.  */
  ascii_compatible
    = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
       && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
                                          | CODING_ISO_FLAG_LOCKING_SHIFT)));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);

      if (bol_designation)
        {
          unsigned char *dst_prev = dst;

          /* We have to produce designation sequences if any now.  */
          dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
          bol_designation = 0;
          /* We are sure that designation sequences are all ASCII bytes.  */
          produced_chars += dst - dst_prev;
        }

      c = *charbuf++;

      if (c < 0)
        {
          /* Handle an annotation.  C is negative here and CHARBUF now
             points at the annotation's mask element.  */
          switch (*charbuf)
            {
            case CODING_ANNOTATE_COMPOSITION_MASK:
              /* Not yet implemented.  */
              break;
            case CODING_ANNOTATE_CHARSET_MASK:
              preferred_charset_id = charbuf[2];
              /* Ignore the preference unless that charset is a member
                 of CHARSET_LIST.  */
              if (preferred_charset_id >= 0
                  && NILP (Fmemq (make_number (preferred_charset_id),
                                  charset_list)))
                preferred_charset_id = -1;
              break;
            default:
              abort ();
            }
          /* Skip the remaining -C - 1 elements of the annotation.  */
          charbuf += -c - 1;
          continue;
        }

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          /* A newline ends the line; so does a CR when the EOL type
             is `mac'.  */
          if (c == '\n'
              || (c == '\r' && EQ (eol_type, Qmac)))
            {
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER ();
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
                {
                  int i;

                  /* Restore the recorded designation of each graphic
                     register to its initial value.  */
                  for (i = 0; i < 4; i++)
                    CODING_ISO_DESIGNATION (coding, i)
                      = CODING_ISO_INITIAL (coding, i);
                }
              /* Designations are looked for again at the next line if
                 the coding system designates at beginning of line.  */
              bol_designation
                = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
            }
          else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER ();
          EMIT_ONE_ASCII_BYTE (c);
        }
      else if (ASCII_CHAR_P (c))
        {
          if (ascii_compatible)
            EMIT_ONE_ASCII_BYTE (c);
          else
            {
              struct charset *charset = CHARSET_FROM_ID (charset_ascii);
              ENCODE_ISO_CHARACTER (charset, c);
            }
        }
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is emitted as that byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          struct charset *charset;

          /* Use the annotated charset when it contains C; otherwise
             look C up in CHARSET_LIST.  */
          if (preferred_charset_id >= 0)
            {
              charset = CHARSET_FROM_ID (preferred_charset_id);
              if (! CHAR_CHARSET_P (c, charset))
                charset = char_charset (c, charset_list, NULL);
            }
          else
            charset = char_charset (c, charset_list, NULL);
          if (!charset)
            {
              /* No charset was found for C.  In safe-encoding mode
                 encode CODING_INHIBIT_CHARACTER_SUBSTITUTION with the
                 ASCII charset; otherwise encode CODING->default_char
                 in place of C.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  charset = char_charset (c, charset_list, NULL);
                }
            }
          ENCODE_ISO_CHARACTER (charset, c);
        }
    }

  /* At the end of the last block, reset the state if the coding
     system resets at end of line.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
    {
      ASSURE_DESTINATION (safe_room);
      ENCODE_RESET_PLANE_AND_REGISTER ();
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  /* Remember whether the next block starts at a beginning of line
     that still needs designations.  */
  CODING_ISO_BOL (coding) = bol_designation;
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
4654
4655 \f
4656 /*** 8,9. SJIS and BIG5 handlers ***/
4657
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  But, in the future, they may be supported only by CCL.  */
4661
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.
4668
4669    --- CODE RANGE of SJIS ---
4670    (character set)      (range)
4671    ASCII                0x00 .. 0x7F
4672    KATAKANA-JISX0201    0xA0 .. 0xDF
4673    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4674             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4675    -------------------------------
4676
4677 */
4678
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and each of its characters is encoded in two bytes.
4682
4683    --- CODE RANGE of BIG5 ---
4684    (character set)      (range)
4685    ASCII                0x00 .. 0x7F
4686    Big5 (1st byte)      0xA1 .. 0xFE
4687         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4688    --------------------------
4689
4690   */
4691
4692 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4693    Check if a text is encoded in SJIS.  If it is, return
4694    CATEGORY_MASK_SJIS, else return 0.  */
4695
4696 static int
4697 detect_coding_sjis (coding, detect_info)
4698      struct coding_system *coding;
4699      struct coding_detection_info *detect_info;
4700 {
4701   const unsigned char *src = coding->source, *src_base;
4702   const unsigned char *src_end = coding->source + coding->src_bytes;
4703   int multibytep = coding->src_multibyte;
4704   int consumed_chars = 0;
4705   int found = 0;
4706   int c;
4707   Lisp_Object attrs, charset_list;
4708   int max_first_byte_of_2_byte_code;
4709
4710   CODING_GET_INFO (coding, attrs, charset_list);
4711   max_first_byte_of_2_byte_code
4712     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4713
4714   detect_info->checked |= CATEGORY_MASK_SJIS;
4715   /* A coding system of this category is always ASCII compatible.  */
4716   src += coding->head_ascii;
4717
4718   while (1)
4719     {
4720       src_base = src;
4721       ONE_MORE_BYTE (c);
4722       if (c < 0x80)
4723         continue;
4724       if ((c >= 0x81 && c <= 0x9F)
4725           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4726         {
4727           ONE_MORE_BYTE (c);
4728           if (c < 0x40 || c == 0x7F || c > 0xFC)
4729             break;
4730           found = CATEGORY_MASK_SJIS;
4731         }
4732       else if (c >= 0xA0 && c < 0xE0)
4733         found = CATEGORY_MASK_SJIS;
4734       else
4735         break;
4736     }
4737   detect_info->rejected |= CATEGORY_MASK_SJIS;
4738   return 0;
4739
4740  no_more_source:
4741   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4742     {
4743       detect_info->rejected |= CATEGORY_MASK_SJIS;
4744       return 0;
4745     }
4746   detect_info->found |= found;
4747   return 1;
4748 }
4749
4750 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4751    Check if a text is encoded in BIG5.  If it is, return
4752    CATEGORY_MASK_BIG5, else return 0.  */
4753
4754 static int
4755 detect_coding_big5 (coding, detect_info)
4756      struct coding_system *coding;
4757      struct coding_detection_info *detect_info;
4758 {
4759   const unsigned char *src = coding->source, *src_base;
4760   const unsigned char *src_end = coding->source + coding->src_bytes;
4761   int multibytep = coding->src_multibyte;
4762   int consumed_chars = 0;
4763   int found = 0;
4764   int c;
4765
4766   detect_info->checked |= CATEGORY_MASK_BIG5;
4767   /* A coding system of this category is always ASCII compatible.  */
4768   src += coding->head_ascii;
4769
4770   while (1)
4771     {
4772       src_base = src;
4773       ONE_MORE_BYTE (c);
4774       if (c < 0x80)
4775         continue;
4776       if (c >= 0xA1)
4777         {
4778           ONE_MORE_BYTE (c);
4779           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4780             return 0;
4781           found = CATEGORY_MASK_BIG5;
4782         }
4783       else
4784         break;
4785     }
4786   detect_info->rejected |= CATEGORY_MASK_BIG5;
4787   return 0;
4788
4789  no_more_source:
4790   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4791     {
4792       detect_info->rejected |= CATEGORY_MASK_BIG5;
4793       return 0;
4794     }
4795   detect_info->found |= found;
4796   return 1;
4797 }
4798
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   Decode SJIS text from CODING->source into CODING->charbuf, adding
   a charset annotation for each run of non-ASCII characters.  Bytes
   that do not form a valid code are stored as raw bytes and counted
   in CODING->errors.  */

static void
decode_coding_sjis (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts the characters produced so far; LAST_OFFSET
     and LAST_ID describe where the current charset run started and
     which charset it uses.  */
  int char_offset = coding->produced_char;
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_CRLF is set, or -1 if no
     such byte is pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* The charset list holds, in order, the roman, kana and kanji
     charsets, optionally followed by a second kanji charset.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Remember where this character starts so that an incomplete or
         invalid sequence can be rewound.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* Out of room.  A pending look-ahead byte has not been
             decoded yet, so step SRC_BASE back over it to leave it
             unconsumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* Read the byte following a CR right away; presumably
             ONE_MORE_BYTE jumps to `no_more_source' when the source
             is exhausted, which leaves the CR unconsumed.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* When a different non-ASCII charset starts, record an
         annotation for the run that just ended (unless that run was
         ASCII) and start a new run here.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending sequence and store only
         its first byte, as a raw byte.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Record the annotation for the final run and report how much was
     consumed, up to the start of the last incomplete character.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4920
/* Decode BIG5 text from CODING->source into CODING->charbuf, adding
   a charset annotation for each run of non-ASCII characters.  Bytes
   that do not form a valid code are stored as raw bytes and counted
   in CODING->errors.  */

static void
decode_coding_big5 (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts the characters produced so far; LAST_OFFSET
     and LAST_ID describe where the current charset run started and
     which charset it uses.  */
  int char_offset = coding->produced_char;
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_CRLF is set, or -1 if no
     such byte is pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* The charset list holds the roman charset followed by the Big5
     charset.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Remember where this character starts so that an incomplete or
         invalid sequence can be rewound.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* Out of room.  A pending look-ahead byte has not been
             decoded yet, so step SRC_BASE back over it to leave it
             unconsumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* Read the byte following a CR right away; presumably
             ONE_MORE_BYTE jumps to `no_more_source' when the source
             is exhausted, which leaves the CR unconsumed.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* When a different non-ASCII charset starts, record an
         annotation for the run that just ended (unless that run was
         ASCII) and start a new run here.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending sequence and store only
         its first byte, as a raw byte.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Record the annotation for the final run and report how much was
     consumed, up to the start of the last incomplete character.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5017
5018 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
5019    This function can encode charsets `ascii', `katakana-jisx0201',
5020    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
5021    are sure that all these charsets are registered as official charset
5022    (i.e. do not have extended leading-codes).  Characters of other
5023    charsets are produced without any encoding.  If SJIS_P is 1, encode
5024    SJIS text, else encode BIG5 text.  */
5025
5026 static int
5027 encode_coding_sjis (coding)
5028      struct coding_system *coding;
5029 {
5030   int multibytep = coding->dst_multibyte;
5031   int *charbuf = coding->charbuf;
5032   int *charbuf_end = charbuf + coding->charbuf_used;
5033   unsigned char *dst = coding->destination + coding->produced;
5034   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5035   int safe_room = 4;
5036   int produced_chars = 0;
5037   Lisp_Object attrs, charset_list, val;
5038   int ascii_compatible;
5039   struct charset *charset_roman, *charset_kanji, *charset_kana;
5040   struct charset *charset_kanji2;
5041   int c;
5042
5043   CODING_GET_INFO (coding, attrs, charset_list);
5044   val = charset_list;
5045   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5046   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5047   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5048   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
5049
5050   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5051
5052   while (charbuf < charbuf_end)
5053     {
5054       ASSURE_DESTINATION (safe_room);
5055       c = *charbuf++;
5056       /* Now encode the character C.  */
5057       if (ASCII_CHAR_P (c) && ascii_compatible)
5058         EMIT_ONE_ASCII_BYTE (c);
5059       else if (CHAR_BYTE8_P (c))
5060         {
5061           c = CHAR_TO_BYTE8 (c);
5062           EMIT_ONE_BYTE (c);
5063         }
5064       else
5065         {
5066           unsigned code;
5067           struct charset *charset = char_charset (c, charset_list, &code);
5068
5069           if (!charset)
5070             {
5071               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5072                 {
5073                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5074                   charset = CHARSET_FROM_ID (charset_ascii);
5075                 }
5076               else
5077                 {
5078                   c = coding->default_char;
5079                   charset = char_charset (c, charset_list, &code);
5080                 }
5081             }
5082           if (code == CHARSET_INVALID_CODE (charset))
5083             abort ();
5084           if (charset == charset_kanji)
5085             {
5086               int c1, c2;
5087               JIS_TO_SJIS (code);
5088               c1 = code >> 8, c2 = code & 0xFF;
5089               EMIT_TWO_BYTES (c1, c2);
5090             }
5091           else if (charset == charset_kana)
5092             EMIT_ONE_BYTE (code | 0x80);
5093           else if (charset_kanji2 && charset == charset_kanji2)
5094             {
5095               int c1, c2;
5096
5097               c1 = code >> 8;
5098               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5099                   || c1 == 0x28
5100                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5101                 {
5102                   JIS_TO_SJIS2 (code);
5103                   c1 = code >> 8, c2 = code & 0xFF;
5104                   EMIT_TWO_BYTES (c1, c2);
5105                 }
5106               else
5107                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5108             }
5109           else
5110             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5111         }
5112     }
5113   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5114   coding->produced_char += produced_chars;
5115   coding->produced = dst - coding->destination;
5116   return 0;
5117 }
5118
5119 static int
5120 encode_coding_big5 (coding)
5121      struct coding_system *coding;
5122 {
5123   int multibytep = coding->dst_multibyte;
5124   int *charbuf = coding->charbuf;
5125   int *charbuf_end = charbuf + coding->charbuf_used;
5126   unsigned char *dst = coding->destination + coding->produced;
5127   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5128   int safe_room = 4;
5129   int produced_chars = 0;
5130   Lisp_Object attrs, charset_list, val;
5131   int ascii_compatible;
5132   struct charset *charset_roman, *charset_big5;
5133   int c;
5134
5135   CODING_GET_INFO (coding, attrs, charset_list);
5136   val = charset_list;
5137   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5138   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5139   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5140
5141   while (charbuf < charbuf_end)
5142     {
5143       ASSURE_DESTINATION (safe_room);
5144       c = *charbuf++;
5145       /* Now encode the character C.  */
5146       if (ASCII_CHAR_P (c) && ascii_compatible)
5147         EMIT_ONE_ASCII_BYTE (c);
5148       else if (CHAR_BYTE8_P (c))
5149         {
5150           c = CHAR_TO_BYTE8 (c);
5151           EMIT_ONE_BYTE (c);
5152         }
5153       else
5154         {
5155           unsigned code;
5156           struct charset *charset = char_charset (c, charset_list, &code);
5157
5158           if (! charset)
5159             {
5160               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5161                 {
5162                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5163                   charset = CHARSET_FROM_ID (charset_ascii);
5164                 }
5165               else
5166                 {
5167                   c = coding->default_char;
5168                   charset = char_charset (c, charset_list, &code);
5169                 }
5170             }
5171           if (code == CHARSET_INVALID_CODE (charset))
5172             abort ();
5173           if (charset == charset_big5)
5174             {
5175               int c1, c2;
5176
5177               c1 = code >> 8, c2 = code & 0xFF;
5178               EMIT_TWO_BYTES (c1, c2);
5179             }
5180           else
5181             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5182         }
5183     }
5184   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5185   coding->produced_char += produced_chars;
5186   coding->produced = dst - coding->destination;
5187   return 0;
5188 }
5189
5190 \f
5191 /*** 10. CCL handlers ***/
5192
5193 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5194    Check if a text is encoded in a coding system of which
5195    encoder/decoder are written in CCL program.  If it is, return
5196    CATEGORY_MASK_CCL, else return 0.  */
5197
5198 static int
5199 detect_coding_ccl (coding, detect_info)
5200      struct coding_system *coding;
5201      struct coding_detection_info *detect_info;
5202 {
5203   const unsigned char *src = coding->source, *src_base;
5204   const unsigned char *src_end = coding->source + coding->src_bytes;
5205   int multibytep = coding->src_multibyte;
5206   int consumed_chars = 0;
5207   int found = 0;
5208   unsigned char *valids;
5209   int head_ascii = coding->head_ascii;
5210   Lisp_Object attrs;
5211
5212   detect_info->checked |= CATEGORY_MASK_CCL;
5213
5214   coding = &coding_categories[coding_category_ccl];
5215   valids = CODING_CCL_VALIDS (coding);
5216   attrs = CODING_ID_ATTRS (coding->id);
5217   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5218     src += head_ascii;
5219
5220   while (1)
5221     {
5222       int c;
5223
5224       src_base = src;
5225       ONE_MORE_BYTE (c);
5226       if (c < 0 || ! valids[c])
5227         break;
5228       if ((valids[c] > 1))
5229         found = CATEGORY_MASK_CCL;
5230     }
5231   detect_info->rejected |= CATEGORY_MASK_CCL;
5232   return 0;
5233
5234  no_more_source:
5235   detect_info->found |= found;
5236   return 1;
5237 }
5238
5239 static void
5240 decode_coding_ccl (coding)
5241      struct coding_system *coding;
5242 {
5243   const unsigned char *src = coding->source + coding->consumed;
5244   const unsigned char *src_end = coding->source + coding->src_bytes;
5245   int *charbuf = coding->charbuf + coding->charbuf_used;
5246   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5247   int consumed_chars = 0;
5248   int multibytep = coding->src_multibyte;
5249   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5250   int source_charbuf[1024];
5251   int source_byteidx[1025];
5252   Lisp_Object attrs, charset_list;
5253
5254   CODING_GET_INFO (coding, attrs, charset_list);
5255
5256   while (1)
5257     {
5258       const unsigned char *p = src;
5259       int i = 0;
5260
5261       if (multibytep)
5262         {
5263           while (i < 1024 && p < src_end)
5264             {
5265               source_byteidx[i] = p - src;
5266               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5267             }
5268           source_byteidx[i] = p - src;
5269         }
5270       else
5271         while (i < 1024 && p < src_end)
5272           source_charbuf[i++] = *p++;
5273
5274       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5275         ccl->last_block = 1;
5276       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5277                   charset_list);
5278       charbuf += ccl->produced;
5279       if (multibytep)
5280         src += source_byteidx[ccl->consumed];
5281       else
5282         src += ccl->consumed;
5283       consumed_chars += ccl->consumed;
5284       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5285         break;
5286     }
5287
5288   switch (ccl->status)
5289     {
5290     case CCL_STAT_SUSPEND_BY_SRC:
5291       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5292       break;
5293     case CCL_STAT_SUSPEND_BY_DST:
5294       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5295       break;
5296     case CCL_STAT_QUIT:
5297     case CCL_STAT_INVALID_CMD:
5298       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5299       break;
5300     default:
5301       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5302       break;
5303     }
5304   coding->consumed_char += consumed_chars;
5305   coding->consumed = src - coding->source;
5306   coding->charbuf_used = charbuf - coding->charbuf;
5307 }
5308
5309 static int
5310 encode_coding_ccl (coding)
5311      struct coding_system *coding;
5312 {
5313   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5314   int multibytep = coding->dst_multibyte;
5315   int *charbuf = coding->charbuf;
5316   int *charbuf_end = charbuf + coding->charbuf_used;
5317   unsigned char *dst = coding->destination + coding->produced;
5318   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5319   int destination_charbuf[1024];
5320   int i, produced_chars = 0;
5321   Lisp_Object attrs, charset_list;
5322
5323   CODING_GET_INFO (coding, attrs, charset_list);
5324   if (coding->consumed_char == coding->src_chars
5325       && coding->mode & CODING_MODE_LAST_BLOCK)
5326     ccl->last_block = 1;
5327
5328   while (charbuf < charbuf_end)
5329     {
5330       ccl_driver (ccl, charbuf, destination_charbuf,
5331                   charbuf_end - charbuf, 1024, charset_list);
5332       if (multibytep)
5333         {
5334           ASSURE_DESTINATION (ccl->produced * 2);
5335           for (i = 0; i < ccl->produced; i++)
5336             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5337         }
5338       else
5339         {
5340           ASSURE_DESTINATION (ccl->produced);
5341           for (i = 0; i < ccl->produced; i++)
5342             *dst++ = destination_charbuf[i] & 0xFF;
5343           produced_chars += ccl->produced;
5344         }
5345       charbuf += ccl->consumed;
5346       if (ccl->status == CCL_STAT_QUIT
5347           || ccl->status == CCL_STAT_INVALID_CMD)
5348         break;
5349     }
5350
5351   switch (ccl->status)
5352     {
5353     case CCL_STAT_SUSPEND_BY_SRC:
5354       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5355       break;
5356     case CCL_STAT_SUSPEND_BY_DST:
5357       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5358       break;
5359     case CCL_STAT_QUIT:
5360     case CCL_STAT_INVALID_CMD:
5361       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5362       break;
5363     default:
5364       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5365       break;
5366     }
5367
5368   coding->produced_char += produced_chars;
5369   coding->produced = dst - coding->destination;
5370   return 0;
5371 }
5372
5373
5374 \f
5375 /*** 10, 11. no-conversion handlers ***/
5376
5377 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5378
5379 static void
5380 decode_coding_raw_text (coding)
5381      struct coding_system *coding;
5382 {
5383   int eol_crlf =
5384     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5385
5386   coding->chars_at_source = 1;
5387   coding->consumed_char = coding->src_chars;
5388   coding->consumed = coding->src_bytes;
5389   if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5390     {
5391       coding->consumed_char--;
5392       coding->consumed--;
5393       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5394     }
5395   else
5396     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5397 }
5398
     /* Encode the CODING->charbuf_used elements of CODING->charbuf as raw
        text, writing the result to CODING->destination starting at offset
        CODING->produced.  On return CODING->produced is the total number
        of bytes in the destination and CODING->produced_char has been
        increased by the number of characters produced.  Always records
        CODING_RESULT_SUCCESS and returns 0.

        NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
        elsewhere in this file.  `dst' and `produced_chars' are never
        updated explicitly in the multibyte branch, so those macros
        apparently use the locals of this function by name -- check the
        macro definitions before renaming any local.  */
5399 static int
5400 encode_coding_raw_text (coding)
5401      struct coding_system *coding;
5402 {
5403   int multibytep = coding->dst_multibyte;
5404   int *charbuf = coding->charbuf;
5405   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5406   unsigned char *dst = coding->destination + coding->produced;
5407   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5408   int produced_chars = 0;
5409   int c;
5410
5411   if (multibytep)
5412     {
         /* The destination is multibyte.  */
5413       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5414
5415       if (coding->src_multibyte)
5416         while (charbuf < charbuf_end)
5417           {
5418             ASSURE_DESTINATION (safe_room);
5419             c = *charbuf++;
5420             if (ASCII_CHAR_P (c))
5421               EMIT_ONE_ASCII_BYTE (c);
5422             else if (CHAR_BYTE8_P (c))
5423               {
                   /* A raw-byte character is emitted as the byte it
                      stands for.  */
5424                 c = CHAR_TO_BYTE8 (c);
5425                 EMIT_ONE_BYTE (c);
5426               }
5427             else
5428               {
                   /* Any other character: build its multibyte form in
                      STR, then emit that form one byte at a time.  */
5429                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5430
5431                 CHAR_STRING_ADVANCE (c, p1);
5432                 while (p0 < p1)
5433                   {
5434                     EMIT_ONE_BYTE (*p0);
5435                     p0++;
5436                   }
5437               }
5438           }
5439       else
             /* Each charbuf element is emitted as a single byte.  */
5440         while (charbuf < charbuf_end)
5441           {
5442             ASSURE_DESTINATION (safe_room);
5443             c = *charbuf++;
5444             EMIT_ONE_BYTE (c);
5445           }
5446     }
5447   else
5448     {
         /* The destination is unibyte: bytes are stored through DST
            directly instead of going through the EMIT_* macros.  */
5449       if (coding->src_multibyte)
5450         {
5451           int safe_room = MAX_MULTIBYTE_LENGTH;
5452
5453           while (charbuf < charbuf_end)
5454             {
5455               ASSURE_DESTINATION (safe_room);
5456               c = *charbuf++;
5457               if (ASCII_CHAR_P (c))
5458                 *dst++ = c;
5459               else if (CHAR_BYTE8_P (c))
5460                 *dst++ = CHAR_TO_BYTE8 (c);
5461               else
5462                 CHAR_STRING_ADVANCE (c, dst);
5463             }
5464         }
5465       else
5466         {
             /* One byte per charbuf element; room for all remaining
                elements is requested at once, and the copy also stops
                at DST_END.  */
5467           ASSURE_DESTINATION (charbuf_end - charbuf);
5468           while (charbuf < charbuf_end && dst < dst_end)
5469             *dst++ = *charbuf++;
5470         }
         /* In the unibyte case the character count is the number of
            bytes written by this call.  */
5471       produced_chars = dst - (coding->destination + coding->produced);
5472     }
5473   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5474   coding->produced_char += produced_chars;
5475   coding->produced = dst - coding->destination;
5476   return 0;
5477 }
5478
5479 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5480    Check if a text is encoded in a charset-based coding system.  If it
5481    is, return 1, else return 0.  */
5482
5483 static int
5484 detect_coding_charset (coding, detect_info)
5485      struct coding_system *coding;
5486      struct coding_detection_info *detect_info;
5487 {
5488   const unsigned char *src = coding->source, *src_base;
5489   const unsigned char *src_end = coding->source + coding->src_bytes;
5490   int multibytep = coding->src_multibyte;
5491   int consumed_chars = 0;
5492   Lisp_Object attrs, valids, name;
5493   int found = 0;
5494   int head_ascii = coding->head_ascii;
5495   int check_latin_extra = 0;
5496
5497   detect_info->checked |= CATEGORY_MASK_CHARSET;
5498
5499   coding = &coding_categories[coding_category_charset];
5500   attrs = CODING_ID_ATTRS (coding->id);
5501   valids = AREF (attrs, coding_attr_charset_valids);
5502   name = CODING_ID_NAME (coding->id);
5503   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5504                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5505       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5506                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5507     check_latin_extra = 1;
5508
5509   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5510     src += head_ascii;
5511
5512   while (1)
5513     {
5514       int c;
5515       Lisp_Object val;
5516       struct charset *charset;
5517       int dim, idx;
5518
5519       src_base = src;
5520       ONE_MORE_BYTE (c);
5521       if (c < 0)
5522         continue;
5523       val = AREF (valids, c);
5524       if (NILP (val))
5525         break;
5526       if (c >= 0x80)
5527         {
5528           if (c < 0xA0
5529               && check_latin_extra
5530               && (!VECTORP (Vlatin_extra_code_table)
5531                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5532             break;
5533           found = CATEGORY_MASK_CHARSET;
5534         }
5535       if (INTEGERP (val))
5536         {
5537           charset = CHARSET_FROM_ID (XFASTINT (val));
5538           dim = CHARSET_DIMENSION (charset);
5539           for (idx = 1; idx < dim; idx++)
5540             {
5541               if (src == src_end)
5542                 goto too_short;
5543               ONE_MORE_BYTE (c);
5544               if (c < charset->code_space[(dim - 1 - idx) * 2]
5545                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5546                 break;
5547             }
5548           if (idx < dim)
5549             break;
5550         }
5551       else
5552         {
5553           idx = 1;
5554           for (; CONSP (val); val = XCDR (val))
5555             {
5556               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5557               dim = CHARSET_DIMENSION (charset);
5558               while (idx < dim)
5559                 {
5560                   if (src == src_end)
5561                     goto too_short;
5562                   ONE_MORE_BYTE (c);
5563                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5564                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5565                     break;
5566                   idx++;
5567                 }
5568               if (idx == dim)
5569                 {
5570                   val = Qnil;
5571                   break;
5572                 }
5573             }
5574           if (CONSP (val))
5575             break;
5576         }
5577     }
5578  too_short:
5579   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5580   return 0;
5581
5582  no_more_source:
5583   detect_info->found |= found;
5584   return 1;
5585 }
5586
     /* Decode the bytes of CODING->source, starting at offset
        CODING->consumed, according to the charset-based coding system
        CODING.  Decoded characters are appended to CODING->charbuf after
        its first CODING->charbuf_used elements.  On return
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        are updated.

        A byte sequence that cannot be decoded is handled at
        `invalid_code': only its first byte is consumed, it is stored in
        charbuf as is (ASCII) or via BYTE8_TO_CHAR, and CODING->errors is
        incremented.

        NOTE(review): ONE_MORE_BYTE, CODING_DECODE_CHAR and
        ADD_CHARSET_DATA are macros defined elsewhere in this file.
        Nothing in this function jumps to `no_more_source' explicitly, so
        ONE_MORE_BYTE presumably does so when the source is exhausted, and
        it apparently uses several locals by name -- check the macro
        definitions before renaming anything.  */
5587 static void
5588 decode_coding_charset (coding)
5589      struct coding_system *coding;
5590 {
5591   const unsigned char *src = coding->source + coding->consumed;
5592   const unsigned char *src_end = coding->source + coding->src_bytes;
5593   const unsigned char *src_base;
5594   int *charbuf = coding->charbuf + coding->charbuf_used;
5595   /* We may produce one charset annotation in one loop and one more at
5596      the end.  */
5597   int *charbuf_end
5598     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5599   int consumed_chars = 0, consumed_chars_base;
5600   int multibytep = coding->src_multibyte;
5601   Lisp_Object attrs, charset_list, valids;
5602   int char_offset = coding->produced_char;
5603   int last_offset = char_offset;
5604   int last_id = charset_ascii;
5605   int eol_crlf =
5606     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* When EOL_CRLF is set and a CR is read, the byte following it is
          fetched at once and kept here; it becomes C on the next
          iteration.  -1 means no byte is pending.  */
5607   int byte_after_cr = -1;
5608
5609   CODING_GET_INFO (coding, attrs, charset_list);
5610   valids = AREF (attrs, coding_attr_charset_valids);
5611
5612   while (1)
5613     {
5614       int c;
5615       Lisp_Object val;
5616       struct charset *charset;
5617       int dim;
5618       int len = 1;
5619       unsigned code;
5620
           /* Remember where this character starts; these values are what
              gets reported as consumed, and what `invalid_code' rewinds
              to.  */
5621       src_base = src;
5622       consumed_chars_base = consumed_chars;
5623
5624       if (charbuf >= charbuf_end)
5625         {
               /* Out of room.  A pending BYTE_AFTER_CR has been read but
                  not decoded yet, so do not count it as consumed.  */
5626           if (byte_after_cr >= 0)
5627             src_base--;
5628           break;
5629         }
5630
5631       if (byte_after_cr >= 0)
5632         {
5633           c = byte_after_cr;
5634           byte_after_cr = -1;
5635         }
5636       else
5637         {
5638           ONE_MORE_BYTE (c);
5639           if (eol_crlf && c == '\r')
5640             ONE_MORE_BYTE (byte_after_cr);
5641         }
5642       if (c < 0)
5643         goto invalid_code;
5644       code = c;
5645
           /* VALIDS maps the leading byte to a charset ID, a list of
              charset IDs, or something else (meaning the byte is
              invalid).  */
5646       val = AREF (valids, c);
5647       if (! INTEGERP (val) && ! CONSP (val))
5648         goto invalid_code;
5649       if (INTEGERP (val))
5650         {
5651           charset = CHARSET_FROM_ID (XFASTINT (val));
5652           dim = CHARSET_DIMENSION (charset);
               /* Collect the remaining bytes of the code point, most
                  significant byte first.  */
5653           while (len < dim)
5654             {
5655               ONE_MORE_BYTE (c);
5656               code = (code << 8) | c;
5657               len++;
5658             }
5659           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5660                               charset, code, c);
5661         }
5662       else
5663         {
5664           /* VAL is a list of charset IDs.  It is assured that the
5665              list is sorted by charset dimensions (smaller one
5666              comes first).  */
5667           while (CONSP (val))
5668             {
5669               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5670               dim = CHARSET_DIMENSION (charset);
                   /* LEN and CODE carry over from the previous charset,
                      so only the additional bytes are read.  */
5671               while (len < dim)
5672                 {
5673                   ONE_MORE_BYTE (c);
5674                   code = (code << 8) | c;
5675                   len++;
5676                 }
5677               CODING_DECODE_CHAR (coding, src, src_base,
5678                                   src_end, charset, code, c);
5679               if (c >= 0)
5680                 break;
5681               val = XCDR (val);
5682             }
5683         }
5684       if (c < 0)
5685         goto invalid_code;
           /* The charset changed to a non-ASCII one: flush the pending
              annotation for the previous non-ASCII charset and start
              tracking the new one.  */
5686       if (charset->id != charset_ascii
5687           && last_id != charset->id)
5688         {
5689           if (last_id != charset_ascii)
5690             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5691           last_id = charset->id;
5692           last_offset = char_offset;
5693         }
5694
5695       *charbuf++ = c;
5696       char_offset++;
5697       continue;
5698
5699     invalid_code:
           /* Rewind to the start of the sequence and consume just one
              byte of it.  */
5700       src = src_base;
5701       consumed_chars = consumed_chars_base;
5702       ONE_MORE_BYTE (c);
5703       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5704       char_offset++;
5705       coding->errors++;
5706     }
5707
5708  no_more_source:
5709   if (last_id != charset_ascii)
5710     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5711   coding->consumed_char += consumed_chars_base;
5712   coding->consumed = src_base - coding->source;
5713   coding->charbuf_used = charbuf - coding->charbuf;
5714 }
5715
     /* Encode the CODING->charbuf_used characters in CODING->charbuf
        according to the charset-based coding system CODING, writing the
        result to CODING->destination starting at offset CODING->produced.
        On return CODING->produced and CODING->produced_char are updated.
        Always records CODING_RESULT_SUCCESS and returns 0.

        NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
        elsewhere in this file.  `dst' and `produced_chars' are never
        updated explicitly here, so those macros apparently use the
        locals of this function by name -- check the macro definitions
        before renaming any local.  */
5716 static int
5717 encode_coding_charset (coding)
5718      struct coding_system *coding;
5719 {
5720   int multibytep = coding->dst_multibyte;
5721   int *charbuf = coding->charbuf;
5722   int *charbuf_end = charbuf + coding->charbuf_used;
5723   unsigned char *dst = coding->destination + coding->produced;
5724   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5725   int safe_room = MAX_MULTIBYTE_LENGTH;
5726   int produced_chars = 0;
5727   Lisp_Object attrs, charset_list;
5728   int ascii_compatible;
5729   int c;
5730
5731   CODING_GET_INFO (coding, attrs, charset_list);
5732   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5733
5734   while (charbuf < charbuf_end)
5735     {
5736       struct charset *charset;
5737       unsigned code;
5738
5739       ASSURE_DESTINATION (safe_room);
5740       c = *charbuf++;
5741       if (ascii_compatible && ASCII_CHAR_P (c))
5742         EMIT_ONE_ASCII_BYTE (c);
5743       else if (CHAR_BYTE8_P (c))
5744         {
               /* A raw-byte character is emitted as the byte it stands
                  for.  */
5745           c = CHAR_TO_BYTE8 (c);
5746           EMIT_ONE_BYTE (c);
5747         }
5748       else
5749         {
               /* Look for a charset in CHARSET_LIST that contains C; CODE
                  receives the code point of C in that charset.  */
5750           charset = char_charset (c, charset_list, &code);
5751           if (charset)
5752             {
                   /* Emit CODE as one byte per charset dimension, most
                      significant byte first.  */
5753               if (CHARSET_DIMENSION (charset) == 1)
5754                 EMIT_ONE_BYTE (code);
5755               else if (CHARSET_DIMENSION (charset) == 2)
5756                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5757               else if (CHARSET_DIMENSION (charset) == 3)
5758                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5759               else
5760                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5761                                  (code >> 8) & 0xFF, code & 0xFF);
5762             }
5763           else
5764             {
                   /* No charset can encode C: emit a substitute, which
                      depends on whether safe encoding was requested.  */
5765               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5766                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5767               else
5768                 c = coding->default_char;
5769               EMIT_ONE_BYTE (c);
5770             }
5771         }
5772     }
5773
5774   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5775   coding->produced_char += produced_chars;
5776   coding->produced = dst - coding->destination;
5777   return 0;
5778 }
5779
5780 \f
5781 /*** 7. C library functions ***/
5782
5783 /* Setup coding context CODING from information about CODING_SYSTEM.
5784    If CODING_SYSTEM is nil, `undecided' is assumed.  If
5785    CODING_SYSTEM is invalid, signal an error.

        This fills in CODING->id, the mode and common flags, the
        safe-charsets data, the default character, and the
        detector/decoder/encoder functions selected by the type of the
        coding system, together with the type-specific state.  */
5786
5787 void
5788 setup_coding_system (coding_system, coding)
5789      Lisp_Object coding_system;
5790      struct coding_system *coding;
5791 {
5792   Lisp_Object attrs;
5793   Lisp_Object eol_type;
5794   Lisp_Object coding_type;
5795   Lisp_Object val;
5796
5797   if (NILP (coding_system))
5798     coding_system = Qundecided;
5799
5800   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5801
5802   attrs = CODING_ID_ATTRS (coding->id);
       /* With inhibit_eol_conversion set, the eol type is treated as
          `unix' whatever the coding system itself specifies.  */
5803   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5804
5805   coding->mode = 0;
5806   coding->head_ascii = -1;
       /* Base flags derived from the eol type alone.  A vector eol type
          is the case where the end-of-line format is not decided yet
          (compare coding_inherit_eol_type below).  */
5807   if (VECTORP (eol_type))
5808     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5809                             | CODING_REQUIRE_DETECTION_MASK);
5810   else if (! EQ (eol_type, Qunix))
5811     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5812                             | CODING_REQUIRE_ENCODING_MASK);
5813   else
5814     coding->common_flags = 0;
5815   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5816     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5817   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5818     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5819   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5820     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5821
       /* Defaults taken from the attributes; the iso-2022 and emacs-mule
          branches below may replace the safe-charsets data.  */
5822   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5823   coding->max_charset_id = SCHARS (val) - 1;
5824   coding->safe_charsets = SDATA (val);
5825   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5826   coding->carryover_bytes = 0;
5827
       /* Install the functions and state specific to the coding type.  */
5828   coding_type = CODING_ATTR_TYPE (attrs);
5829   if (EQ (coding_type, Qundecided))
5830     {
           /* No detector function; the raw-text decoder and encoder are
              used and detection is requested through the flag.  */
5831       coding->detector = NULL;
5832       coding->decoder = decode_coding_raw_text;
5833       coding->encoder = encode_coding_raw_text;
5834       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5835     }
5836   else if (EQ (coding_type, Qiso_2022))
5837     {
5838       int i;
5839       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5840
5841       /* Invoke graphic register 0 to plane 0.  */
5842       CODING_ISO_INVOCATION (coding, 0) = 0;
5843       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5844       CODING_ISO_INVOCATION (coding, 1)
5845         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5846       /* Setup the initial status of designation.  */
5847       for (i = 0; i < 4; i++)
5848         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5849       /* Not single shifting initially.  */
5850       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5851       /* Beginning of buffer should also be regarded as bol. */
5852       CODING_ISO_BOL (coding) = 1;
5853       coding->detector = detect_coding_iso_2022;
5854       coding->decoder = decode_coding_iso_2022;
5855       coding->encoder = encode_coding_iso_2022;
5856       if (flags & CODING_ISO_FLAG_SAFE)
5857         coding->mode |= CODING_MODE_SAFE_ENCODING;
5858       coding->common_flags
5859         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5860             | CODING_REQUIRE_FLUSHING_MASK);
5861       if (flags & CODING_ISO_FLAG_COMPOSITION)
5862         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5863       if (flags & CODING_ISO_FLAG_DESIGNATION)
5864         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5865       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5866         {
               /* NOTE(review): setup_iso_safe_charsets presumably updates
                  the safe-charsets attribute, which is why it is read
                  again here -- confirm.  */
5867           setup_iso_safe_charsets (attrs);
5868           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5869           coding->max_charset_id = SCHARS (val) - 1;
5870           coding->safe_charsets = SDATA (val);
5871         }
5872       CODING_ISO_FLAGS (coding) = flags;
5873       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5874       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5875       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5876       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5877     }
5878   else if (EQ (coding_type, Qcharset))
5879     {
5880       coding->detector = detect_coding_charset;
5881       coding->decoder = decode_coding_charset;
5882       coding->encoder = encode_coding_charset;
5883       coding->common_flags
5884         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5885     }
5886   else if (EQ (coding_type, Qutf_8))
5887     {
           /* BOM attribute: a cons selects BOM detection, t selects
              "with BOM", anything else "without BOM".  */
5888       val = AREF (attrs, coding_attr_utf_bom);
5889       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5890                                    : EQ (val, Qt) ? utf_with_bom
5891                                    : utf_without_bom);
5892       coding->detector = detect_coding_utf_8;
5893       coding->decoder = decode_coding_utf_8;
5894       coding->encoder = encode_coding_utf_8;
5895       coding->common_flags
5896         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5897       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5898         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5899     }
5900   else if (EQ (coding_type, Qutf_16))
5901     {
5902       val = AREF (attrs, coding_attr_utf_bom);
5903       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5904                                     : EQ (val, Qt) ? utf_with_bom
5905                                     : utf_without_bom);
           /* Any endian attribute other than `big' means little
              endian.  */
5906       val = AREF (attrs, coding_attr_utf_16_endian);
5907       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5908                                        : utf_16_little_endian);
5909       CODING_UTF_16_SURROGATE (coding) = 0;
5910       coding->detector = detect_coding_utf_16;
5911       coding->decoder = decode_coding_utf_16;
5912       coding->encoder = encode_coding_utf_16;
5913       coding->common_flags
5914         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5915       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5916         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5917     }
5918   else if (EQ (coding_type, Qccl))
5919     {
5920       coding->detector = detect_coding_ccl;
5921       coding->decoder = decode_coding_ccl;
5922       coding->encoder = encode_coding_ccl;
5923       coding->common_flags
5924         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5925             | CODING_REQUIRE_FLUSHING_MASK);
5926     }
5927   else if (EQ (coding_type, Qemacs_mule))
5928     {
5929       coding->detector = detect_coding_emacs_mule;
5930       coding->decoder = decode_coding_emacs_mule;
5931       coding->encoder = encode_coding_emacs_mule;
5932       coding->common_flags
5933         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5934       coding->spec.emacs_mule.full_support = 1;
5935       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5936           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5937         {
               /* Build a safe-charsets table covering every charset in
                  Vemacs_mule_charset_list: all entries start as 255 and
                  the entry of each listed charset is set to 0.  */
5938           Lisp_Object tail, safe_charsets;
5939           int max_charset_id = 0;
5940
5941           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5942                tail = XCDR (tail))
5943             if (max_charset_id < XFASTINT (XCAR (tail)))
5944               max_charset_id = XFASTINT (XCAR (tail));
5945           safe_charsets = make_uninit_string (max_charset_id + 1);
5946           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5947           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5948                tail = XCDR (tail))
5949             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5950           coding->max_charset_id = max_charset_id;
5951           coding->safe_charsets = SDATA (safe_charsets);
               /* full_support was already set to 1 above, so this
                  assignment does not change it.  */
5952           coding->spec.emacs_mule.full_support = 1;
5953         }
5954       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5955       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5956     }
5957   else if (EQ (coding_type, Qshift_jis))
5958     {
5959       coding->detector = detect_coding_sjis;
5960       coding->decoder = decode_coding_sjis;
5961       coding->encoder = encode_coding_sjis;
5962       coding->common_flags
5963         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5964     }
5965   else if (EQ (coding_type, Qbig5))
5966     {
5967       coding->detector = detect_coding_big5;
5968       coding->decoder = decode_coding_big5;
5969       coding->encoder = encode_coding_big5;
5970       coding->common_flags
5971         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5972     }
5973   else                          /* EQ (coding_type, Qraw_text) */
5974     {
5975       coding->detector = NULL;
5976       coding->decoder = decode_coding_raw_text;
5977       coding->encoder = encode_coding_raw_text;
           /* Raw text needs decoding only for eol handling, and encoding
              only when the eol format is a definite non-unix one.  */
5978       if (! EQ (eol_type, Qunix))
5979         {
5980           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5981           if (! VECTORP (eol_type))
5982             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5983         }
5984
5985     }
5986
5987   return;
5988 }
5989
5990 /* Return a list of charsets supported by CODING.  */
5991
5992 Lisp_Object
5993 coding_charset_list (coding)
5994      struct coding_system *coding;
5995 {
5996   Lisp_Object attrs, charset_list;
5997
5998   CODING_GET_INFO (coding, attrs, charset_list);
5999   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6000     {
6001       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6002
6003       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6004         charset_list = Viso_2022_charset_list;
6005     }
6006   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6007     {
6008       charset_list = Vemacs_mule_charset_list;
6009     }
6010   return charset_list;
6011 }
6012
6013
6014 /* Return a list of charsets supported by CODING-SYSTEM.  */
6015
6016 Lisp_Object
6017 coding_system_charset_list (coding_system)
6018      Lisp_Object coding_system;
6019 {
6020   int id;
6021   Lisp_Object attrs, charset_list;
6022
6023   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
6024   attrs = CODING_ID_ATTRS (id);
6025
6026   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6027     {
6028       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6029
6030       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6031         charset_list = Viso_2022_charset_list;
6032       else
6033         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6034     }
6035   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6036     {
6037       charset_list = Vemacs_mule_charset_list;
6038     }
6039   else
6040     {
6041       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6042     }
6043   return charset_list;
6044 }
6045
6046
6047 /* Return raw-text or one of its subsidiaries that has the same
6048    eol_type as CODING-SYSTEM.  */
6049
6050 Lisp_Object
6051 raw_text_coding_system (coding_system)
6052      Lisp_Object coding_system;
6053 {
6054   Lisp_Object spec, attrs;
6055   Lisp_Object eol_type, raw_text_eol_type;
6056
6057   if (NILP (coding_system))
6058     return Qraw_text;
6059   spec = CODING_SYSTEM_SPEC (coding_system);
6060   attrs = AREF (spec, 0);
6061
6062   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6063     return coding_system;
6064
6065   eol_type = AREF (spec, 2);
6066   if (VECTORP (eol_type))
6067     return Qraw_text;
6068   spec = CODING_SYSTEM_SPEC (Qraw_text);
6069   raw_text_eol_type = AREF (spec, 2);
6070   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6071           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6072           : AREF (raw_text_eol_type, 2));
6073 }
6074
6075
6076 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6077    the subsidiary that has the same eol-spec as PARENT (if it is not
6078    nil and specifies end-of-line format) or the system's setting
6079    (system_eol_type).  */
6080
6081 Lisp_Object
6082 coding_inherit_eol_type (coding_system, parent)
6083      Lisp_Object coding_system, parent;
6084 {
6085   Lisp_Object spec, eol_type;
6086
6087   if (NILP (coding_system))
6088     coding_system = Qraw_text;
6089   spec = CODING_SYSTEM_SPEC (coding_system);
6090   eol_type = AREF (spec, 2);
6091   if (VECTORP (eol_type))
6092     {
6093       Lisp_Object parent_eol_type;
6094
6095       if (! NILP (parent))
6096         {
6097           Lisp_Object parent_spec;
6098
6099           parent_spec = CODING_SYSTEM_SPEC (parent);
6100           parent_eol_type = AREF (parent_spec, 2);
6101           if (VECTORP (parent_eol_type))
6102             parent_eol_type = system_eol_type;
6103         }
6104       else
6105         parent_eol_type = system_eol_type;
6106       if (EQ (parent_eol_type, Qunix))
6107         coding_system = AREF (eol_type, 0);
6108       else if (EQ (parent_eol_type, Qdos))
6109         coding_system = AREF (eol_type, 1);
6110       else if (EQ (parent_eol_type, Qmac))
6111         coding_system = AREF (eol_type, 2);
6112     }
6113   return coding_system;
6114 }
6115
6116
6117 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6118    decided for writing to a process.  If not, complement them, and
6119    return a new coding system.  */
6120
6121 Lisp_Object
6122 complement_process_encoding_system (coding_system)
6123      Lisp_Object coding_system;
6124 {
6125   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6126   Lisp_Object spec, attrs;
6127
6128   if (NILP (coding_system))
6129     coding_system = Qundecided;
6130   spec = CODING_SYSTEM_SPEC (coding_system);
6131   attrs = AREF (spec, 0);
6132   if (! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6133     coding_base = CODING_ATTR_BASE_NAME (attrs);
6134   if (! VECTORP (AREF (spec, 2)))
6135     eol_base = coding_system;
6136
6137   if (NILP (coding_base))
6138     {
6139       /* We must decide the text-conversion part ar first.  */
6140       if (CONSP (Vdefault_process_coding_system)
6141           && ! NILP (XCDR (Vdefault_process_coding_system)))
6142         {
6143           coding_system = XCDR (Vdefault_process_coding_system);
6144           spec = CODING_SYSTEM_SPEC (coding_system);
6145           attrs = AREF (spec, 0);
6146           if (! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6147             coding_base = CODING_ATTR_BASE_NAME (attrs);
6148           if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6149             eol_base = coding_system;
6150         }
6151       if (NILP (coding_base))
6152         {
6153           coding_system = preferred_coding_system ();
6154           spec = CODING_SYSTEM_SPEC (coding_system);
6155           attrs = AREF (spec, 0);
6156           if (! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6157             coding_base = CODING_ATTR_BASE_NAME (attrs);
6158           if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6159             eol_base = coding_system;
6160         }
6161       if (NILP (coding_base))
6162         {
6163           spec = CODING_SYSTEM_SPEC (Qraw_text);
6164           attrs = AREF (spec, 0);
6165           if (! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6166             coding_base = CODING_ATTR_BASE_NAME (attrs);
6167           if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6168             eol_base = coding_system;
6169         }
6170     }
6171
6172   /* We must decide the eol-conversion part (if not yet done).  */
6173   return coding_inherit_eol_type (coding_base, eol_base);
6174 }
6175
6176
6177 /* Emacs has a mechanism to automatically detect a coding system if it
6178    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6179    it's impossible to distinguish some coding systems accurately
6180    because they use the same range of codes.  So, at first, coding
6181    systems are grouped into the following categories:
6182
6183    o coding-category-emacs-mule
6184
6185         The category for a coding system which has the same code range
6186         as Emacs' internal format.  Assigned the coding-system (Lisp
6187         symbol) `emacs-mule' by default.
6188
6189    o coding-category-sjis
6190
6191         The category for a coding system which has the same code range
6192         as SJIS.  Assigned the coding-system (Lisp
6193         symbol) `japanese-shift-jis' by default.
6194
6195    o coding-category-iso-7
6196
6197         The category for a coding system which has the same code range
6198         as ISO2022 of 7-bit environment.  This doesn't use any locking
6199         shift and single shift functions.  This can encode/decode all
6200         charsets.  Assigned the coding-system (Lisp symbol)
6201         `iso-2022-7bit' by default.
6202
6203    o coding-category-iso-7-tight
6204
6205         Same as coding-category-iso-7 except that this can
6206         encode/decode only the specified charsets.
6207
6208    o coding-category-iso-8-1
6209
6210         The category for a coding system which has the same code range
6211         as ISO2022 of 8-bit environment and graphic plane 1 used only
6212         for DIMENSION1 charset.  This doesn't use any locking shift
6213         and single shift functions.  Assigned the coding-system (Lisp
6214         symbol) `iso-latin-1' by default.
6215
6216    o coding-category-iso-8-2
6217
6218         The category for a coding system which has the same code range
6219         as ISO2022 of 8-bit environment and graphic plane 1 used only
6220         for DIMENSION2 charset.  This doesn't use any locking shift
6221         and single shift functions.  Assigned the coding-system (Lisp
6222         symbol) `japanese-iso-8bit' by default.
6223
6224    o coding-category-iso-7-else
6225
6226         The category for a coding system which has the same code range
6227         as ISO2022 of 7-bit environment but uses locking shift or
6228         single shift functions.  Assigned the coding-system (Lisp
6229         symbol) `iso-2022-7bit-lock' by default.
6230
6231    o coding-category-iso-8-else
6232
6233         The category for a coding system which has the same code range
6234         as ISO2022 of 8-bit environment but uses locking shift or
6235         single shift functions.  Assigned the coding-system (Lisp
6236         symbol) `iso-2022-8bit-ss2' by default.
6237
6238    o coding-category-big5
6239
6240         The category for a coding system which has the same code range
6241         as BIG5.  Assigned the coding-system (Lisp symbol)
6242         `cn-big5' by default.
6243
6244    o coding-category-utf-8
6245
6246         The category for a coding system which has the same code range
6247         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6248         symbol) `utf-8' by default.
6249
6250    o coding-category-utf-16-be
6251
6252         The category for a coding system in which a text has a
6253         Unicode signature (cf. Unicode Standard) in the order of BIG
6254         endian at the head.  Assigned the coding-system (Lisp symbol)
6255         `utf-16-be' by default.
6256
6257    o coding-category-utf-16-le
6258
6259         The category for a coding system in which a text has a
6260         Unicode signature (cf. Unicode Standard) in the order of
6261         LITTLE endian at the head.  Assigned the coding-system (Lisp
6262         symbol) `utf-16-le' by default.
6263
6264    o coding-category-ccl
6265
6266         The category for a coding system of which encoder/decoder is
6267         written in CCL programs.  The default value is nil, i.e., no
6268         coding system is assigned.
6269
6270    o coding-category-binary
6271
6272         The category for a coding system not categorized in any of the
6273         above.  Assigned the coding-system (Lisp symbol)
6274         `no-conversion' by default.
6275
6276    Each of them is a Lisp symbol whose value is the actual
6277    `coding-system' (also a Lisp symbol) assigned to it by a user.
6278    What Emacs actually does is detect the category of a coding system
6279    and then use the `coding-system' assigned to that category.  If
6280    Emacs can't narrow the text down to a single category, it selects
6281    the candidate category of the highest priority.  Priorities of
6282    categories are specified by a user in `coding-category-list'.
6283
6284 */
6285
6286 #define EOL_SEEN_NONE   0       /* No end-of-line sequence seen.  */
6287 #define EOL_SEEN_LF     1       /* A bare LF was seen.  */
6288 #define EOL_SEEN_CR     2       /* A CR not followed by LF was seen.  */
6289 #define EOL_SEEN_CRLF   4       /* A CR LF pair was seen.  */
6290
6291 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6292    SOURCE is encoded.  If CATEGORY is one of
6293    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6294    two-byte, else they are encoded by one-byte.
6295
        At most MAX_EOL_CHECK_COUNT end-of-lines are examined.  If both a
        lone CR and CRLF are found, the result is EOL_SEEN_CRLF (a DOS
        file with stray ^M characters); any other mixture stops the scan
        and yields EOL_SEEN_LF.

6296    Return one of EOL_SEEN_XXX.  */
6297
6298 #define MAX_EOL_CHECK_COUNT 3
6299
6300 static int
6301 detect_eol (source, src_bytes, category)
6302      const unsigned char *source;
6303      EMACS_INT src_bytes;
6304      enum coding_category category;
6305 {
6306   const unsigned char *src = source, *src_end = src + src_bytes;
6307   unsigned char c;
6308   int total  = 0;
6309   int eol_seen = EOL_SEEN_NONE;
6310
6311   if ((1 << category) & CATEGORY_MASK_UTF_16)
6312     {
           /* Offsets, within each two-byte code unit, of the most and
              least significant bytes.  */
6313       int msb, lsb;
6314
           /* Little endian stores the low byte first, so the high byte
              is at offset 1.  The categories are enumerators used as bit
              positions (see `1 << category' above), not masks, so each
              one has to be compared separately rather than OR-ed.  */
6315       msb = (category == coding_category_utf_16_le
6316              || category == coding_category_utf_16_le_nosig);
6317       lsb = 1 - msb;
6318
6319       while (src + 1 < src_end)
6320         {
6321           c = src[lsb];
6322           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6323             {
6324               int this_eol;
6325
6326               if (c == '\n')
6327                 this_eol = EOL_SEEN_LF;
                   /* A CR is part of CRLF only when a complete code unit
                      follows it and that unit is LF.  */
6328               else if (src + 3 >= src_end
6329                        || src[msb + 2] != 0
6330                        || src[lsb + 2] != '\n')
6331                 this_eol = EOL_SEEN_CR;
6332               else
6333                 {
6334                   this_eol = EOL_SEEN_CRLF;
                       /* Skip the LF unit as well.  */
6335                   src += 2;
6336                 }
6337
6338               if (eol_seen == EOL_SEEN_NONE)
6339                 /* This is the first end-of-line.  */
6340                 eol_seen = this_eol;
6341               else if (eol_seen != this_eol)
6342                 {
6343                   /* The found type is different from what found before.
6344                      Allow for stray ^M characters in DOS EOL files.  */
6345                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6346                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6347                     eol_seen = EOL_SEEN_CRLF;
6348                   else
6349                     {
6350                       eol_seen = EOL_SEEN_LF;
6351                       break;
6352                     }
6353                 }
6354               if (++total == MAX_EOL_CHECK_COUNT)
6355                 break;
6356             }
6357           src += 2;
6358         }
6359     }
6360   else
6361     {
6362       while (src < src_end)
6363         {
6364           c = *src++;
6365           if (c == '\n' || c == '\r')
6366             {
6367               int this_eol;
6368
6369               if (c == '\n')
6370                 this_eol = EOL_SEEN_LF;
6371               else if (src >= src_end || *src != '\n')
6372                 this_eol = EOL_SEEN_CR;
6373               else
6374                 this_eol = EOL_SEEN_CRLF, src++;
6375
6376               if (eol_seen == EOL_SEEN_NONE)
6377                 /* This is the first end-of-line.  */
6378                 eol_seen = this_eol;
6379               else if (eol_seen != this_eol)
6380                 {
6381                   /* The found type is different from what found before.
6382                      Allow for stray ^M characters in DOS EOL files.  */
6383                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6384                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6385                     eol_seen = EOL_SEEN_CRLF;
6386                   else
6387                     {
6388                       eol_seen = EOL_SEEN_LF;
6389                       break;
6390                     }
6391                 }
6392               if (++total == MAX_EOL_CHECK_COUNT)
6393                 break;
6394             }
6395         }
6396     }
6397   return eol_seen;
6398 }
6399
6400
6401 /* Switch CODING->id to the element of its eol-type vector selected by
6402    EOL_SEEN (LF is tested first, then CRLF, then CR) and return Qunix,
6403    Qdos or Qmac.  With no bit set, return the eol-type unchanged.  */
6404
6405 static Lisp_Object
6406 adjust_coding_eol_type (coding, eol_seen)
6407      struct coding_system *coding;
6408      int eol_seen;
6409 {
6410   Lisp_Object variants = CODING_ID_EOL_TYPE (coding->id);
6411   Lisp_Object result;
6412   int idx;
6413
6414   if (eol_seen & EOL_SEEN_LF)
6415     idx = 0, result = Qunix;
6416   else if (eol_seen & EOL_SEEN_CRLF)
6417     idx = 1, result = Qdos;
6418   else if (eol_seen & EOL_SEEN_CR)
6419     idx = 2, result = Qmac;
6420   else
6421     return variants;
6422
6423   coding->id = CODING_SYSTEM_ID (AREF (variants, idx));
6424   return result;
6425 }
6426
6427 /* Detect how a text specified in CODING is encoded.  If a coding
6428    system is detected, update fields of CODING by the detected coding
6429    system.

        Three cases are handled, depending on the coding system CODING
        currently has: type `undecided' (full detection over all
        categories), category utf-8-auto (only the signature is checked)
        and category utf-16-auto (only the byte order is checked).  Any
        other coding system is left as it is.  CODING->mode is saved on
        entry and restored on exit.  */
6430
6431 void
6432 detect_coding (coding)
6433      struct coding_system *coding;
6434 {
6435   const unsigned char *src, *src_end;
6436   int saved_mode = coding->mode;
6437
6438   coding->consumed = coding->consumed_char = 0;
6439   coding->produced = coding->produced_char = 0;
6440   coding_set_source (coding);
6441
6442   src_end = coding->source + coding->src_bytes;
6443   coding->head_ascii = 0;
6444
6445   /* If we have not yet decided the text encoding type, detect it
6446      now.  */
6447   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6448     {
6449       int c, i;
6450       struct coding_detection_info detect_info;
6451       int null_byte_found = 0, eight_bit_found = 0;
6452
6453       detect_info.checked = detect_info.found = detect_info.rejected = 0;
           /* Scan the source.  CODING->head_ascii counts the 7-bit bytes
              seen before the first 8-bit byte.  The scan also records
              whether a null byte or an 8-bit byte occurs, and stops as
              soon as both have been seen.  */
6454       for (src = coding->source; src < src_end; src++)
6455         {
6456           c = *src;
6457           if (c & 0x80)
6458             {
6459               eight_bit_found = 1;
6460               if (null_byte_found)
6461                 break;
6462             }
6463           else if (c < 0x20)
6464             {
                   /* The first ESC, SI or SO triggers ISO-2022 detection,
                      at most once (guarded by detect_info.checked).  */
6465               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6466                   && ! inhibit_iso_escape_detection
6467                   && ! detect_info.checked)
6468                 {
6469                   if (detect_coding_iso_2022 (coding, &detect_info))
6470                     {
6471                       /* We have scanned the whole data.  */
6472                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6473                         {
6474                           /* We didn't find an 8-bit code.  We may
6475                              have found a null-byte, but it's very
6476                              rare that a binary file conform to
6477                              ISO-2022.  */
6478                           src = src_end;
6479                           coding->head_ascii = src - coding->source;
6480                         }
6481                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6482                       break;
6483                     }
6484                 }
6485               else if (! c && !inhibit_null_byte_detection)
6486                 {
6487                   null_byte_found = 1;
6488                   if (eight_bit_found)
6489                     break;
6490                 }
6491               if (! eight_bit_found)
6492                 coding->head_ascii++;
6493             }
6494           else if (! eight_bit_found)
6495             coding->head_ascii++;
6496         }
6497
           /* Something other than plain 7-bit text was seen, or the
              ISO-2022 detector reported a category: pick a category in
              priority order.  Otherwise CODING is left untouched.  */
6498       if (null_byte_found || eight_bit_found
6499           || coding->head_ascii < coding->src_bytes
6500           || detect_info.found)
6501         {
6502           enum coding_category category;
6503           struct coding_system *this;
6504
6505           if (coding->head_ascii == coding->src_bytes)
6506             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6507             for (i = 0; i < coding_category_raw_text; i++)
6508               {
6509                 category = coding_priorities[i];
6510                 this = coding_categories + category;
6511                 if (detect_info.found & (1 << category))
6512                   break;
6513               }
6514           else
6515             {
6516               if (null_byte_found)
6517                 {
                       /* With a null byte present, mark every category
                          other than the UTF-16 ones as already checked
                          and rejected.  */
6518                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6519                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6520                 }
                   /* Walk the categories in priority order, running the
                      detector of each one not checked yet, and stop at
                      the first category reported as found.  */
6521               for (i = 0; i < coding_category_raw_text; i++)
6522                 {
6523                   category = coding_priorities[i];
6524                   this = coding_categories + category;
6525                   if (this->id < 0)
6526                     {
6527                       /* No coding system of this category is defined.  */
6528                       detect_info.rejected |= (1 << category);
6529                     }
6530                   else if (category >= coding_category_raw_text)
6531                     continue;
6532                   else if (detect_info.checked & (1 << category))
6533                     {
6534                       if (detect_info.found & (1 << category))
6535                         break;
6536                     }
6537                   else if ((*(this->detector)) (coding, &detect_info)
6538                            && detect_info.found & (1 << category))
6539                     {
                           /* Only CATEGORY is narrowed here; THIS still
                              points at the utf-16-auto entry used
                              below.  */
6540                       if (category == coding_category_utf_16_auto)
6541                         {
6542                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6543                             category = coding_category_utf_16_le;
6544                           else
6545                             category = coding_category_utf_16_be;
6546                         }
6547                       break;
6548                     }
6549                 }
6550             }
6551
               /* I is below coding_category_raw_text exactly when one of
                  the loops above stopped on a found category (held in
                  THIS).  Otherwise fall back, in order, to
                  no-conversion if a null byte was seen, to raw-text if
                  every category was rejected, or to the highest-priority
                  category that was not rejected.  */
6552           if (i < coding_category_raw_text)
6553             setup_coding_system (CODING_ID_NAME (this->id), coding);
6554           else if (null_byte_found)
6555             setup_coding_system (Qno_conversion, coding);
6556           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6557                    == CATEGORY_MASK_ANY)
6558             setup_coding_system (Qraw_text, coding);
6559           else if (detect_info.rejected)
6560             for (i = 0; i < coding_category_raw_text; i++)
6561               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6562                 {
6563                   this = coding_categories + coding_priorities[i];
6564                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6565                   break;
6566                 }
6567         }
6568     }
6569   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6570            == coding_category_utf_8_auto)
6571     {
6572       Lisp_Object coding_systems;
6573       struct coding_detection_info detect_info;
6574
           /* The utf_bom attribute is used here as a cons: its car is
              the coding system chosen when a UTF-8 signature is found,
              its cdr the one chosen otherwise.
              NOTE(review): detect_info.checked is not initialized in
              this branch -- confirm detect_coding_utf_8 does not read
              it.  */
6575       coding_systems
6576         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6577       detect_info.found = detect_info.rejected = 0;
6578       coding->head_ascii = 0;
6579       if (CONSP (coding_systems)
6580           && detect_coding_utf_8 (coding, &detect_info))
6581         {
6582           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6583             setup_coding_system (XCAR (coding_systems), coding);
6584           else
6585             setup_coding_system (XCDR (coding_systems), coding);
6586         }
6587     }
6588   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6589            == coding_category_utf_16_auto)
6590     {
6591       Lisp_Object coding_systems;
6592       struct coding_detection_info detect_info;
6593
           /* As above, the utf_bom attribute is used as a cons: the car
              is chosen when little endian is detected, the cdr when big
              endian is detected.  If neither is reported, CODING is
              left unchanged.  */
6594       coding_systems
6595         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6596       detect_info.found = detect_info.rejected = 0;
6597       coding->head_ascii = 0;
6598       if (CONSP (coding_systems)
6599           && detect_coding_utf_16 (coding, &detect_info))
6600         {
6601           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6602             setup_coding_system (XCAR (coding_systems), coding);
6603           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6604             setup_coding_system (XCDR (coding_systems), coding);
6605         }
6606     }
6607   coding->mode = saved_mode;
6608 }
6609
6610
     /* Convert the end-of-line format of the text just produced for
        CODING to LF.  The text is the CODING->produced bytes at
        CODING->destination, or at byte position CODING->dst_pos_byte when
        CODING->dst_object is non-nil.

        Nothing is done if the eol-type of CODING is Qunix or if
        inhibit_eol_conversion is set.  If the eol-type is still a vector
        (i.e. not decided yet), the text is scanned for LF, CR and CRLF
        and CODING is adjusted by adjust_coding_eol_type first.  For Qmac
        each CR is replaced by LF in place.  For Qdos CRs are deleted and
        CODING->produced and CODING->produced_char are decreased by the
        number of deletions.  */

6611 static void
6612 decode_eol (coding)
6613      struct coding_system *coding;
6614 {
6615   Lisp_Object eol_type;
6616   unsigned char *p, *pbeg, *pend;
6617
6618   eol_type = CODING_ID_EOL_TYPE (coding->id);
6619   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6620     return;
6621
6622   if (NILP (coding->dst_object))
6623     pbeg = coding->destination;
6624   else
6625     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6626   pend = pbeg + coding->produced;
6627
6628   if (VECTORP (eol_type))
6629     {
6630       int eol_seen = EOL_SEEN_NONE;
6631
           /* Collect, as a bit set, every end-of-line format that occurs
              in the produced text.  */
6632       for (p = pbeg; p < pend; p++)
6633         {
6634           if (*p == '\n')
6635             eol_seen |= EOL_SEEN_LF;
6636           else if (*p == '\r')
6637             {
6638               if (p + 1 < pend && *(p + 1) == '\n')
6639                 {
6640                   eol_seen |= EOL_SEEN_CRLF;
6641                   p++;
6642                 }
6643               else
6644                 eol_seen |= EOL_SEEN_CR;
6645             }
6646         }
6647       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6648       if ((eol_seen & EOL_SEEN_CRLF) != 0
6649           && (eol_seen & EOL_SEEN_CR) != 0
6650           && (eol_seen & EOL_SEEN_LF) == 0)
6651         eol_seen = EOL_SEEN_CRLF;
           /* Any other combination of more than one format is treated
              as LF.  */
6652       else if (eol_seen != EOL_SEEN_NONE
6653           && eol_seen != EOL_SEEN_LF
6654           && eol_seen != EOL_SEEN_CRLF
6655           && eol_seen != EOL_SEEN_CR)
6656         eol_seen = EOL_SEEN_LF;
6657       if (eol_seen != EOL_SEEN_NONE)
6658         eol_type = adjust_coding_eol_type (coding, eol_seen);
6659     }
6660
6661   if (EQ (eol_type, Qmac))
6662     {
6663       for (p = pbeg; p < pend; p++)
6664         if (*p == '\r')
6665           *p = '\n';
6666     }
6667   else if (EQ (eol_type, Qdos))
6668     {
           /* Number of CRs deleted.  It is subtracted from the EMACS_INT
              counters below, so it is kept in the same type.  */
6669       EMACS_INT n = 0;
6670
6671       if (NILP (coding->dst_object))
6672         {
6673           /* Start deleting '\r' from the tail to minimize the memory
6674              movement.  */
               /* NOTE(review): this branch deletes every CR except one in
                  the last byte, whereas the buffer branch below deletes
                  only a CR that is followed by LF -- confirm which is
                  intended.  */
6675           for (p = pend - 2; p >= pbeg; p--)
6676             if (*p == '\r')
6677               {
6678                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6679                 n++;
6680               }
6681         }
6682       else
6683         {
               /* Positions are kept in EMACS_INT, the type this file
                  uses for buffer positions; plain int could truncate
                  them where EMACS_INT is wider.  */
6684           EMACS_INT pos_byte = coding->dst_pos_byte;
6685           EMACS_INT pos = coding->dst_pos;
6686           EMACS_INT pos_end = pos + coding->produced_char - 1;
6687
6688           while (pos < pos_end)
6689             {
6690               p = BYTE_POS_ADDR (pos_byte);
6691               if (*p == '\r' && p[1] == '\n')
6692                 {
6693                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6694                   n++;
6695                   pos_end--;
6696                 }
6697               pos++;
6698               if (coding->dst_multibyte)
6699                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6700               else
6701                 pos_byte++;
6702             }
6703         }
6704       coding->produced -= n;
6705       coding->produced_char -= n;
6706     }
6707 }
6708
6709
6710 /* Return a translation table (or list of them) from coding system
6711    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6712    decoding (ENCODEP is zero).

        If character translation is disabled
        (Venable_character_translation is nil), return Qnil.  If ATTRS
        specifies no table, return the standard table for the requested
        direction.  Otherwise symbols are replaced by their
        `translation-table' property, and the standard table, when it is
        a char-table, is appended so that the result is a list.

        If MAX_LOOKUP is non-NULL, *MAX_LOOKUP is set to 0 when
        translation is disabled, and otherwise to the largest natural
        number found in extra slot 1 of the returned char-table(s), but
        never less than 1.  */
6713
6714 static Lisp_Object
6715 get_translation_table (attrs, encodep, max_lookup)
6716      Lisp_Object attrs;
6717      int encodep, *max_lookup;
6718 {
6719   Lisp_Object standard, translation_table;
6720   Lisp_Object val;
6721
6722   if (NILP (Venable_character_translation))
6723     {
6724       if (max_lookup)
6725         *max_lookup = 0;
6726       return Qnil;
6727     }
6728   if (encodep)
6729     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6730       standard = Vstandard_translation_table_for_encode;
6731   else
6732     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6733       standard = Vstandard_translation_table_for_decode;
6734   if (NILP (translation_table))
6735     translation_table = standard;
6736   else
6737     {
6738       if (SYMBOLP (translation_table))
6739         translation_table = Fget (translation_table, Qtranslation_table);
6740       else if (CONSP (translation_table))
6741         {
               /* Work on a copy, so that the list stored in ATTRS is not
                  modified by the XSETCAR and nconc2 calls below.  */
6742           translation_table = Fcopy_sequence (translation_table);
6743           for (val = translation_table; CONSP (val); val = XCDR (val))
6744             if (SYMBOLP (XCAR (val)))
6745               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6746         }
           /* Put the standard table last.  */
6747       if (CHAR_TABLE_P (standard))
6748         {
6749           if (CONSP (translation_table))
6750             translation_table = nconc2 (translation_table,
6751                                         Fcons (standard, Qnil));
6752           else
6753             translation_table = Fcons (translation_table,
6754                                        Fcons (standard, Qnil));
6755         }
6756     }
6757
6758   if (max_lookup)
6759     {
6760       *max_lookup = 1;
6761       if (CHAR_TABLE_P (translation_table)
6762           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6763         {
6764           val = XCHAR_TABLE (translation_table)->extras[1];
6765           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6766             *max_lookup = XFASTINT (val);
6767         }
6768       else if (CONSP (translation_table))
6769         {
               /* This VAL shadows the function-level VAL.  */
6770           Lisp_Object tail, val;
6771
6772           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6773             if (CHAR_TABLE_P (XCAR (tail))
6774                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6775               {
6776                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6777                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6778                   *max_lookup = XFASTINT (val);
6779               }
6780         }
6781     }
6782   return translation_table;
6783 }
6784
     /* Look up character C in TABLE, which is a char-table or a list of
        char-tables, and set TRANS to the result.  C and TRANS must be
        lvalues; all three arguments are evaluated more than once.

        Whenever a table maps C to a character, C is replaced by that
        character and TRANS is reset to Qnil.  With a list, the tables
        are tried in order (non-char-table elements are skipped), each
        one seeing the C left by the previous one; the loop stops at the
        first table that yields a value that is neither nil nor a
        character, and that value is left in TRANS.  */
6785 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6786   do {                                                          \
6787     trans = Qnil;                                               \
6788     if (CHAR_TABLE_P (table))                                   \
6789       {                                                         \
6790         trans = CHAR_TABLE_REF (table, c);                      \
6791         if (CHARACTERP (trans))                                 \
6792           c = XFASTINT (trans), trans = Qnil;                   \
6793       }                                                         \
6794     else if (CONSP (table))                                     \
6795       {                                                         \
6796         Lisp_Object tail;                                       \
6797                                                                 \
6798         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6799           if (CHAR_TABLE_P (XCAR (tail)))                       \
6800             {                                                   \
6801               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6802               if (CHARACTERP (trans))                           \
6803                 c = XFASTINT (trans), trans = Qnil;             \
6804               else if (! NILP (trans))                          \
6805                 break;                                          \
6806             }                                                   \
6807       }                                                         \
6808   } while (0)
6809
6810
6811 /* Resolve TRANS against the characters in [BUF, BUF_END).  An integer
6812    TRANS is returned as is.  Otherwise TRANS is a list of (FROM . TO)
6813    where FROM is a vector of characters; the entries are tried in order
6814    and the first one whose FROM matches at BUF is returned.  Return Qt if
6815    BUF_END is reached while comparing a FROM, and Qnil if none matches.  */
6816
6817 static Lisp_Object
6818 get_translation (trans, buf, buf_end)
6819      Lisp_Object trans;
6820      int *buf, *buf_end;
6821 {
6822   if (INTEGERP (trans))
6823     return trans;
6824   while (CONSP (trans))
6825     {
6826       Lisp_Object entry = XCAR (trans);
6827       Lisp_Object pattern = XCAR (entry);
6828       int npattern = ASIZE (pattern);
6829       int *p = buf;
6830       int k;
6831
6832       for (k = 0; k < npattern; k++, p++)
6833         {
6834           if (p == buf_end)
6835             return Qt;
6836           if (*p != XINT (AREF (pattern, k)))
6837             break;
6838         }
6839       if (k == npattern)
6840         return entry;
6841       trans = XCDR (trans);
6842     }
6843   return Qnil;
6844 }
6845
6846
     /* Write the characters decoded for CODING to its destination,
        starting at CODING->destination + CODING->produced.

        If CODING->chars_at_source is zero, the characters are taken from
        CODING->charbuf and passed through TRANSLATION_TABLE; negative
        elements there are annotation data and are skipped.  Otherwise the
        CODING->consumed bytes at CODING->source are transferred: verbatim
        when source and destination agree in multibyteness, and through
        ONE_MORE_BYTE or EMIT_ONE_BYTE (defined elsewhere) when they do
        not.

        LAST_BLOCK zero means more input may follow: if a translation
        would need more characters than remain in CODING->charbuf,
        processing stops there.

        CODING->produced and CODING->produced_char are advanced, and when
        the destination is a buffer and characters were produced, the new
        text is committed with insert_from_gap.  Return the number of
        elements of CODING->charbuf left unprocessed (always zero when
        CODING->chars_at_source is nonzero).  */

6847 static int
6848 produce_chars (coding, translation_table, last_block)
6849      struct coding_system *coding;
6850      Lisp_Object translation_table;
6851      int last_block;
6852 {
6853   unsigned char *dst = coding->destination + coding->produced;
6854   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6855   EMACS_INT produced;
6856   EMACS_INT produced_chars = 0;
6857   int carryover = 0;
6858
6859   if (! coding->chars_at_source)
6860     {
6861       /* Source characters are in coding->charbuf.  */
6862       int *buf = coding->charbuf;
6863       int *buf_end = buf + coding->charbuf_used;
6864
           /* When source and destination are the same object, output is
              limited to the part of the source already consumed.  */
6865       if (EQ (coding->src_object, coding->dst_object))
6866         {
6867           coding_set_source (coding);
6868           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6869         }
6870
6871       while (buf < buf_end)
6872         {
6873           int c = *buf, i;
6874
6875           if (c >= 0)
6876             {
                   /* Number of source characters consumed and of
                      characters produced by this step.  Both stay 1
                      unless a sequence translation applies.  */
6877               int from_nchars = 1, to_nchars = 1;
6878               Lisp_Object trans = Qnil;
6879
6880               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6881               if (! NILP (trans))
6882                 {
6883                   trans = get_translation (trans, buf, buf_end);
6884                   if (INTEGERP (trans))
6885                     c = XINT (trans);
6886                   else if (CONSP (trans))
6887                     {
6888                       from_nchars = ASIZE (XCAR (trans));
6889                       trans = XCDR (trans);
6890                       if (INTEGERP (trans))
6891                         c = XINT (trans);
6892                       else
6893                         {
6894                           to_nchars = ASIZE (trans);
6895                           c = XINT (AREF (trans, 0));
6896                         }
6897                     }
                       /* Qt: the buffer ended while matching.  Unless
                          this is the last block, leave the rest to be
                          carried over.  */
6898                   else if (EQ (trans, Qt) && ! last_block)
6899                     break;
6900                 }
6901
                   /* Make sure TO_NCHARS characters of maximum length
                      fit.  */
6902               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6903                 {
6904                   dst = alloc_destination (coding,
6905                                            buf_end - buf
6906                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6907                                            dst);
6908                   if (EQ (coding->src_object, coding->dst_object))
6909                     {
6910                       coding_set_source (coding);
6911                       dst_end = (((unsigned char *) coding->source)
6912                                  + coding->consumed);
6913                     }
6914                   else
6915                     dst_end = coding->destination + coding->dst_bytes;
6916                 }
6917
6918               for (i = 0; i < to_nchars; i++)
6919                 {
6920                   if (i > 0)
6921                     c = XINT (AREF (trans, i));
6922                   if (coding->dst_multibyte
6923                       || ! CHAR_BYTE8_P (c))
6924                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6925                   else
6926                     *dst++ = CHAR_TO_BYTE8 (c);
6927                 }
6928               produced_chars += to_nchars;
6929               buf += from_nchars;
6930             }
6931           else
6932             /* This is an annotation datum.  (-C) is the length.  */
6933             buf += -c;
6934         }
6935       carryover = buf_end - buf;
6936     }
6937   else
6938     {
6939       /* Source characters are at coding->source.  */
6940       const unsigned char *src = coding->source;
6941       const unsigned char *src_end = src + coding->consumed;
6942
6943       if (EQ (coding->dst_object, coding->src_object))
6944         dst_end = (unsigned char *) src;
6945       if (coding->src_multibyte != coding->dst_multibyte)
6946         {
6947           if (coding->src_multibyte)
6948             {
                   /* NOTE(review): MULTIBYTEP and CONSUMED_CHARS are not
                      used by any visible statement; presumably
                      ONE_MORE_BYTE refers to them -- confirm against the
                      macro definition.  */
6949               int multibytep = 1;
6950               EMACS_INT consumed_chars = 0;
6951
6952               while (1)
6953                 {
6954                   const unsigned char *src_base = src;
6955                   int c;
6956
6957                   ONE_MORE_BYTE (c);
6958                   if (dst == dst_end)
6959                     {
6960                       if (EQ (coding->src_object, coding->dst_object))
6961                         dst_end = (unsigned char *) src;
6962                       if (dst == dst_end)
6963                         {
                               /* alloc_destination may relocate the
                                  data, so SRC is re-derived from its
                                  offset afterwards.  */
6964                           EMACS_INT offset = src - coding->source;
6965
6966                           dst = alloc_destination (coding, src_end - src + 1,
6967                                                    dst);
6968                           dst_end = coding->destination + coding->dst_bytes;
6969                           coding_set_source (coding);
6970                           src = coding->source + offset;
6971                           src_end = coding->source + coding->src_bytes;
6972                           if (EQ (coding->src_object, coding->dst_object))
6973                             dst_end = (unsigned char *) src;
6974                         }
6975                     }
6976                   *dst++ = c;
6977                   produced_chars++;
6978                 }
                 /* The only visible exit from the loop above; presumably
                    ONE_MORE_BYTE jumps here when the source is
                    exhausted.  */
6979             no_more_source:
6980               ;
6981             }
6982           else
6983             while (src < src_end)
6984               {
6985                 int multibytep = 1;
6986                 int c = *src++;
6987
6988                 if (dst >= dst_end - 1)
6989                   {
6990                     if (EQ (coding->src_object, coding->dst_object))
6991                       dst_end = (unsigned char *) src;
6992                     if (dst >= dst_end - 1)
6993                       {
6994                         EMACS_INT offset = src - coding->source;
6995                         EMACS_INT more_bytes;
6996
6997                         if (EQ (coding->src_object, coding->dst_object))
6998                           more_bytes = ((src_end - src) / 2) + 2;
6999                         else
7000                           more_bytes = src_end - src + 2;
7001                         dst = alloc_destination (coding, more_bytes, dst);
7002                         dst_end = coding->destination + coding->dst_bytes;
7003                         coding_set_source (coding);
7004                         src = coding->source + offset;
7005                         src_end = coding->source + coding->src_bytes;
7006                         if (EQ (coding->src_object, coding->dst_object))
7007                           dst_end = (unsigned char *) src;
7008                       }
7009                   }
7010                 EMIT_ONE_BYTE (c);
7011               }
7012         }
7013       else
7014         {
               /* Same multibyteness on both sides: plain byte copy, after
                  growing the destination if it is a different object and
                  is too small.  */
7015           if (!EQ (coding->src_object, coding->dst_object))
7016             {
7017               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
7018
7019               if (require > 0)
7020                 {
7021                   EMACS_INT offset = src - coding->source;
7022
7023                   dst = alloc_destination (coding, require, dst);
7024                   coding_set_source (coding);
7025                   src = coding->source + offset;
7026                   src_end = coding->source + coding->src_bytes;
7027                 }
7028             }
7029           produced_chars = coding->consumed_char;
7030           while (src < src_end)
7031             *dst++ = *src++;
7032         }
7033     }
7034
       /* Number of bytes written by this call.  */
7035   produced = dst - (coding->destination + coding->produced);
7036   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7037     insert_from_gap (produced_chars, produced);
7038   coding->produced += produced;
7039   coding->produced_char += produced_chars;
7040   return carryover;
7041 }
7042
7043 /* Compose text in CODING->object according to the annotation data at
7044    CHARBUF.  CHARBUF is an array:
7045      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
        The composition covers the NCHARS characters starting at POS.  For
        COMPOSITION_RELATIVE no components are passed to compose_text;
        otherwise they are collected into a string if they are all
        characters, or into a vector if rules are interleaved.
7046  */
7047
7048 static INLINE void
7049 produce_composition (coding, charbuf, pos)
7050      struct coding_system *coding;
7051      int *charbuf;
7052      EMACS_INT pos;
7053 {
7054   int len;
7055   EMACS_INT to;
7056   enum composition_method method;
7057   Lisp_Object components;
7058
       /* Number of elements that follow the fixed-size header.  */
7059   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7060   to = pos + charbuf[2];
7061   method = (enum composition_method) (charbuf[4]);
7062
7063   if (method == COMPOSITION_RELATIVE)
7064     components = Qnil;
7065   else
7066     {
           /* NOTE(review): nothing visible here bounds J by the size of
              ARGS -- presumably the annotation producer guarantees it.  */
7067       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7068       int i, j;
7069
7070       if (method == COMPOSITION_WITH_RULE)
7071         len = charbuf[2] * 3 - 2;
7072       charbuf += MAX_ANNOTATION_LENGTH;
7073       /* charbuf = [ CHAR ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
           /* I indexes CHARBUF and J indexes ARGS.  A negative marker
              takes up one extra element of CHARBUF, so I runs ahead of J
              as soon as a rule has been read.  A value of -1 ends the
              components early.  */
7074       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7075         {
7076           if (charbuf[i] >= 0)
7077             args[j] = make_number (charbuf[i]);
7078           else
7079             {
7080               i++;
7081               args[j] = make_number (charbuf[i] % 0x100);
7082             }
7083         }
           /* I == J means no rule was read, i.e. all components are
              characters.  */
7084       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7085     }
7086   compose_text (pos, to, components, Qnil, coding->dst_object);
7087 }
7088
7089
7090 /* Record a charset annotation as a text property.  CHARBUF is
7091    [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]; the name of charset
7092    CHARSET-ID is put as the `charset' property on the NCHARS characters
7093    that end at POS in CODING->dst_object.  */
7094
7095 static INLINE void
7096 produce_charset (coding, charbuf, pos)
7097      struct coding_system *coding;
7098      int *charbuf;
7099      EMACS_INT pos;
7100 {
7101   struct charset *cs = CHARSET_FROM_ID (charbuf[3]);
7102   Lisp_Object start = make_number (pos - charbuf[2]);
7103   Lisp_Object end = make_number (pos);
7104
7105   Fput_text_property (start, end, Qcharset, CHARSET_NAME (cs),
7106                       coding->dst_object);
7107 }
7108
7109
/* Number of `int' elements first requested for the conversion work
   area (CODING->charbuf).  */
#define CHARBUF_SIZE 0x4000

/* Allocate the conversion work area of CODING with alloca and store
   it in CODING->charbuf, recording the number of elements in
   CODING->charbuf_size.  Starting from CHARBUF_SIZE, the request is
   halved while the allocation yields a null pointer and the size is
   still above 1024.

   If no area could be obtained, CODING_RESULT_INSUFFICIENT_MEM is
   recorded and the macro executes `return CODING->result' IN THE
   CALLING FUNCTION.  The macro may therefore only be used inside a
   function whose return type accepts that value.

   Because the area comes from alloca, it belongs to the caller's
   stack frame; the macro cannot be wrapped in a helper function.

   CODING is parenthesized at every use so that any pointer-valued
   expression (not only a plain identifier) may be passed.  It is
   evaluated several times, so it must not have side effects.

   NOTE(review): alloca normally has no failure return, so the retry
   loop and the error branch look unreachable in practice -- confirm
   before relying on them.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
7131
7132
7133 static void
7134 produce_annotation (coding, pos)
7135      struct coding_system *coding;
7136      EMACS_INT pos;
7137 {
7138   int *charbuf = coding->charbuf;
7139   int *charbuf_end = charbuf + coding->charbuf_used;
7140
7141   if (NILP (coding->dst_object))
7142     return;
7143
7144   while (charbuf < charbuf_end)
7145     {
7146       if (*charbuf >= 0)
7147         pos++, charbuf++;
7148       else
7149         {
7150           int len = -*charbuf;
7151
7152           if (len > 2)
7153             switch (charbuf[1])
7154               {
7155               case CODING_ANNOTATE_COMPOSITION_MASK:
7156                 produce_composition (coding, charbuf, pos);
7157                 break;
7158               case CODING_ANNOTATE_CHARSET_MASK:
7159                 produce_charset (coding, charbuf, pos);
7160                 break;
7161               }
7162           charbuf += len;
7163         }
7164     }
7165 }
7166
7167 /* Decode the data at CODING->src_object into CODING->dst_object.
7168    CODING->src_object is a buffer, a string, or nil.
7169    CODING->dst_object is a buffer.
7170
7171    If CODING->src_object is a buffer, it must be the current buffer.
7172    In this case, if CODING->src_pos is positive, it is a position of
7173    the source text in the buffer, otherwise, the source text is in the
7174    gap area of the buffer, and CODING->src_pos specifies the offset of
7175    the text from GPT (which must be the same as PT).  If this is the
7176    same buffer as CODING->dst_object, CODING->src_pos must be
7177    negative.
7178
7179    If CODING->src_object is a string, CODING->src_pos is an index to
7180    that string.
7181
7182    If CODING->src_object is nil, CODING->source must already point to
7183    the non-relocatable memory area.  In this case, CODING->src_pos is
7184    an offset from CODING->source.
7185
7186    The decoded data is inserted at the current point of the buffer
7187    CODING->dst_object.
7188 */
7189
7190 static int
7191 decode_coding (coding)
7192      struct coding_system *coding;
7193 {
7194   Lisp_Object attrs;
7195   Lisp_Object undo_list;
7196   Lisp_Object translation_table;
7197   struct ccl_spec cclspec;
7198   int carryover;
7199   int i;
7200
7201   if (BUFFERP (coding->src_object)
7202       && coding->src_pos > 0
7203       && coding->src_pos < GPT
7204       && coding->src_pos + coding->src_chars > GPT)
7205     move_gap_both (coding->src_pos, coding->src_pos_byte);
7206
7207   undo_list = Qt;
7208   if (BUFFERP (coding->dst_object))
7209     {
7210       if (current_buffer != XBUFFER (coding->dst_object))
7211         set_buffer_internal (XBUFFER (coding->dst_object));
7212       if (GPT != PT)
7213         move_gap_both (PT, PT_BYTE);
7214       undo_list = current_buffer->undo_list;
7215       current_buffer->undo_list = Qt;
7216     }
7217
7218   coding->consumed = coding->consumed_char = 0;
7219   coding->produced = coding->produced_char = 0;
7220   coding->chars_at_source = 0;
7221   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7222   coding->errors = 0;
7223
7224   ALLOC_CONVERSION_WORK_AREA (coding);
7225
7226   attrs = CODING_ID_ATTRS (coding->id);
7227   translation_table = get_translation_table (attrs, 0, NULL);
7228
7229   carryover = 0;
7230   if (coding->decoder == decode_coding_ccl)
7231     {
7232       coding->spec.ccl = &cclspec;
7233       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7234     }
7235   do
7236     {
7237       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7238
7239       coding_set_source (coding);
7240       coding->annotated = 0;
7241       coding->charbuf_used = carryover;
7242       (*(coding->decoder)) (coding);
7243       coding_set_destination (coding);
7244       carryover = produce_chars (coding, translation_table, 0);
7245       if (coding->annotated)
7246         produce_annotation (coding, pos);
7247       for (i = 0; i < carryover; i++)
7248         coding->charbuf[i]
7249           = coding->charbuf[coding->charbuf_used - carryover + i];
7250     }
7251   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7252          || (coding->consumed < coding->src_bytes
7253              && (coding->result == CODING_RESULT_SUCCESS
7254                  || coding->result == CODING_RESULT_INVALID_SRC)));
7255
7256   if (carryover > 0)
7257     {
7258       coding_set_destination (coding);
7259       coding->charbuf_used = carryover;
7260       produce_chars (coding, translation_table, 1);
7261     }
7262
7263   coding->carryover_bytes = 0;
7264   if (coding->consumed < coding->src_bytes)
7265     {
7266       int nbytes = coding->src_bytes - coding->consumed;
7267       const unsigned char *src;
7268
7269       coding_set_source (coding);
7270       coding_set_destination (coding);
7271       src = coding->source + coding->consumed;
7272
7273       if (coding->mode & CODING_MODE_LAST_BLOCK)
7274         {
7275           /* Flush out unprocessed data as binary chars.  We are sure
7276              that the number of data is less than the size of
7277              coding->charbuf.  */
7278           coding->charbuf_used = 0;
7279           coding->chars_at_source = 0;
7280
7281           while (nbytes-- > 0)
7282             {
7283               int c = *src++;
7284
7285               if (c & 0x80)
7286                 c = BYTE8_TO_CHAR (c);
7287               coding->charbuf[coding->charbuf_used++] = c;
7288             }
7289           produce_chars (coding, Qnil, 1);
7290         }
7291       else
7292         {
7293           /* Record unprocessed bytes in coding->carryover.  We are
7294              sure that the number of data is less than the size of
7295              coding->carryover.  */
7296           unsigned char *p = coding->carryover;
7297
7298           if (nbytes > sizeof coding->carryover)
7299             nbytes = sizeof coding->carryover;
7300           coding->carryover_bytes = nbytes;
7301           while (nbytes-- > 0)
7302             *p++ = *src++;
7303         }
7304       coding->consumed = coding->src_bytes;
7305     }
7306
7307   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7308       && !inhibit_eol_conversion)
7309     decode_eol (coding);
7310   if (BUFFERP (coding->dst_object))
7311     {
7312       current_buffer->undo_list = undo_list;
7313       record_insert (coding->dst_pos, coding->produced_char);
7314     }
7315   return coding->result;
7316 }
7317
7318
7319 /* Extract an annotation datum from a composition starting at POS and
7320    ending before LIMIT of CODING->src_object (buffer or string), store
7321    the data in BUF, set *STOP to a starting position of the next
7322    composition (if any) or to LIMIT, and return the address of the
7323    next element of BUF.
7324
7325    If such an annotation is not found, set *STOP to a starting
7326    position of a composition after POS (if any) or to LIMIT, and
7327    return BUF.  */
7328
7329 static INLINE int *
7330 handle_composition_annotation (pos, limit, coding, buf, stop)
7331      EMACS_INT pos, limit;
7332      struct coding_system *coding;
7333      int *buf;
7334      EMACS_INT *stop;
7335 {
7336   EMACS_INT start, end;
7337   Lisp_Object prop;
7338
7339   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7340       || end > limit)
7341     *stop = limit;
7342   else if (start > pos)
7343     *stop = start;
7344   else
7345     {
7346       if (start == pos)
7347         {
7348           /* We found a composition.  Store the corresponding
7349              annotation data in BUF.  */
7350           int *head = buf;
7351           enum composition_method method = COMPOSITION_METHOD (prop);
7352           int nchars = COMPOSITION_LENGTH (prop);
7353
7354           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7355           if (method != COMPOSITION_RELATIVE)
7356             {
7357               Lisp_Object components;
7358               int len, i, i_byte;
7359
7360               components = COMPOSITION_COMPONENTS (prop);
7361               if (VECTORP (components))
7362                 {
7363                   len = XVECTOR (components)->size;
7364                   for (i = 0; i < len; i++)
7365                     *buf++ = XINT (AREF (components, i));
7366                 }
7367               else if (STRINGP (components))
7368                 {
7369                   len = SCHARS (components);
7370                   i = i_byte = 0;
7371                   while (i < len)
7372                     {
7373                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7374                       buf++;
7375                     }
7376                 }
7377               else if (INTEGERP (components))
7378                 {
7379                   len = 1;
7380                   *buf++ = XINT (components);
7381                 }
7382               else if (CONSP (components))
7383                 {
7384                   for (len = 0; CONSP (components);
7385                        len++, components = XCDR (components))
7386                     *buf++ = XINT (XCAR (components));
7387                 }
7388               else
7389                 abort ();
7390               *head -= len;
7391             }
7392         }
7393
7394       if (find_composition (end, limit, &start, &end, &prop,
7395                             coding->src_object)
7396           && end <= limit)
7397         *stop = start;
7398       else
7399         *stop = limit;
7400     }
7401   return buf;
7402 }
7403
7404
7405 /* Extract an annotation datum from a text property `charset' at POS of
7406    CODING->src_object (buffer of string), store the data in BUF, set
7407    *STOP to the position where the value of `charset' property changes
7408    (limiting by LIMIT), and return the address of the next element of
7409    BUF.
7410
7411    If the property value is nil, set *STOP to the position where the
7412    property value is non-nil (limiting by LIMIT), and return BUF.  */
7413
7414 static INLINE int *
7415 handle_charset_annotation (pos, limit, coding, buf, stop)
7416      EMACS_INT pos, limit;
7417      struct coding_system *coding;
7418      int *buf;
7419      EMACS_INT *stop;
7420 {
7421   Lisp_Object val, next;
7422   int id;
7423
7424   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7425   if (! NILP (val) && CHARSETP (val))
7426     id = XINT (CHARSET_SYMBOL_ID (val));
7427   else
7428     id = -1;
7429   ADD_CHARSET_DATA (buf, 0, id);
7430   next = Fnext_single_property_change (make_number (pos), Qcharset,
7431                                        coding->src_object,
7432                                        make_number (limit));
7433   *stop = XINT (next);
7434   return buf;
7435 }
7436
7437
7438 static void
7439 consume_chars (coding, translation_table, max_lookup)
7440      struct coding_system *coding;
7441      Lisp_Object translation_table;
7442      int max_lookup;
7443 {
7444   int *buf = coding->charbuf;
7445   int *buf_end = coding->charbuf + coding->charbuf_size;
7446   const unsigned char *src = coding->source + coding->consumed;
7447   const unsigned char *src_end = coding->source + coding->src_bytes;
7448   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7449   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7450   int multibytep = coding->src_multibyte;
7451   Lisp_Object eol_type;
7452   int c;
7453   EMACS_INT stop, stop_composition, stop_charset;
7454   int *lookup_buf = NULL;
7455
7456   if (! NILP (translation_table))
7457     lookup_buf = alloca (sizeof (int) * max_lookup);
7458
7459   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7460   if (VECTORP (eol_type))
7461     eol_type = Qunix;
7462
7463   /* Note: composition handling is not yet implemented.  */
7464   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7465
7466   if (NILP (coding->src_object))
7467     stop = stop_composition = stop_charset = end_pos;
7468   else
7469     {
7470       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7471         stop = stop_composition = pos;
7472       else
7473         stop = stop_composition = end_pos;
7474       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7475         stop = stop_charset = pos;
7476       else
7477         stop_charset = end_pos;
7478     }
7479
7480   /* Compensate for CRLF and conversion.  */
7481   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7482   while (buf < buf_end)
7483     {
7484       Lisp_Object trans;
7485
7486       if (pos == stop)
7487         {
7488           if (pos == end_pos)
7489             break;
7490           if (pos == stop_composition)
7491             buf = handle_composition_annotation (pos, end_pos, coding,
7492                                                  buf, &stop_composition);
7493           if (pos == stop_charset)
7494             buf = handle_charset_annotation (pos, end_pos, coding,
7495                                              buf, &stop_charset);
7496           stop = (stop_composition < stop_charset
7497                   ? stop_composition : stop_charset);
7498         }
7499
7500       if (! multibytep)
7501         {
7502           EMACS_INT bytes;
7503
7504           if (coding->encoder == encode_coding_raw_text
7505               || coding->encoder == encode_coding_ccl)
7506             c = *src++, pos++;
7507           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7508             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7509           else
7510             c = BYTE8_TO_CHAR (*src), src++, pos++;
7511         }
7512       else
7513         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7514       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7515         c = '\n';
7516       if (! EQ (eol_type, Qunix))
7517         {
7518           if (c == '\n')
7519             {
7520               if (EQ (eol_type, Qdos))
7521                 *buf++ = '\r';
7522               else
7523                 c = '\r';
7524             }
7525         }
7526
7527       trans = Qnil;
7528       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7529       if (NILP (trans))
7530         *buf++ = c;
7531       else
7532         {
7533           int from_nchars = 1, to_nchars = 1;
7534           int *lookup_buf_end;
7535           const unsigned char *p = src;
7536           int i;
7537
7538           lookup_buf[0] = c;
7539           for (i = 1; i < max_lookup && p < src_end; i++)
7540             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7541           lookup_buf_end = lookup_buf + i;
7542           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7543           if (INTEGERP (trans))
7544             c = XINT (trans);
7545           else if (CONSP (trans))
7546             {
7547               from_nchars = ASIZE (XCAR (trans));
7548               trans = XCDR (trans);
7549               if (INTEGERP (trans))
7550                 c = XINT (trans);
7551               else
7552                 {
7553                   to_nchars = ASIZE (trans);
7554                   if (buf + to_nchars > buf_end)
7555                     break;
7556                   c = XINT (AREF (trans, 0));
7557                 }
7558             }
7559           else
7560             break;
7561           *buf++ = c;
7562           for (i = 1; i < to_nchars; i++)
7563             *buf++ = XINT (AREF (trans, i));
7564           for (i = 1; i < from_nchars; i++, pos++)
7565             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7566         }
7567     }
7568
7569   coding->consumed = src - coding->source;
7570   coding->consumed_char = pos - coding->src_pos;
7571   coding->charbuf_used = buf - coding->charbuf;
7572   coding->chars_at_source = 0;
7573 }
7574
7575
7576 /* Encode the text at CODING->src_object into CODING->dst_object.
7577    CODING->src_object is a buffer or a string.
7578    CODING->dst_object is a buffer or nil.
7579
7580    If CODING->src_object is a buffer, it must be the current buffer.
7581    In this case, if CODING->src_pos is positive, it is a position of
7582    the source text in the buffer, otherwise. the source text is in the
7583    gap area of the buffer, and coding->src_pos specifies the offset of
7584    the text from GPT (which must be the same as PT).  If this is the
7585    same buffer as CODING->dst_object, CODING->src_pos must be
7586    negative and CODING should not have `pre-write-conversion'.
7587
7588    If CODING->src_object is a string, CODING should not have
7589    `pre-write-conversion'.
7590
7591    If CODING->dst_object is a buffer, the encoded data is inserted at
7592    the current point of that buffer.
7593
7594    If CODING->dst_object is nil, the encoded data is placed at the
7595    memory area specified by CODING->destination.  */
7596
7597 static int
7598 encode_coding (coding)
7599      struct coding_system *coding;
7600 {
7601   Lisp_Object attrs;
7602   Lisp_Object translation_table;
7603   int max_lookup;
7604   struct ccl_spec cclspec;
7605
7606   attrs = CODING_ID_ATTRS (coding->id);
7607   if (coding->encoder == encode_coding_raw_text)
7608     translation_table = Qnil, max_lookup = 0;
7609   else
7610     translation_table = get_translation_table (attrs, 1, &max_lookup);
7611
7612   if (BUFFERP (coding->dst_object))
7613     {
7614       set_buffer_internal (XBUFFER (coding->dst_object));
7615       coding->dst_multibyte
7616         = ! NILP (current_buffer->enable_multibyte_characters);
7617     }
7618
7619   coding->consumed = coding->consumed_char = 0;
7620   coding->produced = coding->produced_char = 0;
7621   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7622   coding->errors = 0;
7623
7624   ALLOC_CONVERSION_WORK_AREA (coding);
7625
7626   if (coding->encoder == encode_coding_ccl)
7627     {
7628       coding->spec.ccl = &cclspec;
7629       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7630     }
7631   do {
7632     coding_set_source (coding);
7633     consume_chars (coding, translation_table, max_lookup);
7634     coding_set_destination (coding);
7635     (*(coding->encoder)) (coding);
7636   } while (coding->consumed_char < coding->src_chars);
7637
7638   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7639     insert_from_gap (coding->produced_char, coding->produced);
7640
7641   return (coding->result);
7642 }
7643
7644
/* Name (or base name) of work buffer for code conversion.
   make_conversion_work_buffer passes it to Fget_buffer_create for the
   reused buffer and to Fgenerate_new_buffer_name for additional
   ones.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed by this code (it is re-created if
   found dead).  It has the name Vcode_conversion_workbuf_name.  The
   other working buffers are killed by code_conversion_restore after
   the use is finished, and their names are modified versions of
   Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer post-increments it on every call, so
   it may be greater than 1; code_conversion_restore resets it to 0
   when the reused buffer is released.  */
static int reused_workbuf_in_use;
7657
7658
7659 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
7660    multibyteness of returning buffer.  */
7661
7662 static Lisp_Object
7663 make_conversion_work_buffer (multibyte)
7664      int multibyte;
7665 {
7666   Lisp_Object name, workbuf;
7667   struct buffer *current;
7668
7669   if (reused_workbuf_in_use++)
7670     {
7671       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7672       workbuf = Fget_buffer_create (name);
7673     }
7674   else
7675     {
7676       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7677         Vcode_conversion_reused_workbuf
7678           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7679       workbuf = Vcode_conversion_reused_workbuf;
7680     }
7681   current = current_buffer;
7682   set_buffer_internal (XBUFFER (workbuf));
7683   /* We can't allow modification hooks to run in the work buffer.  For
7684      instance, directory_files_internal assumes that file decoding
7685      doesn't compile new regexps.  */
7686   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7687   Ferase_buffer ();
7688   current_buffer->undo_list = Qt;
7689   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7690   set_buffer_internal (current);
7691   return workbuf;
7692 }
7693
7694
7695 static Lisp_Object
7696 code_conversion_restore (arg)
7697      Lisp_Object arg;
7698 {
7699   Lisp_Object current, workbuf;
7700   struct gcpro gcpro1;
7701
7702   GCPRO1 (arg);
7703   current = XCAR (arg);
7704   workbuf = XCDR (arg);
7705   if (! NILP (workbuf))
7706     {
7707       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7708         reused_workbuf_in_use = 0;
7709       else if (! NILP (Fbuffer_live_p (workbuf)))
7710         Fkill_buffer (workbuf);
7711     }
7712   set_buffer_internal (XBUFFER (current));
7713   UNGCPRO;
7714   return Qnil;
7715 }
7716
7717 Lisp_Object
7718 code_conversion_save (with_work_buf, multibyte)
7719      int with_work_buf, multibyte;
7720 {
7721   Lisp_Object workbuf = Qnil;
7722
7723   if (with_work_buf)
7724     workbuf = make_conversion_work_buffer (multibyte);
7725   record_unwind_protect (code_conversion_restore,
7726                          Fcons (Fcurrent_buffer (), workbuf));
7727   return workbuf;
7728 }
7729
7730 int
7731 decode_coding_gap (coding, chars, bytes)
7732      struct coding_system *coding;
7733      EMACS_INT chars, bytes;
7734 {
7735   int count = specpdl_ptr - specpdl;
7736   Lisp_Object attrs;
7737
7738   code_conversion_save (0, 0);
7739
7740   coding->src_object = Fcurrent_buffer ();
7741   coding->src_chars = chars;
7742   coding->src_bytes = bytes;
7743   coding->src_pos = -chars;
7744   coding->src_pos_byte = -bytes;
7745   coding->src_multibyte = chars < bytes;
7746   coding->dst_object = coding->src_object;
7747   coding->dst_pos = PT;
7748   coding->dst_pos_byte = PT_BYTE;
7749   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7750
7751   if (CODING_REQUIRE_DETECTION (coding))
7752     detect_coding (coding);
7753
7754   coding->mode |= CODING_MODE_LAST_BLOCK;
7755   current_buffer->text->inhibit_shrinking = 1;
7756   decode_coding (coding);
7757   current_buffer->text->inhibit_shrinking = 0;
7758
7759   attrs = CODING_ID_ATTRS (coding->id);
7760   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7761     {
7762       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7763       Lisp_Object val;
7764
7765       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7766       val = call1 (CODING_ATTR_POST_READ (attrs),
7767                    make_number (coding->produced_char));
7768       CHECK_NATNUM (val);
7769       coding->produced_char += Z - prev_Z;
7770       coding->produced += Z_BYTE - prev_Z_BYTE;
7771     }
7772
7773   unbind_to (count, Qnil);
7774   return coding->result;
7775 }
7776
7777 int
7778 encode_coding_gap (coding, chars, bytes)
7779      struct coding_system *coding;
7780      EMACS_INT chars, bytes;
7781 {
7782   int count = specpdl_ptr - specpdl;
7783
7784   code_conversion_save (0, 0);
7785
7786   coding->src_object = Fcurrent_buffer ();
7787   coding->src_chars = chars;
7788   coding->src_bytes = bytes;
7789   coding->src_pos = -chars;
7790   coding->src_pos_byte = -bytes;
7791   coding->src_multibyte = chars < bytes;
7792   coding->dst_object = coding->src_object;
7793   coding->dst_pos = PT;
7794   coding->dst_pos_byte = PT_BYTE;
7795
7796   encode_coding (coding);
7797
7798   unbind_to (count, Qnil);
7799   return coding->result;
7800 }
7801
7802
7803 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7804    SRC_OBJECT into DST_OBJECT by coding context CODING.
7805
7806    SRC_OBJECT is a buffer, a string, or Qnil.
7807
7808    If it is a buffer, the text is at point of the buffer.  FROM and TO
7809    are positions in the buffer.
7810
7811    If it is a string, the text is at the beginning of the string.
7812    FROM and TO are indices to the string.
7813
7814    If it is nil, the text is at coding->source.  FROM and TO are
7815    indices to coding->source.
7816
7817    DST_OBJECT is a buffer, Qt, or Qnil.
7818
7819    If it is a buffer, the decoded text is inserted at point of the
7820    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7821    is deleted.
7822
7823    If it is Qt, a string is made from the decoded text, and
7824    set in CODING->dst_object.
7825
7826    If it is Qnil, the decoded text is stored at CODING->destination.
7827    The caller must allocate CODING->dst_bytes bytes at
7828    CODING->destination by xmalloc.  If the decoded text is longer than
7829    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7830  */
7831
7832 void
7833 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7834                       dst_object)
7835      struct coding_system *coding;
7836      Lisp_Object src_object;
7837      EMACS_INT from, from_byte, to, to_byte;
7838      Lisp_Object dst_object;
7839 {
7840   int count = specpdl_ptr - specpdl;
7841   unsigned char *destination;
7842   EMACS_INT dst_bytes;
7843   EMACS_INT chars = to - from;
7844   EMACS_INT bytes = to_byte - from_byte;
7845   Lisp_Object attrs;
7846   int saved_pt = -1, saved_pt_byte;
7847   int need_marker_adjustment = 0;
7848   Lisp_Object old_deactivate_mark;
7849
7850   old_deactivate_mark = Vdeactivate_mark;
7851
7852   if (NILP (dst_object))
7853     {
7854       destination = coding->destination;
7855       dst_bytes = coding->dst_bytes;
7856     }
7857
7858   coding->src_object = src_object;
7859   coding->src_chars = chars;
7860   coding->src_bytes = bytes;
7861   coding->src_multibyte = chars < bytes;
7862
7863   if (STRINGP (src_object))
7864     {
7865       coding->src_pos = from;
7866       coding->src_pos_byte = from_byte;
7867     }
7868   else if (BUFFERP (src_object))
7869     {
7870       set_buffer_internal (XBUFFER (src_object));
7871       if (from != GPT)
7872         move_gap_both (from, from_byte);
7873       if (EQ (src_object, dst_object))
7874         {
7875           struct Lisp_Marker *tail;
7876
7877           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7878             {
7879               tail->need_adjustment
7880                 = tail->charpos == (tail->insertion_type ? from : to);
7881               need_marker_adjustment |= tail->need_adjustment;
7882             }
7883           saved_pt = PT, saved_pt_byte = PT_BYTE;
7884           TEMP_SET_PT_BOTH (from, from_byte);
7885           current_buffer->text->inhibit_shrinking = 1;
7886           del_range_both (from, from_byte, to, to_byte, 1);
7887           coding->src_pos = -chars;
7888           coding->src_pos_byte = -bytes;
7889         }
7890       else
7891         {
7892           coding->src_pos = from;
7893           coding->src_pos_byte = from_byte;
7894         }
7895     }
7896
7897   if (CODING_REQUIRE_DETECTION (coding))
7898     detect_coding (coding);
7899   attrs = CODING_ID_ATTRS (coding->id);
7900
7901   if (EQ (dst_object, Qt)
7902       || (! NILP (CODING_ATTR_POST_READ (attrs))
7903           && NILP (dst_object)))
7904     {
7905       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7906       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7907       coding->dst_pos = BEG;
7908       coding->dst_pos_byte = BEG_BYTE;
7909     }
7910   else if (BUFFERP (dst_object))
7911     {
7912       code_conversion_save (0, 0);
7913       coding->dst_object = dst_object;
7914       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7915       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7916       coding->dst_multibyte
7917         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7918     }
7919   else
7920     {
7921       code_conversion_save (0, 0);
7922       coding->dst_object = Qnil;
7923       /* Most callers presume this will return a multibyte result, and they
7924          won't use `binary' or `raw-text' anyway, so let's not worry about
7925          CODING_FOR_UNIBYTE.  */
7926       coding->dst_multibyte = 1;
7927     }
7928
7929   decode_coding (coding);
7930
7931   if (BUFFERP (coding->dst_object))
7932     set_buffer_internal (XBUFFER (coding->dst_object));
7933
7934   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7935     {
7936       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7937       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7938       Lisp_Object val;
7939
7940       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7941       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7942               old_deactivate_mark);
7943       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7944                         make_number (coding->produced_char));
7945       UNGCPRO;
7946       CHECK_NATNUM (val);
7947       coding->produced_char += Z - prev_Z;
7948       coding->produced += Z_BYTE - prev_Z_BYTE;
7949     }
7950
7951   if (EQ (dst_object, Qt))
7952     {
7953       coding->dst_object = Fbuffer_string ();
7954     }
7955   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7956     {
7957       set_buffer_internal (XBUFFER (coding->dst_object));
7958       if (dst_bytes < coding->produced)
7959         {
7960           destination = xrealloc (destination, coding->produced);
7961           if (! destination)
7962             {
7963               record_conversion_result (coding,
7964                                         CODING_RESULT_INSUFFICIENT_MEM);
7965               unbind_to (count, Qnil);
7966               return;
7967             }
7968           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7969             move_gap_both (BEGV, BEGV_BYTE);
7970           bcopy (BEGV_ADDR, destination, coding->produced);
7971           coding->destination = destination;
7972         }
7973     }
7974
7975   if (saved_pt >= 0)
7976     {
7977       /* This is the case of:
7978          (BUFFERP (src_object) && EQ (src_object, dst_object))
7979          As we have moved PT while replacing the original buffer
7980          contents, we must recover it now.  */
7981       set_buffer_internal (XBUFFER (src_object));
7982       current_buffer->text->inhibit_shrinking = 0;
7983       if (saved_pt < from)
7984         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7985       else if (saved_pt < from + chars)
7986         TEMP_SET_PT_BOTH (from, from_byte);
7987       else if (! NILP (current_buffer->enable_multibyte_characters))
7988         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7989                           saved_pt_byte + (coding->produced - bytes));
7990       else
7991         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7992                           saved_pt_byte + (coding->produced - bytes));
7993
7994       if (need_marker_adjustment)
7995         {
7996           struct Lisp_Marker *tail;
7997
7998           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7999             if (tail->need_adjustment)
8000               {
8001                 tail->need_adjustment = 0;
8002                 if (tail->insertion_type)
8003                   {
8004                     tail->bytepos = from_byte;
8005                     tail->charpos = from;
8006                   }
8007                 else
8008                   {
8009                     tail->bytepos = from_byte + coding->produced;
8010                     tail->charpos
8011                       = (NILP (current_buffer->enable_multibyte_characters)
8012                          ? tail->bytepos : from + coding->produced_char);
8013                   }
8014               }
8015         }
8016     }
8017
8018   Vdeactivate_mark = old_deactivate_mark;
8019   unbind_to (count, coding->dst_object);
8020 }
8021
8022
8023 void
8024 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
8025                       dst_object)
8026      struct coding_system *coding;
8027      Lisp_Object src_object;
8028      EMACS_INT from, from_byte, to, to_byte;
8029      Lisp_Object dst_object;
8030 {
8031   int count = specpdl_ptr - specpdl;
8032   EMACS_INT chars = to - from;
8033   EMACS_INT bytes = to_byte - from_byte;
8034   Lisp_Object attrs;
8035   int saved_pt = -1, saved_pt_byte;
8036   int need_marker_adjustment = 0;
8037   int kill_src_buffer = 0;
8038   Lisp_Object old_deactivate_mark;
8039
8040   old_deactivate_mark = Vdeactivate_mark;
8041
8042   coding->src_object = src_object;
8043   coding->src_chars = chars;
8044   coding->src_bytes = bytes;
8045   coding->src_multibyte = chars < bytes;
8046
8047   attrs = CODING_ID_ATTRS (coding->id);
8048
8049   if (EQ (src_object, dst_object))
8050     {
8051       struct Lisp_Marker *tail;
8052
8053       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8054         {
8055           tail->need_adjustment
8056             = tail->charpos == (tail->insertion_type ? from : to);
8057           need_marker_adjustment |= tail->need_adjustment;
8058         }
8059     }
8060
8061   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8062     {
8063       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8064       set_buffer_internal (XBUFFER (coding->src_object));
8065       if (STRINGP (src_object))
8066         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8067       else if (BUFFERP (src_object))
8068         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8069       else
8070         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
8071
8072       if (EQ (src_object, dst_object))
8073         {
8074           set_buffer_internal (XBUFFER (src_object));
8075           saved_pt = PT, saved_pt_byte = PT_BYTE;
8076           del_range_both (from, from_byte, to, to_byte, 1);
8077           set_buffer_internal (XBUFFER (coding->src_object));
8078         }
8079
8080       {
8081         Lisp_Object args[3];
8082         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8083
8084         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8085                 old_deactivate_mark);
8086         args[0] = CODING_ATTR_PRE_WRITE (attrs);
8087         args[1] = make_number (BEG);
8088         args[2] = make_number (Z);
8089         safe_call (3, args);
8090         UNGCPRO;
8091       }
8092       if (XBUFFER (coding->src_object) != current_buffer)
8093         kill_src_buffer = 1;
8094       coding->src_object = Fcurrent_buffer ();
8095       if (BEG != GPT)
8096         move_gap_both (BEG, BEG_BYTE);
8097       coding->src_chars = Z - BEG;
8098       coding->src_bytes = Z_BYTE - BEG_BYTE;
8099       coding->src_pos = BEG;
8100       coding->src_pos_byte = BEG_BYTE;
8101       coding->src_multibyte = Z < Z_BYTE;
8102     }
8103   else if (STRINGP (src_object))
8104     {
8105       code_conversion_save (0, 0);
8106       coding->src_pos = from;
8107       coding->src_pos_byte = from_byte;
8108     }
8109   else if (BUFFERP (src_object))
8110     {
8111       code_conversion_save (0, 0);
8112       set_buffer_internal (XBUFFER (src_object));
8113       if (EQ (src_object, dst_object))
8114         {
8115           saved_pt = PT, saved_pt_byte = PT_BYTE;
8116           coding->src_object = del_range_1 (from, to, 1, 1);
8117           coding->src_pos = 0;
8118           coding->src_pos_byte = 0;
8119         }
8120       else
8121         {
8122           if (from < GPT && to >= GPT)
8123             move_gap_both (from, from_byte);
8124           coding->src_pos = from;
8125           coding->src_pos_byte = from_byte;
8126         }
8127     }
8128   else
8129     code_conversion_save (0, 0);
8130
8131   if (BUFFERP (dst_object))
8132     {
8133       coding->dst_object = dst_object;
8134       if (EQ (src_object, dst_object))
8135         {
8136           coding->dst_pos = from;
8137           coding->dst_pos_byte = from_byte;
8138         }
8139       else
8140         {
8141           struct buffer *current = current_buffer;
8142
8143           set_buffer_temp (XBUFFER (dst_object));
8144           coding->dst_pos = PT;
8145           coding->dst_pos_byte = PT_BYTE;
8146           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8147           set_buffer_temp (current);
8148         }
8149       coding->dst_multibyte
8150         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8151     }
8152   else if (EQ (dst_object, Qt))
8153     {
8154       coding->dst_object = Qnil;
8155       coding->dst_bytes = coding->src_chars;
8156       if (coding->dst_bytes == 0)
8157         coding->dst_bytes = 1;
8158       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8159       coding->dst_multibyte = 0;
8160     }
8161   else
8162     {
8163       coding->dst_object = Qnil;
8164       coding->dst_multibyte = 0;
8165     }
8166
8167   encode_coding (coding);
8168
8169   if (EQ (dst_object, Qt))
8170     {
8171       if (BUFFERP (coding->dst_object))
8172         coding->dst_object = Fbuffer_string ();
8173       else
8174         {
8175           coding->dst_object
8176             = make_unibyte_string ((char *) coding->destination,
8177                                    coding->produced);
8178           xfree (coding->destination);
8179         }
8180     }
8181
8182   if (saved_pt >= 0)
8183     {
8184       /* This is the case of:
8185          (BUFFERP (src_object) && EQ (src_object, dst_object))
8186          As we have moved PT while replacing the original buffer
8187          contents, we must recover it now.  */
8188       set_buffer_internal (XBUFFER (src_object));
8189       if (saved_pt < from)
8190         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8191       else if (saved_pt < from + chars)
8192         TEMP_SET_PT_BOTH (from, from_byte);
8193       else if (! NILP (current_buffer->enable_multibyte_characters))
8194         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8195                           saved_pt_byte + (coding->produced - bytes));
8196       else
8197         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8198                           saved_pt_byte + (coding->produced - bytes));
8199
8200       if (need_marker_adjustment)
8201         {
8202           struct Lisp_Marker *tail;
8203
8204           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8205             if (tail->need_adjustment)
8206               {
8207                 tail->need_adjustment = 0;
8208                 if (tail->insertion_type)
8209                   {
8210                     tail->bytepos = from_byte;
8211                     tail->charpos = from;
8212                   }
8213                 else
8214                   {
8215                     tail->bytepos = from_byte + coding->produced;
8216                     tail->charpos
8217                       = (NILP (current_buffer->enable_multibyte_characters)
8218                          ? tail->bytepos : from + coding->produced_char);
8219                   }
8220               }
8221         }
8222     }
8223
8224   if (kill_src_buffer)
8225     Fkill_buffer (coding->src_object);
8226
8227   Vdeactivate_mark = old_deactivate_mark;
8228   unbind_to (count, Qnil);
8229 }
8230
8231
8232 Lisp_Object
8233 preferred_coding_system ()
8234 {
8235   int id = coding_categories[coding_priorities[0]].id;
8236
8237   return CODING_ID_NAME (id);
8238 }
8239
8240 \f
8241 #ifdef emacs
8242 /*** 11. Emacs Lisp library functions ***/
8243
8244 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8245        doc: /* Return t if OBJECT is nil or a coding-system.
8246 See the documentation of `define-coding-system' for information
8247 about coding-system objects.  */)
8248      (object)
8249      Lisp_Object object;
8250 {
8251   if (NILP (object)
8252       || CODING_SYSTEM_ID (object) >= 0)
8253     return Qt;
8254   if (! SYMBOLP (object)
8255       || NILP (Fget (object, Qcoding_system_define_form)))
8256     return Qnil;
8257   return Qt;
8258 }
8259
8260 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8261        Sread_non_nil_coding_system, 1, 1, 0,
8262        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8263      (prompt)
8264      Lisp_Object prompt;
8265 {
8266   Lisp_Object val;
8267   do
8268     {
8269       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8270                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8271     }
8272   while (SCHARS (val) == 0);
8273   return (Fintern (val, Qnil));
8274 }
8275
8276 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8277        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8278 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8279 Ignores case when completing coding systems (all Emacs coding systems
8280 are lower-case).  */)
8281      (prompt, default_coding_system)
8282      Lisp_Object prompt, default_coding_system;
8283 {
8284   Lisp_Object val;
8285   int count = SPECPDL_INDEX ();
8286
8287   if (SYMBOLP (default_coding_system))
8288     default_coding_system = SYMBOL_NAME (default_coding_system);
8289   specbind (Qcompletion_ignore_case, Qt);
8290   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8291                           Qt, Qnil, Qcoding_system_history,
8292                           default_coding_system, Qnil);
8293   unbind_to (count, Qnil);
8294   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8295 }
8296
8297 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8298        1, 1, 0,
8299        doc: /* Check validity of CODING-SYSTEM.
8300 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8301 It is valid if it is nil or a symbol defined as a coding system by the
8302 function `define-coding-system'.  */)
8303   (coding_system)
8304      Lisp_Object coding_system;
8305 {
8306   Lisp_Object define_form;
8307
8308   define_form = Fget (coding_system, Qcoding_system_define_form);
8309   if (! NILP (define_form))
8310     {
8311       Fput (coding_system, Qcoding_system_define_form, Qnil);
8312       safe_eval (define_form);
8313     }
8314   if (!NILP (Fcoding_system_p (coding_system)))
8315     return coding_system;
8316   xsignal1 (Qcoding_system_error, coding_system);
8317 }
8318
8319 \f
8320 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8321    HIGHEST is nonzero, return the coding system of the highest
8322    priority among the detected coding systems.  Otherwise return a
8323    list of detected coding systems sorted by their priorities.  If
8324    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8325    multibyte form but contain only ASCII and eight-bit chars.
8326    Otherwise, the bytes are raw bytes.
8327
8328    CODING-SYSTEM controls the detection as below:
8329
8330    If it is nil, detect both text-format and eol-format.  If the
8331    text-format part of CODING-SYSTEM is already specified
8332    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8333    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8334    detect only text-format.
        
        SRC_CHARS is recorded as the character count of the source in
        the working coding structure.  When HIGHEST is nonzero and
        nothing was detected, the return value is nil.  */
8335
8336 Lisp_Object
8337 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
8338                       coding_system)
8339      const unsigned char *src;
8340      EMACS_INT src_chars, src_bytes;
8341      int highest;
8342      int multibytep;
8343      Lisp_Object coding_system;
8344 {
8345   const unsigned char *src_end = src + src_bytes;
8346   Lisp_Object attrs, eol_type;
8347   Lisp_Object val = Qnil;
8348   struct coding_system coding;
8349   int id;
8350   struct coding_detection_info detect_info;
8351   enum coding_category base_category;
8352   int null_byte_found = 0, eight_bit_found = 0;
8353
       /* A nil CODING_SYSTEM stands for `undecided'.  */
8354   if (NILP (coding_system))
8355     coding_system = Qundecided;
8356   setup_coding_system (coding_system, &coding);
8357   attrs = CODING_ID_ATTRS (coding.id);
8358   eol_type = CODING_ID_EOL_TYPE (coding.id);
8359   coding_system = CODING_ATTR_BASE_NAME (attrs);
8360
       /* Describe the bytes to examine in the working coding structure,
          which is what the detector functions below receive.  */
8361   coding.source = src;
8362   coding.src_chars = src_chars;
8363   coding.src_bytes = src_bytes;
8364   coding.src_multibyte = multibytep;
8365   coding.consumed = 0;
8366   coding.mode |= CODING_MODE_LAST_BLOCK;
8367   coding.head_ascii = 0;
8368
8369   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8370
8371   /* At first, detect text-format if necessary.  */
8372   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8373   if (base_category == coding_category_undecided)
8374     {
8375       enum coding_category category;
8376       struct coding_system *this;
8377       int c, i;
8378
8379       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8380       for (; src < src_end; src++)
8381         {
8382           c = *src;
8383           if (c & 0x80)
8384             {
8385               eight_bit_found = 1;
                   /* Both an eight-bit byte and a null byte have been
                      seen; stop scanning.  */
8386               if (null_byte_found)
8387                 break;
8388             }
8389           else if (c < 0x20)
8390             {
8391               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8392                   && ! inhibit_iso_escape_detection
8393                   && ! detect_info.checked)
8394                 {
8395                   if (detect_coding_iso_2022 (&coding, &detect_info))
8396                     {
8397                       /* We have scanned the whole data.  */
8398                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8399                         {
8400                           /* We didn't find an 8-bit code.  We may
8401                              have found a null-byte, but it's very
8402                              rare that a binary file conforms to
8403                              ISO-2022.  */
8404                           src = src_end;
8405                           coding.head_ascii = src - coding.source;
8406                         }
8407                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8408                       break;
8409                     }
8410                 }
8411               else if (! c && !inhibit_null_byte_detection)
8412                 {
8413                   null_byte_found = 1;
8414                   if (eight_bit_found)
8415                     break;
8416                 }
                   /* head_ascii counts the bytes that precede the first
                      eight-bit byte.  */
8417               if (! eight_bit_found)
8418                 coding.head_ascii++;
8419             }
8420           else if (! eight_bit_found)
8421             coding.head_ascii++;
8422         }
8423
           /* The category loops below are skipped only when no null byte
              and no eight-bit byte was seen, every byte was counted in
              head_ascii, and nothing has been found so far.  */
8424       if (null_byte_found || eight_bit_found
8425           || coding.head_ascii < coding.src_bytes
8426           || detect_info.found)
8427         {
8428           if (coding.head_ascii == coding.src_bytes)
8429             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8430             for (i = 0; i < coding_category_raw_text; i++)
8431               {
8432                 category = coding_priorities[i];
8433                 this = coding_categories + category;
8434                 if (detect_info.found & (1 << category))
8435                   break;
8436               }
8437           else
8438             {
8439               if (null_byte_found)
8440                 {
                       /* Mark every category outside the UTF-16 mask as
                          already checked and rejected, so that the loop
                          below runs no detector for them.  */
8441                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8442                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8443                 }
                   /* Try the categories in priority order.  */
8444               for (i = 0; i < coding_category_raw_text; i++)
8445                 {
8446                   category = coding_priorities[i];
8447                   this = coding_categories + category;
8448
8449                   if (this->id < 0)
8450                     {
8451                       /* No coding system of this category is defined.  */
8452                       detect_info.rejected |= (1 << category);
8453                     }
8454                   else if (category >= coding_category_raw_text)
8455                     continue;
8456                   else if (detect_info.checked & (1 << category))
8457                     {
                           /* Already examined by an earlier detector; when
                              only the best match is wanted and this
                              category was found, we are done.  */
8458                       if (highest
8459                           && (detect_info.found & (1 << category)))
8460                         break;
8461                     }
8462                   else if ((*(this->detector)) (&coding, &detect_info)
8463                            && highest
8464                            && (detect_info.found & (1 << category)))
8465                     {
                           /* For the UTF-16 auto category, narrow CATEGORY
                              down to the byte order that was found.  */
8466                       if (category == coding_category_utf_16_auto)
8467                         {
8468                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8469                             category = coding_category_utf_16_le;
8470                           else
8471                             category = coding_category_utf_16_be;
8472                         }
8473                       break;
8474                     }
8475                 }
8476             }
8477         }
8478
           /* Every category was rejected, or a null byte was found:
              report `no-conversion' only.  */
8479       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8480           || null_byte_found)
8481         {
8482           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8483           id = CODING_SYSTEM_ID (Qno_conversion);
8484           val = Fcons (make_number (id), Qnil);
8485         }
           /* Nothing was rejected and nothing was found: report the coding
              system of the undecided category.  */
8486       else if (! detect_info.rejected && ! detect_info.found)
8487         {
8488           detect_info.found = CATEGORY_MASK_ANY;
8489           id = coding_categories[coding_category_undecided].id;
8490           val = Fcons (make_number (id), Qnil);
8491         }
           /* Only a single result is wanted.  */
8492       else if (highest)
8493         {
8494           if (detect_info.found)
8495             {
                   /* CATEGORY and THIS are as left by the loops above.  */
8496               detect_info.found = 1 << category;
8497               val = Fcons (make_number (this->id), Qnil);
8498             }
8499           else
                 /* Nothing was positively found: take the first category
                    in priority order that was not rejected.  */
8500             for (i = 0; i < coding_category_raw_text; i++)
8501               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8502                 {
8503                   detect_info.found = 1 << coding_priorities[i];
8504                   id = coding_categories[coding_priorities[i]].id;
8505                   val = Fcons (make_number (id), Qnil);
8506                   break;
8507                 }
8508         }
8509       else
8510         {
               /* Build the full list.  Both loops walk the priorities from
                  last to first and push onto the front of VAL, so the
                  result lists the found categories first, followed by the
                  categories that were neither found nor rejected, each
                  group in priority order.  */
8511           int mask = detect_info.rejected | detect_info.found;
8512           int found = 0;
8513
8514           for (i = coding_category_raw_text - 1; i >= 0; i--)
8515             {
8516               category = coding_priorities[i];
8517               if (! (mask & (1 << category)))
8518                 {
8519                   found |= 1 << category;
8520                   id = coding_categories[category].id;
8521                   if (id >= 0)
8522                     val = Fcons (make_number (id), val);
8523                 }
8524             }
8525           for (i = coding_category_raw_text - 1; i >= 0; i--)
8526             {
8527               category = coding_priorities[i];
8528               if (detect_info.found & (1 << category))
8529                 {
8530                   id = coding_categories[category].id;
8531                   val = Fcons (make_number (id), val);
8532                 }
8533             }
8534           detect_info.found |= found;
8535         }
8536     }
8537   else if (base_category == coding_category_utf_8_auto)
8538     {
           /* Only the presence of a signature remains to be decided.  */
8539       if (detect_coding_utf_8 (&coding, &detect_info))
8540         {
8541           struct coding_system *this;
8542
8543           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8544             this = coding_categories + coding_category_utf_8_sig;
8545           else
8546             this = coding_categories + coding_category_utf_8_nosig;
8547           val = Fcons (make_number (this->id), Qnil);
8548         }
8549     }
8550   else if (base_category == coding_category_utf_16_auto)
8551     {
           /* Choose among the four UTF-16 variants from what the detector
              found or rejected.  */
8552       if (detect_coding_utf_16 (&coding, &detect_info))
8553         {
8554           struct coding_system *this;
8555
8556           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8557             this = coding_categories + coding_category_utf_16_le;
8558           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8559             this = coding_categories + coding_category_utf_16_be;
8560           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8561             this = coding_categories + coding_category_utf_16_be_nosig;
8562           else
8563             this = coding_categories + coding_category_utf_16_le_nosig;
8564           val = Fcons (make_number (this->id), Qnil);
8565         }
8566     }
8567   else
8568     {
           /* The text-format is already fixed by CODING_SYSTEM; take it
              as is.  */
8569       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8570       val = Fcons (make_number (coding.id), Qnil);
8571     }
8572
8573   /* Then, detect eol-format if necessary.  */
8574   {
8575     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8576     Lisp_Object tail;
8577
         /* A vector EOL_TYPE is scanned for; any other value is compared
            against Qunix and Qdos below and fixes the format directly.  */
8578     if (VECTORP (eol_type))
8579       {
8580         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8581           {
                 /* With a null byte found, do not scan; use LF.  */
8582             if (null_byte_found)
8583               normal_eol = EOL_SEEN_LF;
8584             else
8585               normal_eol = detect_eol (coding.source, src_bytes,
8586                                        coding_category_raw_text);
8587           }
8588         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8589                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8590           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8591                                       coding_category_utf_16_be);
8592         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8593                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8594           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8595                                       coding_category_utf_16_le);
8596       }
8597     else
8598       {
8599         if (EQ (eol_type, Qunix))
8600           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8601         else if (EQ (eol_type, Qdos))
8602           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8603         else
8604           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8605       }
8606
         /* Replace each coding system id in VAL by a name.  When the eol
            type of that coding system is a vector and an eol format was
            seen, the name is the vector element for that format (0 for
            LF, 1 for CRLF, 2 for CR); otherwise it is the name registered
            for the id itself.  */
8607     for (tail = val; CONSP (tail); tail = XCDR (tail))
8608       {
8609         enum coding_category category;
8610         int this_eol;
8611
8612         id = XINT (XCAR (tail));
8613         attrs = CODING_ID_ATTRS (id);
8614         category = XINT (CODING_ATTR_CATEGORY (attrs));
8615         eol_type = CODING_ID_EOL_TYPE (id);
8616         if (VECTORP (eol_type))
8617           {
8618             if (category == coding_category_utf_16_be
8619                 || category == coding_category_utf_16_be_nosig)
8620               this_eol = utf_16_be_eol;
8621             else if (category == coding_category_utf_16_le
8622                      || category == coding_category_utf_16_le_nosig)
8623               this_eol = utf_16_le_eol;
8624             else
8625               this_eol = normal_eol;
8626
8627             if (this_eol == EOL_SEEN_LF)
8628               XSETCAR (tail, AREF (eol_type, 0));
8629             else if (this_eol == EOL_SEEN_CRLF)
8630               XSETCAR (tail, AREF (eol_type, 1));
8631             else if (this_eol == EOL_SEEN_CR)
8632               XSETCAR (tail, AREF (eol_type, 2));
8633             else
8634               XSETCAR (tail, CODING_ID_NAME (id));
8635           }
8636         else
8637           XSETCAR (tail, CODING_ID_NAME (id));
8638       }
8639   }
8640
       /* With HIGHEST, return just the first element (or nil if there is
          none); otherwise return the whole list.  */
8641   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8642 }
8643
8644
8645 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8646        2, 3, 0,
8647        doc: /* Detect coding system of the text in the region between START and END.
8648 Return a list of possible coding systems ordered by priority.
8649 The coding systems to try and their priorities follows what
8650 the function `coding-system-priority-list' (which see) returns.
8651
8652 If only ASCII characters are found (except for such ISO-2022 control
8653 characters as ESC), it returns a list of single element `undecided'
8654 or its subsidiary coding system according to a detected end-of-line
8655 format.
8656
8657 If optional argument HIGHEST is non-nil, return the coding system of
8658 highest priority.  */)
8659      (start, end, highest)
8660      Lisp_Object start, end, highest;
8661 {
8662   int from, to;
8663   int from_byte, to_byte;
8664
8665   CHECK_NUMBER_COERCE_MARKER (start);
8666   CHECK_NUMBER_COERCE_MARKER (end);
8667
8668   validate_region (&start, &end);
8669   from = XINT (start), to = XINT (end);
8670   from_byte = CHAR_TO_BYTE (from);
8671   to_byte = CHAR_TO_BYTE (to);
8672
8673   if (from < GPT && to >= GPT)
8674     move_gap_both (to, to_byte);
8675
8676   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8677                                to - from, to_byte - from_byte,
8678                                !NILP (highest),
8679                                !NILP (current_buffer
8680                                       ->enable_multibyte_characters),
8681                                Qnil);
8682 }
8683
8684 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8685        1, 2, 0,
8686        doc: /* Detect coding system of the text in STRING.
8687 Return a list of possible coding systems ordered by priority.
8688 The coding systems to try and their priorities follows what
8689 the function `coding-system-priority-list' (which see) returns.
8690
8691 If only ASCII characters are found (except for such ISO-2022 control
8692 characters as ESC), it returns a list of single element `undecided'
8693 or its subsidiary coding system according to a detected end-of-line
8694 format.
8695
8696 If optional argument HIGHEST is non-nil, return the coding system of
8697 highest priority.  */)
8698      (string, highest)
8699      Lisp_Object string, highest;
8700 {
8701   CHECK_STRING (string);
8702
8703   return detect_coding_system (SDATA (string),
8704                                SCHARS (string), SBYTES (string),
8705                                !NILP (highest), STRING_MULTIBYTE (string),
8706                                Qnil);
8707 }
8708
8709
8710 static INLINE int
8711 char_encodable_p (c, attrs)
8712      int c;
8713      Lisp_Object attrs;
8714 {
8715   Lisp_Object tail;
8716   struct charset *charset;
8717   Lisp_Object translation_table;
8718
8719   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8720   if (! NILP (translation_table))
8721     c = translate_char (translation_table, c);
8722   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8723        CONSP (tail); tail = XCDR (tail))
8724     {
8725       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8726       if (CHAR_CHARSET_P (c, charset))
8727         break;
8728     }
8729   return (! NILP (tail));
8730 }
8731
8732
8733 /* Return a list of coding systems that safely encode the text between
8734    START and END.  If EXCLUDE is non-nil, it is a list of coding
8735    systems not to check.  The returned list doesn't contain any such
8736    coding systems.  In any case, if the text contains only ASCII or is
8737    unibyte, return t.  */
8738
8739 DEFUN ("find-coding-systems-region-internal",
8740        Ffind_coding_systems_region_internal,
8741        Sfind_coding_systems_region_internal, 2, 3, 0,
8742        doc: /* Internal use only.  */)
8743      (start, end, exclude)
8744      Lisp_Object start, end, exclude;
8745 {
8746   Lisp_Object coding_attrs_list, safe_codings;
8747   EMACS_INT start_byte, end_byte;
8748   const unsigned char *p, *pbeg, *pend;
8749   int c;
8750   Lisp_Object tail, elt, work_table;
8751
8752   if (STRINGP (start))
8753     {
8754       if (!STRING_MULTIBYTE (start)
8755           || SCHARS (start) == SBYTES (start))
8756         return Qt;
8757       start_byte = 0;
8758       end_byte = SBYTES (start);
8759     }
8760   else
8761     {
8762       CHECK_NUMBER_COERCE_MARKER (start);
8763       CHECK_NUMBER_COERCE_MARKER (end);
8764       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8765         args_out_of_range (start, end);
8766       if (NILP (current_buffer->enable_multibyte_characters))
8767         return Qt;
8768       start_byte = CHAR_TO_BYTE (XINT (start));
8769       end_byte = CHAR_TO_BYTE (XINT (end));
8770       if (XINT (end) - XINT (start) == end_byte - start_byte)
8771         return Qt;
8772
8773       if (XINT (start) < GPT && XINT (end) > GPT)
8774         {
8775           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8776             move_gap_both (XINT (start), start_byte);
8777           else
8778             move_gap_both (XINT (end), end_byte);
8779         }
8780     }
8781
8782   coding_attrs_list = Qnil;
8783   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8784     if (NILP (exclude)
8785         || NILP (Fmemq (XCAR (tail), exclude)))
8786       {
8787         Lisp_Object attrs;
8788
8789         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8790         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8791             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8792           {
8793             ASET (attrs, coding_attr_trans_tbl,
8794                   get_translation_table (attrs, 1, NULL));
8795             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8796           }
8797       }
8798
8799   if (STRINGP (start))
8800     p = pbeg = SDATA (start);
8801   else
8802     p = pbeg = BYTE_POS_ADDR (start_byte);
8803   pend = p + (end_byte - start_byte);
8804
8805   while (p < pend && ASCII_BYTE_P (*p)) p++;
8806   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8807
8808   work_table = Fmake_char_table (Qnil, Qnil);
8809   while (p < pend)
8810     {
8811       if (ASCII_BYTE_P (*p))
8812         p++;
8813       else
8814         {
8815           c = STRING_CHAR_ADVANCE (p);
8816           if (!NILP (char_table_ref (work_table, c)))
8817             /* This character was already checked.  Ignore it.  */
8818             continue;
8819
8820           charset_map_loaded = 0;
8821           for (tail = coding_attrs_list; CONSP (tail);)
8822             {
8823               elt = XCAR (tail);
8824               if (NILP (elt))
8825                 tail = XCDR (tail);
8826               else if (char_encodable_p (c, elt))
8827                 tail = XCDR (tail);
8828               else if (CONSP (XCDR (tail)))
8829                 {
8830                   XSETCAR (tail, XCAR (XCDR (tail)));
8831                   XSETCDR (tail, XCDR (XCDR (tail)));
8832                 }
8833               else
8834                 {
8835                   XSETCAR (tail, Qnil);
8836                   tail = XCDR (tail);
8837                 }
8838             }
8839           if (charset_map_loaded)
8840             {
8841               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8842
8843               if (STRINGP (start))
8844                 pbeg = SDATA (start);
8845               else
8846                 pbeg = BYTE_POS_ADDR (start_byte);
8847               p = pbeg + p_offset;
8848               pend = pbeg + pend_offset;
8849             }
8850           char_table_set (work_table, c, Qt);
8851         }
8852     }
8853
8854   safe_codings = list2 (Qraw_text, Qno_conversion);
8855   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8856     if (! NILP (XCAR (tail)))
8857       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8858
8859   return safe_codings;
8860 }
8861
8862
8863 DEFUN ("unencodable-char-position", Funencodable_char_position,
8864        Sunencodable_char_position, 3, 5, 0,
8865        doc: /*
8866 Return position of first un-encodable character in a region.
8867 START and END specify the region and CODING-SYSTEM specifies the
8868 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8869
8870 If optional 4th argument COUNT is non-nil, it specifies at most how
8871 many un-encodable characters to search.  In this case, the value is a
8872 list of positions.
8873
8874 If optional 5th argument STRING is non-nil, it is a string to search
8875 for un-encodable characters.  In that case, START and END are indexes
8876 to the string.  */)
8877      (start, end, coding_system, count, string)
8878      Lisp_Object start, end, coding_system, count, string;
8879 {
8880   int n;
8881   struct coding_system coding;
8882   Lisp_Object attrs, charset_list, translation_table;
8883   Lisp_Object positions;
8884   int from, to;
8885   const unsigned char *p, *stop, *pend;
8886   int ascii_compatible;
8887
8888   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8889   attrs = CODING_ID_ATTRS (coding.id);
8890   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8891     return Qnil;
8892   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8893   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8894   translation_table = get_translation_table (attrs, 1, NULL);
8895
8896   if (NILP (string))
8897     {
8898       validate_region (&start, &end);
8899       from = XINT (start);
8900       to = XINT (end);
8901       if (NILP (current_buffer->enable_multibyte_characters)
8902           || (ascii_compatible
8903               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8904         return Qnil;
8905       p = CHAR_POS_ADDR (from);
8906       pend = CHAR_POS_ADDR (to);
8907       if (from < GPT && to >= GPT)
8908         stop = GPT_ADDR;
8909       else
8910         stop = pend;
8911     }
8912   else
8913     {
8914       CHECK_STRING (string);
8915       CHECK_NATNUM (start);
8916       CHECK_NATNUM (end);
8917       from = XINT (start);
8918       to = XINT (end);
8919       if (from > to
8920           || to > SCHARS (string))
8921         args_out_of_range_3 (string, start, end);
8922       if (! STRING_MULTIBYTE (string))
8923         return Qnil;
8924       p = SDATA (string) + string_char_to_byte (string, from);
8925       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8926       if (ascii_compatible && (to - from) == (pend - p))
8927         return Qnil;
8928     }
8929
8930   if (NILP (count))
8931     n = 1;
8932   else
8933     {
8934       CHECK_NATNUM (count);
8935       n = XINT (count);
8936     }
8937
8938   positions = Qnil;
8939   while (1)
8940     {
8941       int c;
8942
8943       if (ascii_compatible)
8944         while (p < stop && ASCII_BYTE_P (*p))
8945           p++, from++;
8946       if (p >= stop)
8947         {
8948           if (p >= pend)
8949             break;
8950           stop = pend;
8951           p = GAP_END_ADDR;
8952         }
8953
8954       c = STRING_CHAR_ADVANCE (p);
8955       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8956           && ! char_charset (translate_char (translation_table, c),
8957                              charset_list, NULL))
8958         {
8959           positions = Fcons (make_number (from), positions);
8960           n--;
8961           if (n == 0)
8962             break;
8963         }
8964
8965       from++;
8966     }
8967
8968   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8969 }
8970
8971
8972 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8973        Scheck_coding_systems_region, 3, 3, 0,
8974        doc: /* Check if the region is encodable by coding systems.
8975
8976 START and END are buffer positions specifying the region.
8977 CODING-SYSTEM-LIST is a list of coding systems to check.
8978
8979 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8980 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8981 whole region, POS0, POS1, ... are buffer positions where non-encodable
8982 characters are found.
8983
8984 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8985 value is nil.
8986
8987 START may be a string.  In that case, check if the string is
8988 encodable, and the value contains indices to the string instead of
8989 buffer positions.  END is ignored.
8990
8991 If the current buffer (or START if it is a string) is unibyte, the value
8992 is nil.  */)
8993      (start, end, coding_system_list)
8994      Lisp_Object start, end, coding_system_list;
8995 {
8996   Lisp_Object list;
8997   EMACS_INT start_byte, end_byte;
8998   int pos;
8999   const unsigned char *p, *pbeg, *pend;
9000   int c;
9001   Lisp_Object tail, elt, attrs;
9002
9003   if (STRINGP (start))
9004     {
9005       if (!STRING_MULTIBYTE (start)
9006           || SCHARS (start) == SBYTES (start))
9007         return Qnil;
9008       start_byte = 0;
9009       end_byte = SBYTES (start);
9010       pos = 0;
9011     }
9012   else
9013     {
9014       CHECK_NUMBER_COERCE_MARKER (start);
9015       CHECK_NUMBER_COERCE_MARKER (end);
9016       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9017         args_out_of_range (start, end);
9018       if (NILP (current_buffer->enable_multibyte_characters))
9019         return Qnil;
9020       start_byte = CHAR_TO_BYTE (XINT (start));
9021       end_byte = CHAR_TO_BYTE (XINT (end));
9022       if (XINT (end) - XINT (start) == end_byte - start_byte)
9023         return Qnil;
9024
9025       if (XINT (start) < GPT && XINT (end) > GPT)
9026         {
9027           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9028             move_gap_both (XINT (start), start_byte);
9029           else
9030             move_gap_both (XINT (end), end_byte);
9031         }
9032       pos = XINT (start);
9033     }
9034
9035   list = Qnil;
9036   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9037     {
9038       elt = XCAR (tail);
9039       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9040       ASET (attrs, coding_attr_trans_tbl,
9041             get_translation_table (attrs, 1, NULL));
9042       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9043     }
9044
9045   if (STRINGP (start))
9046     p = pbeg = SDATA (start);
9047   else
9048     p = pbeg = BYTE_POS_ADDR (start_byte);
9049   pend = p + (end_byte - start_byte);
9050
9051   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9052   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9053
9054   while (p < pend)
9055     {
9056       if (ASCII_BYTE_P (*p))
9057         p++;
9058       else
9059         {
9060           c = STRING_CHAR_ADVANCE (p);
9061
9062           charset_map_loaded = 0;
9063           for (tail = list; CONSP (tail); tail = XCDR (tail))
9064             {
9065               elt = XCDR (XCAR (tail));
9066               if (! char_encodable_p (c, XCAR (elt)))
9067                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9068             }
9069           if (charset_map_loaded)
9070             {
9071               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
9072
9073               if (STRINGP (start))
9074                 pbeg = SDATA (start);
9075               else
9076                 pbeg = BYTE_POS_ADDR (start_byte);
9077               p = pbeg + p_offset;
9078               pend = pbeg + pend_offset;
9079             }
9080         }
9081       pos++;
9082     }
9083
9084   tail = list;
9085   list = Qnil;
9086   for (; CONSP (tail); tail = XCDR (tail))
9087     {
9088       elt = XCAR (tail);
9089       if (CONSP (XCDR (XCDR (elt))))
9090         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9091                       list);
9092     }
9093
9094   return list;
9095 }
9096
9097
9098 Lisp_Object
9099 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
9100      Lisp_Object start, end, coding_system, dst_object;
9101      int encodep, norecord;
9102 {
9103   struct coding_system coding;
9104   EMACS_INT from, from_byte, to, to_byte;
9105   Lisp_Object src_object;
9106
9107   CHECK_NUMBER_COERCE_MARKER (start);
9108   CHECK_NUMBER_COERCE_MARKER (end);
9109   if (NILP (coding_system))
9110     coding_system = Qno_conversion;
9111   else
9112     CHECK_CODING_SYSTEM (coding_system);
9113   src_object = Fcurrent_buffer ();
9114   if (NILP (dst_object))
9115     dst_object = src_object;
9116   else if (! EQ (dst_object, Qt))
9117     CHECK_BUFFER (dst_object);
9118
9119   validate_region (&start, &end);
9120   from = XFASTINT (start);
9121   from_byte = CHAR_TO_BYTE (from);
9122   to = XFASTINT (end);
9123   to_byte = CHAR_TO_BYTE (to);
9124
9125   setup_coding_system (coding_system, &coding);
9126   coding.mode |= CODING_MODE_LAST_BLOCK;
9127
9128   if (encodep)
9129     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9130                           dst_object);
9131   else
9132     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9133                           dst_object);
9134   if (! norecord)
9135     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9136
9137   return (BUFFERP (dst_object)
9138           ? make_number (coding.produced_char)
9139           : coding.dst_object);
9140 }
9141
9142
9143 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9144        3, 4, "r\nzCoding system: ",
9145        doc: /* Decode the current region from the specified coding system.
9146 When called from a program, takes four arguments:
9147         START, END, CODING-SYSTEM, and DESTINATION.
9148 START and END are buffer positions.
9149
9150 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9151 If nil, the region between START and END is replaced by the decoded text.
9152 If buffer, the decoded text is inserted in that buffer after point (point
9153 does not move).
9154 In those cases, the length of the decoded text is returned.
9155 If DESTINATION is t, the decoded text is returned.
9156
9157 This function sets `last-coding-system-used' to the precise coding system
9158 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9159 not fully specified.)  */)
9160      (start, end, coding_system, destination)
9161      Lisp_Object start, end, coding_system, destination;
9162 {
9163   return code_convert_region (start, end, coding_system, destination, 0, 0);
9164 }
9165
9166 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9167        3, 4, "r\nzCoding system: ",
9168        doc: /* Encode the current region by specified coding system.
9169 When called from a program, takes four arguments:
9170         START, END, CODING-SYSTEM and DESTINATION.
9171 START and END are buffer positions.
9172
9173 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9174 If nil, the region between START and END is replace by the encoded text.
9175 If buffer, the encoded text is inserted in that buffer after point (point
9176 does not move).
9177 In those cases, the length of the encoded text is returned.
9178 If DESTINATION is t, the encoded text is returned.
9179
9180 This function sets `last-coding-system-used' to the precise coding system
9181 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9182 not fully specified.)  */)
9183   (start, end, coding_system, destination)
9184      Lisp_Object start, end, coding_system, destination;
9185 {
9186   return code_convert_region (start, end, coding_system, destination, 1, 0);
9187 }
9188
9189 Lisp_Object
9190 code_convert_string (string, coding_system, dst_object,
9191                      encodep, nocopy, norecord)
9192      Lisp_Object string, coding_system, dst_object;
9193      int encodep, nocopy, norecord;
9194 {
9195   struct coding_system coding;
9196   EMACS_INT chars, bytes;
9197
9198   CHECK_STRING (string);
9199   if (NILP (coding_system))
9200     {
9201       if (! norecord)
9202         Vlast_coding_system_used = Qno_conversion;
9203       if (NILP (dst_object))
9204         return (nocopy ? Fcopy_sequence (string) : string);
9205     }
9206
9207   if (NILP (coding_system))
9208     coding_system = Qno_conversion;
9209   else
9210     CHECK_CODING_SYSTEM (coding_system);
9211   if (NILP (dst_object))
9212     dst_object = Qt;
9213   else if (! EQ (dst_object, Qt))
9214     CHECK_BUFFER (dst_object);
9215
9216   setup_coding_system (coding_system, &coding);
9217   coding.mode |= CODING_MODE_LAST_BLOCK;
9218   chars = SCHARS (string);
9219   bytes = SBYTES (string);
9220   if (encodep)
9221     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9222   else
9223     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9224   if (! norecord)
9225     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9226
9227   return (BUFFERP (dst_object)
9228           ? make_number (coding.produced_char)
9229           : coding.dst_object);
9230 }
9231
9232
9233 /* Encode or decode STRING according to CODING_SYSTEM.
9234    Do not set Vlast_coding_system_used.
9235
9236    This function is called only from macros DECODE_FILE and
9237    ENCODE_FILE, thus we ignore character composition.  */
9238
9239 Lisp_Object
9240 code_convert_string_norecord (string, coding_system, encodep)
9241      Lisp_Object string, coding_system;
9242      int encodep;
9243 {
9244   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9245 }
9246
9247
9248 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9249        2, 4, 0,
9250        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9251
9252 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9253 if the decoding operation is trivial.
9254
9255 Optional fourth arg BUFFER non-nil means that the decoded text is
9256 inserted in that buffer after point (point does not move).  In this
9257 case, the return value is the length of the decoded text.
9258
9259 This function sets `last-coding-system-used' to the precise coding system
9260 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9261 not fully specified.)  */)
9262   (string, coding_system, nocopy, buffer)
9263      Lisp_Object string, coding_system, nocopy, buffer;
9264 {
9265   return code_convert_string (string, coding_system, buffer,
9266                               0, ! NILP (nocopy), 0);
9267 }
9268
9269 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9270        2, 4, 0,
9271        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9272
9273 Optional third arg NOCOPY non-nil means it is OK to return STRING
9274 itself if the encoding operation is trivial.
9275
9276 Optional fourth arg BUFFER non-nil means that the encoded text is
9277 inserted in that buffer after point (point does not move).  In this
9278 case, the return value is the length of the encoded text.
9279
9280 This function sets `last-coding-system-used' to the precise coding system
9281 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9282 not fully specified.)  */)
9283      (string, coding_system, nocopy, buffer)
9284      Lisp_Object string, coding_system, nocopy, buffer;
9285 {
9286   return code_convert_string (string, coding_system, buffer,
9287                               1, ! NILP (nocopy), 1);
9288 }
9289
9290 \f
9291 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9292        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9293 Return the corresponding character.  */)
9294      (code)
9295      Lisp_Object code;
9296 {
9297   Lisp_Object spec, attrs, val;
9298   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9299   int c;
9300
9301   CHECK_NATNUM (code);
9302   c = XFASTINT (code);
9303   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9304   attrs = AREF (spec, 0);
9305
9306   if (ASCII_BYTE_P (c)
9307       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9308     return code;
9309
9310   val = CODING_ATTR_CHARSET_LIST (attrs);
9311   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9312   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9313   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9314
9315   if (c <= 0x7F)
9316     charset = charset_roman;
9317   else if (c >= 0xA0 && c < 0xDF)
9318     {
9319       charset = charset_kana;
9320       c -= 0x80;
9321     }
9322   else
9323     {
9324       int s1 = c >> 8, s2 = c & 0xFF;
9325
9326       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9327           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9328         error ("Invalid code: %d", code);
9329       SJIS_TO_JIS (c);
9330       charset = charset_kanji;
9331     }
9332   c = DECODE_CHAR (charset, c);
9333   if (c < 0)
9334     error ("Invalid code: %d", code);
9335   return make_number (c);
9336 }
9337
9338
9339 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9340        doc: /* Encode a Japanese character CH to shift_jis encoding.
9341 Return the corresponding code in SJIS.  */)
9342      (ch)
9343     Lisp_Object ch;
9344 {
9345   Lisp_Object spec, attrs, charset_list;
9346   int c;
9347   struct charset *charset;
9348   unsigned code;
9349
9350   CHECK_CHARACTER (ch);
9351   c = XFASTINT (ch);
9352   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9353   attrs = AREF (spec, 0);
9354
9355   if (ASCII_CHAR_P (c)
9356       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9357     return ch;
9358
9359   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9360   charset = char_charset (c, charset_list, &code);
9361   if (code == CHARSET_INVALID_CODE (charset))
9362     error ("Can't encode by shift_jis encoding: %d", c);
9363   JIS_TO_SJIS (code);
9364
9365   return make_number (code);
9366 }
9367
9368 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9369        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9370 Return the corresponding character.  */)
9371      (code)
9372      Lisp_Object code;
9373 {
9374   Lisp_Object spec, attrs, val;
9375   struct charset *charset_roman, *charset_big5, *charset;
9376   int c;
9377
9378   CHECK_NATNUM (code);
9379   c = XFASTINT (code);
9380   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9381   attrs = AREF (spec, 0);
9382
9383   if (ASCII_BYTE_P (c)
9384       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9385     return code;
9386
9387   val = CODING_ATTR_CHARSET_LIST (attrs);
9388   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9389   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9390
9391   if (c <= 0x7F)
9392     charset = charset_roman;
9393   else
9394     {
9395       int b1 = c >> 8, b2 = c & 0x7F;
9396       if (b1 < 0xA1 || b1 > 0xFE
9397           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9398         error ("Invalid code: %d", code);
9399       charset = charset_big5;
9400     }
9401   c = DECODE_CHAR (charset, (unsigned )c);
9402   if (c < 0)
9403     error ("Invalid code: %d", code);
9404   return make_number (c);
9405 }
9406
9407 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9408        doc: /* Encode the Big5 character CH to BIG5 coding system.
9409 Return the corresponding character code in Big5.  */)
9410      (ch)
9411      Lisp_Object ch;
9412 {
9413   Lisp_Object spec, attrs, charset_list;
9414   struct charset *charset;
9415   int c;
9416   unsigned code;
9417
9418   CHECK_CHARACTER (ch);
9419   c = XFASTINT (ch);
9420   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9421   attrs = AREF (spec, 0);
9422   if (ASCII_CHAR_P (c)
9423       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9424     return ch;
9425
9426   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9427   charset = char_charset (c, charset_list, &code);
9428   if (code == CHARSET_INVALID_CODE (charset))
9429     error ("Can't encode by Big5 encoding: %d", c);
9430
9431   return make_number (code);
9432 }
9433
9434 \f
9435 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9436        Sset_terminal_coding_system_internal, 1, 2, 0,
9437        doc: /* Internal use only.  */)
9438      (coding_system, terminal)
9439      Lisp_Object coding_system;
9440      Lisp_Object terminal;
9441 {
9442   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9443   CHECK_SYMBOL (coding_system);
9444   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9445   /* We had better not send unsafe characters to terminal.  */
9446   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9447   /* Characer composition should be disabled.  */
9448   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9449   terminal_coding->src_multibyte = 1;
9450   terminal_coding->dst_multibyte = 0;
9451   return Qnil;
9452 }
9453
9454 DEFUN ("set-safe-terminal-coding-system-internal",
9455        Fset_safe_terminal_coding_system_internal,
9456        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9457        doc: /* Internal use only.  */)
9458      (coding_system)
9459      Lisp_Object coding_system;
9460 {
9461   CHECK_SYMBOL (coding_system);
9462   setup_coding_system (Fcheck_coding_system (coding_system),
9463                        &safe_terminal_coding);
9464   /* Characer composition should be disabled.  */
9465   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9466   safe_terminal_coding.src_multibyte = 1;
9467   safe_terminal_coding.dst_multibyte = 0;
9468   return Qnil;
9469 }
9470
9471 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9472        Sterminal_coding_system, 0, 1, 0,
9473        doc: /* Return coding system specified for terminal output on the given terminal.
9474 TERMINAL may be a terminal object, a frame, or nil for the selected
9475 frame's terminal device.  */)
9476      (terminal)
9477      Lisp_Object terminal;
9478 {
9479   struct coding_system *terminal_coding
9480     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9481   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9482
9483   /* For backward compatibility, return nil if it is `undecided'. */
9484   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9485 }
9486
9487 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9488        Sset_keyboard_coding_system_internal, 1, 2, 0,
9489        doc: /* Internal use only.  */)
9490      (coding_system, terminal)
9491      Lisp_Object coding_system;
9492      Lisp_Object terminal;
9493 {
9494   struct terminal *t = get_terminal (terminal, 1);
9495   CHECK_SYMBOL (coding_system);
9496   if (NILP (coding_system))
9497     coding_system = Qno_conversion;
9498   else
9499     Fcheck_coding_system (coding_system);
9500   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9501   /* Characer composition should be disabled.  */
9502   TERMINAL_KEYBOARD_CODING (t)->common_flags
9503     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9504   return Qnil;
9505 }
9506
9507 DEFUN ("keyboard-coding-system",
9508        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9509        doc: /* Return coding system specified for decoding keyboard input.  */)
9510      (terminal)
9511      Lisp_Object terminal;
9512 {
9513   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9514                          (get_terminal (terminal, 1))->id);
9515 }
9516
9517 \f
9518 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9519        Sfind_operation_coding_system,  1, MANY, 0,
9520        doc: /* Choose a coding system for an operation based on the target name.
9521 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9522 DECODING-SYSTEM is the coding system to use for decoding
9523 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9524 for encoding (in case OPERATION does encoding).
9525
9526 The first argument OPERATION specifies an I/O primitive:
9527   For file I/O, `insert-file-contents' or `write-region'.
9528   For process I/O, `call-process', `call-process-region', or `start-process'.
9529   For network I/O, `open-network-stream'.
9530
9531 The remaining arguments should be the same arguments that were passed
9532 to the primitive.  Depending on which primitive, one of those arguments
9533 is selected as the TARGET.  For example, if OPERATION does file I/O,
9534 whichever argument specifies the file name is TARGET.
9535
9536 TARGET has a meaning which depends on OPERATION:
9537   For file I/O, TARGET is a file name (except for the special case below).
9538   For process I/O, TARGET is a process name.
9539   For network I/O, TARGET is a service name or a port number.
9540
9541 This function looks up what is specified for TARGET in
9542 `file-coding-system-alist', `process-coding-system-alist',
9543 or `network-coding-system-alist' depending on OPERATION.
9544 They may specify a coding system, a cons of coding systems,
9545 or a function symbol to call.
9546 In the last case, we call the function with one argument,
9547 which is a list of all the arguments given to this function.
9548 If the function can't decide a coding system, it can return
9549 `undecided' so that the normal code-detection is performed.
9550
9551 If OPERATION is `insert-file-contents', the argument corresponding to
9552 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9553 file name to look up, and BUFFER is a buffer that contains the file's
9554 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9555 function to call for FILENAME, that function should examine the
9556 contents of BUFFER instead of reading the file.
9557
9558 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9559      (nargs, args)
9560      int nargs;
9561      Lisp_Object *args;
9562 {
9563   Lisp_Object operation, target_idx, target, val;
9564   register Lisp_Object chain;
9565
9566   if (nargs < 2)
9567     error ("Too few arguments");
9568   operation = args[0];
9569   if (!SYMBOLP (operation)
9570       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9571     error ("Invalid first argument");
9572   if (nargs < 1 + XINT (target_idx))
9573     error ("Too few arguments for operation: %s",
9574            SDATA (SYMBOL_NAME (operation)));
9575   target = args[XINT (target_idx) + 1];
9576   if (!(STRINGP (target)
9577         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9578             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9579         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9580     error ("Invalid %dth argument", XINT (target_idx) + 1);
9581   if (CONSP (target))
9582     target = XCAR (target);
9583
9584   chain = ((EQ (operation, Qinsert_file_contents)
9585             || EQ (operation, Qwrite_region))
9586            ? Vfile_coding_system_alist
9587            : (EQ (operation, Qopen_network_stream)
9588               ? Vnetwork_coding_system_alist
9589               : Vprocess_coding_system_alist));
9590   if (NILP (chain))
9591     return Qnil;
9592
9593   for (; CONSP (chain); chain = XCDR (chain))
9594     {
9595       Lisp_Object elt;
9596
9597       elt = XCAR (chain);
9598       if (CONSP (elt)
9599           && ((STRINGP (target)
9600                && STRINGP (XCAR (elt))
9601                && fast_string_match (XCAR (elt), target) >= 0)
9602               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9603         {
9604           val = XCDR (elt);
9605           /* Here, if VAL is both a valid coding system and a valid
9606              function symbol, we return VAL as a coding system.  */
9607           if (CONSP (val))
9608             return val;
9609           if (! SYMBOLP (val))
9610             return Qnil;
9611           if (! NILP (Fcoding_system_p (val)))
9612             return Fcons (val, val);
9613           if (! NILP (Ffboundp (val)))
9614             {
9615               /* We use call1 rather than safe_call1
9616                  so as to get bug reports about functions called here
9617                  which don't handle the current interface.  */
9618               val = call1 (val, Flist (nargs, args));
9619               if (CONSP (val))
9620                 return val;
9621               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9622                 return Fcons (val, val);
9623             }
9624           return Qnil;
9625         }
9626     }
9627   return Qnil;
9628 }
9629
9630 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9631        Sset_coding_system_priority, 0, MANY, 0,
9632        doc: /* Assign higher priority to the coding systems given as arguments.
9633 If multiple coding systems belong to the same category,
9634 all but the first one are ignored.
9635
9636 usage: (set-coding-system-priority &rest coding-systems)  */)
9637      (nargs, args)
9638      int nargs;
9639      Lisp_Object *args;
9640 {
9641   int i, j;
9642   int changed[coding_category_max];
9643   enum coding_category priorities[coding_category_max];
9644
9645   bzero (changed, sizeof changed);
9646
9647   for (i = j = 0; i < nargs; i++)
9648     {
9649       enum coding_category category;
9650       Lisp_Object spec, attrs;
9651
9652       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9653       attrs = AREF (spec, 0);
9654       category = XINT (CODING_ATTR_CATEGORY (attrs));
9655       if (changed[category])
9656         /* Ignore this coding system because a coding system of the
9657            same category already had a higher priority.  */
9658         continue;
9659       changed[category] = 1;
9660       priorities[j++] = category;
9661       if (coding_categories[category].id >= 0
9662           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9663         setup_coding_system (args[i], &coding_categories[category]);
9664       Fset (AREF (Vcoding_category_table, category), args[i]);
9665     }
9666
9667   /* Now we have decided top J priorities.  Reflect the order of the
9668      original priorities to the remaining priorities.  */
9669
9670   for (i = j, j = 0; i < coding_category_max; i++, j++)
9671     {
9672       while (j < coding_category_max
9673              && changed[coding_priorities[j]])
9674         j++;
9675       if (j == coding_category_max)
9676         abort ();
9677       priorities[i] = coding_priorities[j];
9678     }
9679
9680   bcopy (priorities, coding_priorities, sizeof priorities);
9681
9682   /* Update `coding-category-list'.  */
9683   Vcoding_category_list = Qnil;
9684   for (i = coding_category_max - 1; i >= 0; i--)
9685     Vcoding_category_list
9686       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9687                Vcoding_category_list);
9688
9689   return Qnil;
9690 }
9691
9692 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9693        Scoding_system_priority_list, 0, 1, 0,
9694        doc: /* Return a list of coding systems ordered by their priorities.
9695 The list contains a subset of coding systems; i.e. coding systems
9696 assigned to each coding category (see `coding-category-list').
9697
9698 HIGHESTP non-nil means just return the highest priority one.  */)
9699      (highestp)
9700      Lisp_Object highestp;
9701 {
9702   int i;
9703   Lisp_Object val;
9704
9705   for (i = 0, val = Qnil; i < coding_category_max; i++)
9706     {
9707       enum coding_category category = coding_priorities[i];
9708       int id = coding_categories[category].id;
9709       Lisp_Object attrs;
9710
9711       if (id < 0)
9712         continue;
9713       attrs = CODING_ID_ATTRS (id);
9714       if (! NILP (highestp))
9715         return CODING_ATTR_BASE_NAME (attrs);
9716       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9717     }
9718   return Fnreverse (val);
9719 }
9720
9721 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9722
9723 static Lisp_Object
9724 make_subsidiaries (base)
9725      Lisp_Object base;
9726 {
9727   Lisp_Object subsidiaries;
9728   int base_name_len = SBYTES (SYMBOL_NAME (base));
9729   char *buf = (char *) alloca (base_name_len + 6);
9730   int i;
9731
9732   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9733   subsidiaries = Fmake_vector (make_number (3), Qnil);
9734   for (i = 0; i < 3; i++)
9735     {
9736       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9737       ASET (subsidiaries, i, intern (buf));
9738     }
9739   return subsidiaries;
9740 }
9741
9742
9743 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9744        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9745        doc: /* For internal use only.
9746 usage: (define-coding-system-internal ...)  */)
9747      (nargs, args)
9748      int nargs;
9749      Lisp_Object *args;
9750 {
9751   Lisp_Object name;
9752   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9753   Lisp_Object attrs;            /* Vector of attributes.  */
9754   Lisp_Object eol_type;
9755   Lisp_Object aliases;
9756   Lisp_Object coding_type, charset_list, safe_charsets;
9757   enum coding_category category;
9758   Lisp_Object tail, val;
9759   int max_charset_id = 0;
9760   int i;
9761
9762   if (nargs < coding_arg_max)
9763     goto short_args;
9764
9765   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9766
9767   name = args[coding_arg_name];
9768   CHECK_SYMBOL (name);
9769   CODING_ATTR_BASE_NAME (attrs) = name;
9770
9771   val = args[coding_arg_mnemonic];
9772   if (! STRINGP (val))
9773     CHECK_CHARACTER (val);
9774   CODING_ATTR_MNEMONIC (attrs) = val;
9775
9776   coding_type = args[coding_arg_coding_type];
9777   CHECK_SYMBOL (coding_type);
9778   CODING_ATTR_TYPE (attrs) = coding_type;
9779
9780   charset_list = args[coding_arg_charset_list];
9781   if (SYMBOLP (charset_list))
9782     {
9783       if (EQ (charset_list, Qiso_2022))
9784         {
9785           if (! EQ (coding_type, Qiso_2022))
9786             error ("Invalid charset-list");
9787           charset_list = Viso_2022_charset_list;
9788         }
9789       else if (EQ (charset_list, Qemacs_mule))
9790         {
9791           if (! EQ (coding_type, Qemacs_mule))
9792             error ("Invalid charset-list");
9793           charset_list = Vemacs_mule_charset_list;
9794         }
9795       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9796         if (max_charset_id < XFASTINT (XCAR (tail)))
9797           max_charset_id = XFASTINT (XCAR (tail));
9798     }
9799   else
9800     {
9801       charset_list = Fcopy_sequence (charset_list);
9802       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9803         {
9804           struct charset *charset;
9805
9806           val = XCAR (tail);
9807           CHECK_CHARSET_GET_CHARSET (val, charset);
9808           if (EQ (coding_type, Qiso_2022)
9809               ? CHARSET_ISO_FINAL (charset) < 0
9810               : EQ (coding_type, Qemacs_mule)
9811               ? CHARSET_EMACS_MULE_ID (charset) < 0
9812               : 0)
9813             error ("Can't handle charset `%s'",
9814                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9815
9816           XSETCAR (tail, make_number (charset->id));
9817           if (max_charset_id < charset->id)
9818             max_charset_id = charset->id;
9819         }
9820     }
9821   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9822
9823   safe_charsets = make_uninit_string (max_charset_id + 1);
9824   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9825   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9826     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9827   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9828
9829   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9830
9831   val = args[coding_arg_decode_translation_table];
9832   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9833     CHECK_SYMBOL (val);
9834   CODING_ATTR_DECODE_TBL (attrs) = val;
9835
9836   val = args[coding_arg_encode_translation_table];
9837   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9838     CHECK_SYMBOL (val);
9839   CODING_ATTR_ENCODE_TBL (attrs) = val;
9840
9841   val = args[coding_arg_post_read_conversion];
9842   CHECK_SYMBOL (val);
9843   CODING_ATTR_POST_READ (attrs) = val;
9844
9845   val = args[coding_arg_pre_write_conversion];
9846   CHECK_SYMBOL (val);
9847   CODING_ATTR_PRE_WRITE (attrs) = val;
9848
9849   val = args[coding_arg_default_char];
9850   if (NILP (val))
9851     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9852   else
9853     {
9854       CHECK_CHARACTER (val);
9855       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9856     }
9857
9858   val = args[coding_arg_for_unibyte];
9859   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9860
9861   val = args[coding_arg_plist];
9862   CHECK_LIST (val);
9863   CODING_ATTR_PLIST (attrs) = val;
9864
9865   if (EQ (coding_type, Qcharset))
9866     {
9867       /* Generate a lisp vector of 256 elements.  Each element is nil,
9868          integer, or a list of charset IDs.
9869
9870          If Nth element is nil, the byte code N is invalid in this
9871          coding system.
9872
9873          If Nth element is a number NUM, N is the first byte of a
9874          charset whose ID is NUM.
9875
9876          If Nth element is a list of charset IDs, N is the first byte
9877          of one of them.  The list is sorted by dimensions of the
9878          charsets.  A charset of smaller dimension comes firtst. */
9879       val = Fmake_vector (make_number (256), Qnil);
9880
9881       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9882         {
9883           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9884           int dim = CHARSET_DIMENSION (charset);
9885           int idx = (dim - 1) * 4;
9886
9887           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9888             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9889
9890           for (i = charset->code_space[idx];
9891                i <= charset->code_space[idx + 1]; i++)
9892             {
9893               Lisp_Object tmp, tmp2;
9894               int dim2;
9895
9896               tmp = AREF (val, i);
9897               if (NILP (tmp))
9898                 tmp = XCAR (tail);
9899               else if (NUMBERP (tmp))
9900                 {
9901                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9902                   if (dim < dim2)
9903                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9904                   else
9905                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9906                 }
9907               else
9908                 {
9909                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9910                     {
9911                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9912                       if (dim < dim2)
9913                         break;
9914                     }
9915                   if (NILP (tmp2))
9916                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9917                   else
9918                     {
9919                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9920                       XSETCAR (tmp2, XCAR (tail));
9921                     }
9922                 }
9923               ASET (val, i, tmp);
9924             }
9925         }
9926       ASET (attrs, coding_attr_charset_valids, val);
9927       category = coding_category_charset;
9928     }
9929   else if (EQ (coding_type, Qccl))
9930     {
9931       Lisp_Object valids;
9932
9933       if (nargs < coding_arg_ccl_max)
9934         goto short_args;
9935
9936       val = args[coding_arg_ccl_decoder];
9937       CHECK_CCL_PROGRAM (val);
9938       if (VECTORP (val))
9939         val = Fcopy_sequence (val);
9940       ASET (attrs, coding_attr_ccl_decoder, val);
9941
9942       val = args[coding_arg_ccl_encoder];
9943       CHECK_CCL_PROGRAM (val);
9944       if (VECTORP (val))
9945         val = Fcopy_sequence (val);
9946       ASET (attrs, coding_attr_ccl_encoder, val);
9947
9948       val = args[coding_arg_ccl_valids];
9949       valids = Fmake_string (make_number (256), make_number (0));
9950       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9951         {
9952           int from, to;
9953
9954           val = Fcar (tail);
9955           if (INTEGERP (val))
9956             {
9957               from = to = XINT (val);
9958               if (from < 0 || from > 255)
9959                 args_out_of_range_3 (val, make_number (0), make_number (255));
9960             }
9961           else
9962             {
9963               CHECK_CONS (val);
9964               CHECK_NATNUM_CAR (val);
9965               CHECK_NATNUM_CDR (val);
9966               from = XINT (XCAR (val));
9967               if (from > 255)
9968                 args_out_of_range_3 (XCAR (val),
9969                                      make_number (0), make_number (255));
9970               to = XINT (XCDR (val));
9971               if (to < from || to > 255)
9972                 args_out_of_range_3 (XCDR (val),
9973                                      XCAR (val), make_number (255));
9974             }
9975           for (i = from; i <= to; i++)
9976             SSET (valids, i, 1);
9977         }
9978       ASET (attrs, coding_attr_ccl_valids, valids);
9979
9980       category = coding_category_ccl;
9981     }
9982   else if (EQ (coding_type, Qutf_16))
9983     {
9984       Lisp_Object bom, endian;
9985
9986       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9987
9988       if (nargs < coding_arg_utf16_max)
9989         goto short_args;
9990
9991       bom = args[coding_arg_utf16_bom];
9992       if (! NILP (bom) && ! EQ (bom, Qt))
9993         {
9994           CHECK_CONS (bom);
9995           val = XCAR (bom);
9996           CHECK_CODING_SYSTEM (val);
9997           val = XCDR (bom);
9998           CHECK_CODING_SYSTEM (val);
9999         }
10000       ASET (attrs, coding_attr_utf_bom, bom);
10001
10002       endian = args[coding_arg_utf16_endian];
10003       CHECK_SYMBOL (endian);
10004       if (NILP (endian))
10005         endian = Qbig;
10006       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10007         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10008       ASET (attrs, coding_attr_utf_16_endian, endian);
10009
10010       category = (CONSP (bom)
10011                   ? coding_category_utf_16_auto
10012                   : NILP (bom)
10013                   ? (EQ (endian, Qbig)
10014                      ? coding_category_utf_16_be_nosig
10015                      : coding_category_utf_16_le_nosig)
10016                   : (EQ (endian, Qbig)
10017                      ? coding_category_utf_16_be
10018                      : coding_category_utf_16_le));
10019     }
10020   else if (EQ (coding_type, Qiso_2022))
10021     {
10022       Lisp_Object initial, reg_usage, request, flags;
10023       int i;
10024
10025       if (nargs < coding_arg_iso2022_max)
10026         goto short_args;
10027
10028       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10029       CHECK_VECTOR (initial);
10030       for (i = 0; i < 4; i++)
10031         {
10032           val = Faref (initial, make_number (i));
10033           if (! NILP (val))
10034             {
10035               struct charset *charset;
10036
10037               CHECK_CHARSET_GET_CHARSET (val, charset);
10038               ASET (initial, i, make_number (CHARSET_ID (charset)));
10039               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10040                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10041             }
10042           else
10043             ASET (initial, i, make_number (-1));
10044         }
10045
10046       reg_usage = args[coding_arg_iso2022_reg_usage];
10047       CHECK_CONS (reg_usage);
10048       CHECK_NUMBER_CAR (reg_usage);
10049       CHECK_NUMBER_CDR (reg_usage);
10050
10051       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10052       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
10053         {
10054           int id;
10055           Lisp_Object tmp;
10056
10057           val = Fcar (tail);
10058           CHECK_CONS (val);
10059           tmp = XCAR (val);
10060           CHECK_CHARSET_GET_ID (tmp, id);
10061           CHECK_NATNUM_CDR (val);
10062           if (XINT (XCDR (val)) >= 4)
10063             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
10064           XSETCAR (val, make_number (id));
10065         }
10066
10067       flags = args[coding_arg_iso2022_flags];
10068       CHECK_NATNUM (flags);
10069       i = XINT (flags);
10070       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10071         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
10072
10073       ASET (attrs, coding_attr_iso_initial, initial);
10074       ASET (attrs, coding_attr_iso_usage, reg_usage);
10075       ASET (attrs, coding_attr_iso_request, request);
10076       ASET (attrs, coding_attr_iso_flags, flags);
10077       setup_iso_safe_charsets (attrs);
10078
10079       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10080         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10081                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10082                     ? coding_category_iso_7_else
10083                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10084                     ? coding_category_iso_7
10085                     : coding_category_iso_7_tight);
10086       else
10087         {
10088           int id = XINT (AREF (initial, 1));
10089
10090           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10091                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10092                        || id < 0)
10093                       ? coding_category_iso_8_else
10094                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10095                       ? coding_category_iso_8_1
10096                       : coding_category_iso_8_2);
10097         }
10098       if (category != coding_category_iso_8_1
10099           && category != coding_category_iso_8_2)
10100         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
10101     }
10102   else if (EQ (coding_type, Qemacs_mule))
10103     {
10104       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10105         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10106       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10107       category = coding_category_emacs_mule;
10108     }
10109   else if (EQ (coding_type, Qshift_jis))
10110     {
10111
10112       struct charset *charset;
10113
10114       if (XINT (Flength (charset_list)) != 3
10115           && XINT (Flength (charset_list)) != 4)
10116         error ("There should be three or four charsets");
10117
10118       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10119       if (CHARSET_DIMENSION (charset) != 1)
10120         error ("Dimension of charset %s is not one",
10121                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10122       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10123         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10124
10125       charset_list = XCDR (charset_list);
10126       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10127       if (CHARSET_DIMENSION (charset) != 1)
10128         error ("Dimension of charset %s is not one",
10129                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10130
10131       charset_list = XCDR (charset_list);
10132       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10133       if (CHARSET_DIMENSION (charset) != 2)
10134         error ("Dimension of charset %s is not two",
10135                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10136
10137       charset_list = XCDR (charset_list);
10138       if (! NILP (charset_list))
10139         {
10140           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10141           if (CHARSET_DIMENSION (charset) != 2)
10142             error ("Dimension of charset %s is not two",
10143                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10144         }
10145
10146       category = coding_category_sjis;
10147       Vsjis_coding_system = name;
10148     }
10149   else if (EQ (coding_type, Qbig5))
10150     {
10151       struct charset *charset;
10152
10153       if (XINT (Flength (charset_list)) != 2)
10154         error ("There should be just two charsets");
10155
10156       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10157       if (CHARSET_DIMENSION (charset) != 1)
10158         error ("Dimension of charset %s is not one",
10159                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10160       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10161         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10162
10163       charset_list = XCDR (charset_list);
10164       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10165       if (CHARSET_DIMENSION (charset) != 2)
10166         error ("Dimension of charset %s is not two",
10167                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10168
10169       category = coding_category_big5;
10170       Vbig5_coding_system = name;
10171     }
10172   else if (EQ (coding_type, Qraw_text))
10173     {
10174       category = coding_category_raw_text;
10175       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10176     }
10177   else if (EQ (coding_type, Qutf_8))
10178     {
10179       Lisp_Object bom;
10180
10181       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10182
10183       if (nargs < coding_arg_utf8_max)
10184         goto short_args;
10185
10186       bom = args[coding_arg_utf8_bom];
10187       if (! NILP (bom) && ! EQ (bom, Qt))
10188         {
10189           CHECK_CONS (bom);
10190           val = XCAR (bom);
10191           CHECK_CODING_SYSTEM (val);
10192           val = XCDR (bom);
10193           CHECK_CODING_SYSTEM (val);
10194         }
10195       ASET (attrs, coding_attr_utf_bom, bom);
10196
10197       category = (CONSP (bom) ? coding_category_utf_8_auto
10198                   : NILP (bom) ? coding_category_utf_8_nosig
10199                   : coding_category_utf_8_sig);
10200     }
10201   else if (EQ (coding_type, Qundecided))
10202     category = coding_category_undecided;
10203   else
10204     error ("Invalid coding system type: %s",
10205            SDATA (SYMBOL_NAME (coding_type)));
10206
10207   CODING_ATTR_CATEGORY (attrs) = make_number (category);
10208   CODING_ATTR_PLIST (attrs)
10209     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10210                                 CODING_ATTR_PLIST (attrs)));
10211   CODING_ATTR_PLIST (attrs)
10212     = Fcons (QCascii_compatible_p,
10213              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10214                     CODING_ATTR_PLIST (attrs)));
10215
10216   eol_type = args[coding_arg_eol_type];
10217   if (! NILP (eol_type)
10218       && ! EQ (eol_type, Qunix)
10219       && ! EQ (eol_type, Qdos)
10220       && ! EQ (eol_type, Qmac))
10221     error ("Invalid eol-type");
10222
10223   aliases = Fcons (name, Qnil);
10224
10225   if (NILP (eol_type))
10226     {
10227       eol_type = make_subsidiaries (name);
10228       for (i = 0; i < 3; i++)
10229         {
10230           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10231
10232           this_name = AREF (eol_type, i);
10233           this_aliases = Fcons (this_name, Qnil);
10234           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10235           this_spec = Fmake_vector (make_number (3), attrs);
10236           ASET (this_spec, 1, this_aliases);
10237           ASET (this_spec, 2, this_eol_type);
10238           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10239           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10240           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10241           if (NILP (val))
10242             Vcoding_system_alist
10243               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10244                        Vcoding_system_alist);
10245         }
10246     }
10247
10248   spec_vec = Fmake_vector (make_number (3), attrs);
10249   ASET (spec_vec, 1, aliases);
10250   ASET (spec_vec, 2, eol_type);
10251
10252   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10253   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10254   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10255   if (NILP (val))
10256     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10257                                   Vcoding_system_alist);
10258
10259   {
10260     int id = coding_categories[category].id;
10261
10262     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10263       setup_coding_system (name, &coding_categories[category]);
10264   }
10265
10266   return Qnil;
10267
10268  short_args:
10269   return Fsignal (Qwrong_number_of_arguments,
10270                   Fcons (intern ("define-coding-system-internal"),
10271                          make_number (nargs)));
10272 }
10273
10274
10275 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10276        3, 3, 0,
10277        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10278   (coding_system, prop, val)
10279      Lisp_Object coding_system, prop, val;
10280 {
10281   Lisp_Object spec, attrs;
10282
10283   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10284   attrs = AREF (spec, 0);
10285   if (EQ (prop, QCmnemonic))
10286     {
10287       if (! STRINGP (val))
10288         CHECK_CHARACTER (val);
10289       CODING_ATTR_MNEMONIC (attrs) = val;
10290     }
10291   else if (EQ (prop, QCdefault_char))
10292     {
10293       if (NILP (val))
10294         val = make_number (' ');
10295       else
10296         CHECK_CHARACTER (val);
10297       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10298     }
10299   else if (EQ (prop, QCdecode_translation_table))
10300     {
10301       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10302         CHECK_SYMBOL (val);
10303       CODING_ATTR_DECODE_TBL (attrs) = val;
10304     }
10305   else if (EQ (prop, QCencode_translation_table))
10306     {
10307       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10308         CHECK_SYMBOL (val);
10309       CODING_ATTR_ENCODE_TBL (attrs) = val;
10310     }
10311   else if (EQ (prop, QCpost_read_conversion))
10312     {
10313       CHECK_SYMBOL (val);
10314       CODING_ATTR_POST_READ (attrs) = val;
10315     }
10316   else if (EQ (prop, QCpre_write_conversion))
10317     {
10318       CHECK_SYMBOL (val);
10319       CODING_ATTR_PRE_WRITE (attrs) = val;
10320     }
10321   else if (EQ (prop, QCascii_compatible_p))
10322     {
10323       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10324     }
10325
10326   CODING_ATTR_PLIST (attrs)
10327     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10328   return val;
10329 }
10330
10331
10332 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10333        Sdefine_coding_system_alias, 2, 2, 0,
10334        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10335      (alias, coding_system)
10336      Lisp_Object alias, coding_system;
10337 {
10338   Lisp_Object spec, aliases, eol_type, val;
10339
10340   CHECK_SYMBOL (alias);
10341   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10342   aliases = AREF (spec, 1);
10343   /* ALIASES should be a list of length more than zero, and the first
10344      element is a base coding system.  Append ALIAS at the tail of the
10345      list.  */
10346   while (!NILP (XCDR (aliases)))
10347     aliases = XCDR (aliases);
10348   XSETCDR (aliases, Fcons (alias, Qnil));
10349
10350   eol_type = AREF (spec, 2);
10351   if (VECTORP (eol_type))
10352     {
10353       Lisp_Object subsidiaries;
10354       int i;
10355
10356       subsidiaries = make_subsidiaries (alias);
10357       for (i = 0; i < 3; i++)
10358         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10359                                      AREF (eol_type, i));
10360     }
10361
10362   Fputhash (alias, spec, Vcoding_system_hash_table);
10363   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10364   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10365   if (NILP (val))
10366     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10367                                   Vcoding_system_alist);
10368
10369   return Qnil;
10370 }
10371
10372 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10373        1, 1, 0,
10374        doc: /* Return the base of CODING-SYSTEM.
10375 Any alias or subsidiary coding system is not a base coding system.  */)
10376   (coding_system)
10377      Lisp_Object coding_system;
10378 {
10379   Lisp_Object spec, attrs;
10380
10381   if (NILP (coding_system))
10382     return (Qno_conversion);
10383   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10384   attrs = AREF (spec, 0);
10385   return CODING_ATTR_BASE_NAME (attrs);
10386 }
10387
10388 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10389        1, 1, 0,
10390        doc: "Return the property list of CODING-SYSTEM.")
10391      (coding_system)
10392      Lisp_Object coding_system;
10393 {
10394   Lisp_Object spec, attrs;
10395
10396   if (NILP (coding_system))
10397     coding_system = Qno_conversion;
10398   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10399   attrs = AREF (spec, 0);
10400   return CODING_ATTR_PLIST (attrs);
10401 }
10402
10403
10404 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10405        1, 1, 0,
10406        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10407      (coding_system)
10408      Lisp_Object coding_system;
10409 {
10410   Lisp_Object spec;
10411
10412   if (NILP (coding_system))
10413     coding_system = Qno_conversion;
10414   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10415   return AREF (spec, 1);
10416 }
10417
10418 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10419        Scoding_system_eol_type, 1, 1, 0,
10420        doc: /* Return eol-type of CODING-SYSTEM.
10421 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10422
10423 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10424 and CR respectively.
10425
10426 A vector value indicates that a format of end-of-line should be
10427 detected automatically.  Nth element of the vector is the subsidiary
10428 coding system whose eol-type is N.  */)
10429      (coding_system)
10430      Lisp_Object coding_system;
10431 {
10432   Lisp_Object spec, eol_type;
10433   int n;
10434
10435   if (NILP (coding_system))
10436     coding_system = Qno_conversion;
10437   if (! CODING_SYSTEM_P (coding_system))
10438     return Qnil;
10439   spec = CODING_SYSTEM_SPEC (coding_system);
10440   eol_type = AREF (spec, 2);
10441   if (VECTORP (eol_type))
10442     return Fcopy_sequence (eol_type);
10443   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10444   return make_number (n);
10445 }
10446
10447 #endif /* emacs */
10448
10449 \f
/*** 12. Postamble ***/
10451
10452 void
10453 init_coding_once ()
10454 {
10455   int i;
10456
10457   for (i = 0; i < coding_category_max; i++)
10458     {
10459       coding_categories[i].id = -1;
10460       coding_priorities[i] = i;
10461     }
10462
10463   /* ISO2022 specific initialize routine.  */
10464   for (i = 0; i < 0x20; i++)
10465     iso_code_class[i] = ISO_control_0;
10466   for (i = 0x21; i < 0x7F; i++)
10467     iso_code_class[i] = ISO_graphic_plane_0;
10468   for (i = 0x80; i < 0xA0; i++)
10469     iso_code_class[i] = ISO_control_1;
10470   for (i = 0xA1; i < 0xFF; i++)
10471     iso_code_class[i] = ISO_graphic_plane_1;
10472   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10473   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10474   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10475   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10476   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10477   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10478   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10479   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10480   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10481
10482   for (i = 0; i < 256; i++)
10483     {
10484       emacs_mule_bytes[i] = 1;
10485     }
10486   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10487   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10488   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10489   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10490 }
10491
10492 #ifdef emacs
10493
10494 void
10495 syms_of_coding ()
10496 {
10497   staticpro (&Vcoding_system_hash_table);
10498   {
10499     Lisp_Object args[2];
10500     args[0] = QCtest;
10501     args[1] = Qeq;
10502     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10503   }
10504
10505   staticpro (&Vsjis_coding_system);
10506   Vsjis_coding_system = Qnil;
10507
10508   staticpro (&Vbig5_coding_system);
10509   Vbig5_coding_system = Qnil;
10510
10511   staticpro (&Vcode_conversion_reused_workbuf);
10512   Vcode_conversion_reused_workbuf = Qnil;
10513
10514   staticpro (&Vcode_conversion_workbuf_name);
10515   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10516
10517   reused_workbuf_in_use = 0;
10518
10519   DEFSYM (Qcharset, "charset");
10520   DEFSYM (Qtarget_idx, "target-idx");
10521   DEFSYM (Qcoding_system_history, "coding-system-history");
10522   Fset (Qcoding_system_history, Qnil);
10523
10524   /* Target FILENAME is the first argument.  */
10525   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10526   /* Target FILENAME is the third argument.  */
10527   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10528
10529   DEFSYM (Qcall_process, "call-process");
10530   /* Target PROGRAM is the first argument.  */
10531   Fput (Qcall_process, Qtarget_idx, make_number (0));
10532
10533   DEFSYM (Qcall_process_region, "call-process-region");
10534   /* Target PROGRAM is the third argument.  */
10535   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10536
10537   DEFSYM (Qstart_process, "start-process");
10538   /* Target PROGRAM is the third argument.  */
10539   Fput (Qstart_process, Qtarget_idx, make_number (2));
10540
10541   DEFSYM (Qopen_network_stream, "open-network-stream");
10542   /* Target SERVICE is the fourth argument.  */
10543   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10544
10545   DEFSYM (Qcoding_system, "coding-system");
10546   DEFSYM (Qcoding_aliases, "coding-aliases");
10547
10548   DEFSYM (Qeol_type, "eol-type");
10549   DEFSYM (Qunix, "unix");
10550   DEFSYM (Qdos, "dos");
10551
10552   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10553   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10554   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10555   DEFSYM (Qdefault_char, "default-char");
10556   DEFSYM (Qundecided, "undecided");
10557   DEFSYM (Qno_conversion, "no-conversion");
10558   DEFSYM (Qraw_text, "raw-text");
10559
10560   DEFSYM (Qiso_2022, "iso-2022");
10561
10562   DEFSYM (Qutf_8, "utf-8");
10563   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10564
10565   DEFSYM (Qutf_16, "utf-16");
10566   DEFSYM (Qbig, "big");
10567   DEFSYM (Qlittle, "little");
10568
10569   DEFSYM (Qshift_jis, "shift-jis");
10570   DEFSYM (Qbig5, "big5");
10571
10572   DEFSYM (Qcoding_system_p, "coding-system-p");
10573
10574   DEFSYM (Qcoding_system_error, "coding-system-error");
10575   Fput (Qcoding_system_error, Qerror_conditions,
10576         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10577   Fput (Qcoding_system_error, Qerror_message,
10578         make_pure_c_string ("Invalid coding system"));
10579
10580   /* Intern this now in case it isn't already done.
10581      Setting this variable twice is harmless.
10582      But don't staticpro it here--that is done in alloc.c.  */
10583   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10584
10585   DEFSYM (Qtranslation_table, "translation-table");
10586   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10587   DEFSYM (Qtranslation_table_id, "translation-table-id");
10588   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10589   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10590
10591   DEFSYM (Qvalid_codes, "valid-codes");
10592
10593   DEFSYM (Qemacs_mule, "emacs-mule");
10594
10595   DEFSYM (QCcategory, ":category");
10596   DEFSYM (QCmnemonic, ":mnemonic");
10597   DEFSYM (QCdefault_char, ":default-char");
10598   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10599   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10600   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10601   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10602   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10603
10604   Vcoding_category_table
10605     = Fmake_vector (make_number (coding_category_max), Qnil);
10606   staticpro (&Vcoding_category_table);
10607   /* Followings are target of code detection.  */
10608   ASET (Vcoding_category_table, coding_category_iso_7,
10609         intern_c_string ("coding-category-iso-7"));
10610   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10611         intern_c_string ("coding-category-iso-7-tight"));
10612   ASET (Vcoding_category_table, coding_category_iso_8_1,
10613         intern_c_string ("coding-category-iso-8-1"));
10614   ASET (Vcoding_category_table, coding_category_iso_8_2,
10615         intern_c_string ("coding-category-iso-8-2"));
10616   ASET (Vcoding_category_table, coding_category_iso_7_else,
10617         intern_c_string ("coding-category-iso-7-else"));
10618   ASET (Vcoding_category_table, coding_category_iso_8_else,
10619         intern_c_string ("coding-category-iso-8-else"));
10620   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10621         intern_c_string ("coding-category-utf-8-auto"));
10622   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10623         intern_c_string ("coding-category-utf-8"));
10624   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10625         intern_c_string ("coding-category-utf-8-sig"));
10626   ASET (Vcoding_category_table, coding_category_utf_16_be,
10627         intern_c_string ("coding-category-utf-16-be"));
10628   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10629         intern_c_string ("coding-category-utf-16-auto"));
10630   ASET (Vcoding_category_table, coding_category_utf_16_le,
10631         intern_c_string ("coding-category-utf-16-le"));
10632   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10633         intern_c_string ("coding-category-utf-16-be-nosig"));
10634   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10635         intern_c_string ("coding-category-utf-16-le-nosig"));
10636   ASET (Vcoding_category_table, coding_category_charset,
10637         intern_c_string ("coding-category-charset"));
10638   ASET (Vcoding_category_table, coding_category_sjis,
10639         intern_c_string ("coding-category-sjis"));
10640   ASET (Vcoding_category_table, coding_category_big5,
10641         intern_c_string ("coding-category-big5"));
10642   ASET (Vcoding_category_table, coding_category_ccl,
10643         intern_c_string ("coding-category-ccl"));
10644   ASET (Vcoding_category_table, coding_category_emacs_mule,
10645         intern_c_string ("coding-category-emacs-mule"));
10646   /* Followings are NOT target of code detection.  */
10647   ASET (Vcoding_category_table, coding_category_raw_text,
10648         intern_c_string ("coding-category-raw-text"));
10649   ASET (Vcoding_category_table, coding_category_undecided,
10650         intern_c_string ("coding-category-undecided"));
10651
10652   DEFSYM (Qinsufficient_source, "insufficient-source");
10653   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10654   DEFSYM (Qinvalid_source, "invalid-source");
10655   DEFSYM (Qinterrupted, "interrupted");
10656   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10657   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10658
10659   defsubr (&Scoding_system_p);
10660   defsubr (&Sread_coding_system);
10661   defsubr (&Sread_non_nil_coding_system);
10662   defsubr (&Scheck_coding_system);
10663   defsubr (&Sdetect_coding_region);
10664   defsubr (&Sdetect_coding_string);
10665   defsubr (&Sfind_coding_systems_region_internal);
10666   defsubr (&Sunencodable_char_position);
10667   defsubr (&Scheck_coding_systems_region);
10668   defsubr (&Sdecode_coding_region);
10669   defsubr (&Sencode_coding_region);
10670   defsubr (&Sdecode_coding_string);
10671   defsubr (&Sencode_coding_string);
10672   defsubr (&Sdecode_sjis_char);
10673   defsubr (&Sencode_sjis_char);
10674   defsubr (&Sdecode_big5_char);
10675   defsubr (&Sencode_big5_char);
10676   defsubr (&Sset_terminal_coding_system_internal);
10677   defsubr (&Sset_safe_terminal_coding_system_internal);
10678   defsubr (&Sterminal_coding_system);
10679   defsubr (&Sset_keyboard_coding_system_internal);
10680   defsubr (&Skeyboard_coding_system);
10681   defsubr (&Sfind_operation_coding_system);
10682   defsubr (&Sset_coding_system_priority);
10683   defsubr (&Sdefine_coding_system_internal);
10684   defsubr (&Sdefine_coding_system_alias);
10685   defsubr (&Scoding_system_put);
10686   defsubr (&Scoding_system_base);
10687   defsubr (&Scoding_system_plist);
10688   defsubr (&Scoding_system_aliases);
10689   defsubr (&Scoding_system_eol_type);
10690   defsubr (&Scoding_system_priority_list);
10691
10692   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10693                doc: /* List of coding systems.
10694
10695 Do not alter the value of this variable manually.  This variable should be
10696 updated by the functions `define-coding-system' and
10697 `define-coding-system-alias'.  */);
10698   Vcoding_system_list = Qnil;
10699
10700   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10701                doc: /* Alist of coding system names.
10702 Each element is one element list of coding system name.
10703 This variable is given to `completing-read' as COLLECTION argument.
10704
10705 Do not alter the value of this variable manually.  This variable should be
10706 updated by the functions `make-coding-system' and
10707 `define-coding-system-alias'.  */);
10708   Vcoding_system_alist = Qnil;
10709
10710   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10711                doc: /* List of coding-categories (symbols) ordered by priority.
10712
10713 On detecting a coding system, Emacs tries code detection algorithms
10714 associated with each coding-category one by one in this order.  When
10715 one algorithm agrees with a byte sequence of source text, the coding
10716 system bound to the corresponding coding-category is selected.
10717
10718 Don't modify this variable directly, but use `set-coding-priority'.  */);
10719   {
10720     int i;
10721
10722     Vcoding_category_list = Qnil;
10723     for (i = coding_category_max - 1; i >= 0; i--)
10724       Vcoding_category_list
10725         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10726                  Vcoding_category_list);
10727   }
10728
10729   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10730                doc: /* Specify the coding system for read operations.
10731 It is useful to bind this variable with `let', but do not set it globally.
10732 If the value is a coding system, it is used for decoding on read operation.
10733 If not, an appropriate element is used from one of the coding system alists.
10734 There are three such tables: `file-coding-system-alist',
10735 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10736   Vcoding_system_for_read = Qnil;
10737
10738   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10739                doc: /* Specify the coding system for write operations.
10740 Programs bind this variable with `let', but you should not set it globally.
10741 If the value is a coding system, it is used for encoding of output,
10742 when writing it to a file and when sending it to a file or subprocess.
10743
10744 If this does not specify a coding system, an appropriate element
10745 is used from one of the coding system alists.
10746 There are three such tables: `file-coding-system-alist',
10747 `process-coding-system-alist', and `network-coding-system-alist'.
10748 For output to files, if the above procedure does not specify a coding system,
10749 the value of `buffer-file-coding-system' is used.  */);
10750   Vcoding_system_for_write = Qnil;
10751
10752   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10753                doc: /*
10754 Coding system used in the latest file or process I/O.  */);
10755   Vlast_coding_system_used = Qnil;
10756
10757   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10758                doc: /*
10759 Error status of the last code conversion.
10760
10761 When an error was detected in the last code conversion, this variable
10762 is set to one of the following symbols.
10763   `insufficient-source'
10764   `inconsistent-eol'
10765   `invalid-source'
10766   `interrupted'
10767   `insufficient-memory'
10768 When no error was detected, the value doesn't change.  So, to check
10769 the error status of a code conversion by this variable, you must
10770 explicitly set this variable to nil before performing code
10771 conversion.  */);
10772   Vlast_code_conversion_error = Qnil;
10773
10774   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10775                doc: /*
10776 *Non-nil means always inhibit code conversion of end-of-line format.
10777 See info node `Coding Systems' and info node `Text and Binary' concerning
10778 such conversion.  */);
10779   inhibit_eol_conversion = 0;
10780
10781   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10782                doc: /*
10783 Non-nil means process buffer inherits coding system of process output.
10784 Bind it to t if the process output is to be treated as if it were a file
10785 read from some filesystem.  */);
10786   inherit_process_coding_system = 0;
10787
10788   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10789                doc: /*
10790 Alist to decide a coding system to use for a file I/O operation.
10791 The format is ((PATTERN . VAL) ...),
10792 where PATTERN is a regular expression matching a file name,
10793 VAL is a coding system, a cons of coding systems, or a function symbol.
10794 If VAL is a coding system, it is used for both decoding and encoding
10795 the file contents.
10796 If VAL is a cons of coding systems, the car part is used for decoding,
10797 and the cdr part is used for encoding.
10798 If VAL is a function symbol, the function must return a coding system
10799 or a cons of coding systems which are used as above.  The function is
10800 called with an argument that is a list of the arguments with which
10801 `find-operation-coding-system' was called.  If the function can't decide
10802 a coding system, it can return `undecided' so that the normal
10803 code-detection is performed.
10804
10805 See also the function `find-operation-coding-system'
10806 and the variable `auto-coding-alist'.  */);
10807   Vfile_coding_system_alist = Qnil;
10808
10809   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10810                doc: /*
10811 Alist to decide a coding system to use for a process I/O operation.
10812 The format is ((PATTERN . VAL) ...),
10813 where PATTERN is a regular expression matching a program name,
10814 VAL is a coding system, a cons of coding systems, or a function symbol.
10815 If VAL is a coding system, it is used for both decoding what received
10816 from the program and encoding what sent to the program.
10817 If VAL is a cons of coding systems, the car part is used for decoding,
10818 and the cdr part is used for encoding.
10819 If VAL is a function symbol, the function must return a coding system
10820 or a cons of coding systems which are used as above.
10821
10822 See also the function `find-operation-coding-system'.  */);
10823   Vprocess_coding_system_alist = Qnil;
10824
10825   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10826                doc: /*
10827 Alist to decide a coding system to use for a network I/O operation.
10828 The format is ((PATTERN . VAL) ...),
10829 where PATTERN is a regular expression matching a network service name
10830 or is a port number to connect to,
10831 VAL is a coding system, a cons of coding systems, or a function symbol.
10832 If VAL is a coding system, it is used for both decoding what received
10833 from the network stream and encoding what sent to the network stream.
10834 If VAL is a cons of coding systems, the car part is used for decoding,
10835 and the cdr part is used for encoding.
10836 If VAL is a function symbol, the function must return a coding system
10837 or a cons of coding systems which are used as above.
10838
10839 See also the function `find-operation-coding-system'.  */);
10840   Vnetwork_coding_system_alist = Qnil;
10841
10842   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10843                doc: /* Coding system to use with system messages.
10844 Also used for decoding keyboard input on X Window system.  */);
10845   Vlocale_coding_system = Qnil;
10846
10847   /* The eol mnemonics are reset in startup.el system-dependently.  */
10848   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10849                doc: /*
10850 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10851   eol_mnemonic_unix = make_pure_c_string (":");
10852
10853   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10854                doc: /*
10855 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10856   eol_mnemonic_dos = make_pure_c_string ("\\");
10857
10858   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10859                doc: /*
10860 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10861   eol_mnemonic_mac = make_pure_c_string ("/");
10862
10863   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10864                doc: /*
10865 *String displayed in mode line when end-of-line format is not yet determined.  */);
10866   eol_mnemonic_undecided = make_pure_c_string (":");
10867
10868   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10869                doc: /*
10870 *Non-nil enables character translation while encoding and decoding.  */);
10871   Venable_character_translation = Qt;
10872
10873   DEFVAR_LISP ("standard-translation-table-for-decode",
10874                &Vstandard_translation_table_for_decode,
10875                doc: /* Table for translating characters while decoding.  */);
10876   Vstandard_translation_table_for_decode = Qnil;
10877
10878   DEFVAR_LISP ("standard-translation-table-for-encode",
10879                &Vstandard_translation_table_for_encode,
10880                doc: /* Table for translating characters while encoding.  */);
10881   Vstandard_translation_table_for_encode = Qnil;
10882
10883   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10884                doc: /* Alist of charsets vs revision numbers.
10885 While encoding, if a charset (car part of an element) is found,
10886 designate it with the escape sequence identifying revision (cdr part
10887 of the element).  */);
10888   Vcharset_revision_table = Qnil;
10889
10890   DEFVAR_LISP ("default-process-coding-system",
10891                &Vdefault_process_coding_system,
10892                doc: /* Cons of coding systems used for process I/O by default.
10893 The car part is used for decoding a process output,
10894 the cdr part is used for encoding a text to be sent to a process.  */);
10895   Vdefault_process_coding_system = Qnil;
10896
10897   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10898                doc: /*
10899 Table of extra Latin codes in the range 128..159 (inclusive).
10900 This is a vector of length 256.
10901 If Nth element is non-nil, the existence of code N in a file
10902 \(or output of subprocess) doesn't prevent it to be detected as
10903 a coding system of ISO 2022 variant which has a flag
10904 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10905 or reading output of a subprocess.
10906 Only 128th through 159th elements have a meaning.  */);
10907   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10908
10909   DEFVAR_LISP ("select-safe-coding-system-function",
10910                &Vselect_safe_coding_system_function,
10911                doc: /*
10912 Function to call to select safe coding system for encoding a text.
10913
10914 If set, this function is called to force a user to select a proper
10915 coding system which can encode the text in the case that a default
10916 coding system used in each operation can't encode the text.  The
10917 function should take care that the buffer is not modified while
10918 the coding system is being selected.
10919
10920 The default value is `select-safe-coding-system' (which see).  */);
10921   Vselect_safe_coding_system_function = Qnil;
10922
10923   DEFVAR_BOOL ("coding-system-require-warning",
10924                &coding_system_require_warning,
10925                doc: /* Internal use only.
10926 If non-nil, on writing a file, `select-safe-coding-system-function' is
10927 called even if `coding-system-for-write' is non-nil.  The command
10928 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10929   coding_system_require_warning = 0;
10930
10931
10932   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10933                &inhibit_iso_escape_detection,
10934                doc: /*
10935 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10936
10937 When Emacs reads text, it tries to detect how the text is encoded.
10938 This code detection is sensitive to escape sequences.  If Emacs sees
10939 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10940 of the ISO2022 encodings, and decodes text by the corresponding coding
10941 system (e.g. `iso-2022-7bit').
10942
10943 However, there may be a case that you want to read escape sequences in
10944 a file as is.  In such a case, you can set this variable to non-nil.
10945 Then the code detection will ignore any escape sequences, and no text is
10946 detected as encoded in some ISO-2022 encoding.  The result is that all
10947 escape sequences become visible in a buffer.
10948
10949 The default value is nil, and it is strongly recommended not to change
10950 it.  That is because many Emacs Lisp source files that contain
10951 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10952 in Emacs's distribution, and they won't be decoded correctly on
10953 reading if you suppress escape sequence detection.
10954
10955 The other way to read escape sequences in a file without decoding is
10956 to explicitly specify some coding system that doesn't use ISO-2022
10957 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10958   inhibit_iso_escape_detection = 0;
10959
10960   DEFVAR_BOOL ("inhibit-null-byte-detection",
10961                &inhibit_null_byte_detection,
10962                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10963 By default, Emacs treats it as binary data, and does not attempt to
10964 decode it.  The effect is as if you specified `no-conversion' for
10965 reading that text.
10966
10967 Set this to non-nil when a regular text happens to include null bytes.
10968 Examples are Index nodes of Info files and null-byte delimited output
10969 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10970 decode text as usual.  */);
10971   inhibit_null_byte_detection = 0;
10972
10973   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10974                doc: /* Char table for translating self-inserting characters.
10975 This is applied to the result of input methods, not their input.
10976 See also `keyboard-translate-table'.
10977
10978 Use of this variable for character code unification was rendered
10979 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10980 internal character representation.  */);
10981     Vtranslation_table_for_input = Qnil;
10982
10983   {
10984     Lisp_Object args[coding_arg_max];
10985     Lisp_Object plist[16];
10986     int i;
10987
10988     for (i = 0; i < coding_arg_max; i++)
10989       args[i] = Qnil;
10990
10991     plist[0] = intern_c_string (":name");
10992     plist[1] = args[coding_arg_name] = Qno_conversion;
10993     plist[2] = intern_c_string (":mnemonic");
10994     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10995     plist[4] = intern_c_string (":coding-type");
10996     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10997     plist[6] = intern_c_string (":ascii-compatible-p");
10998     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10999     plist[8] = intern_c_string (":default-char");
11000     plist[9] = args[coding_arg_default_char] = make_number (0);
11001     plist[10] = intern_c_string (":for-unibyte");
11002     plist[11] = args[coding_arg_for_unibyte] = Qt;
11003     plist[12] = intern_c_string (":docstring");
11004     plist[13] = make_pure_c_string ("Do no conversion.\n\
11005 \n\
11006 When you visit a file with this coding, the file is read into a\n\
11007 unibyte buffer as is, thus each byte of a file is treated as a\n\
11008 character.");
11009     plist[14] = intern_c_string (":eol-type");
11010     plist[15] = args[coding_arg_eol_type] = Qunix;
11011     args[coding_arg_plist] = Flist (16, plist);
11012     Fdefine_coding_system_internal (coding_arg_max, args);
11013
11014     plist[1] = args[coding_arg_name] = Qundecided;
11015     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11016     plist[5] = args[coding_arg_coding_type] = Qundecided;
11017     /* This is already set.
11018        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11019     plist[8] = intern_c_string (":charset-list");
11020     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11021     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11022     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11023     plist[15] = args[coding_arg_eol_type] = Qnil;
11024     args[coding_arg_plist] = Flist (16, plist);
11025     Fdefine_coding_system_internal (coding_arg_max, args);
11026   }
11027
11028   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11029
11030   {
11031     int i;
11032
11033     for (i = 0; i < coding_category_max; i++)
11034       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11035   }
11036 #if defined (MSDOS) || defined (WINDOWSNT)
11037   system_eol_type = Qdos;
11038 #else
11039   system_eol_type = Qunix;
11040 #endif
11041   staticpro (&system_eol_type);
11042 }
11043
11044 char *
11045 emacs_strerror (error_number)
11046      int error_number;
11047 {
11048   char *str;
11049
11050   synchronize_system_messages_locale ();
11051   str = strerror (error_number);
11052
11053   if (! NILP (Vlocale_coding_system))
11054     {
11055       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11056                                                       Vlocale_coding_system,
11057                                                       0);
11058       str = (char *) SDATA (dec);
11059     }
11060
11061   return str;
11062 }
11063
11064 #endif /* emacs */
11065
11066 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
11067    (do not change this comment) */