src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 3, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  24 Boston, MA 02110-1301, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-mule) handlers
  31   3. ISO2022 handlers
  32   4. Shift-JIS and BIG5 handlers
  33   5. CCL handlers
  34   6. End-of-line handlers
  35   7. C library functions
  36   8. Emacs Lisp library functions
  37   9. Post-amble
  38
  39 */
  40
  41 /*** 0. General comments ***/
  42
  43
  44 /*** GENERAL NOTE on CODING SYSTEMS ***
  45
  46   A coding system is an encoding mechanism for one or more character
  47   sets.  Here's a list of coding systems which Emacs can handle.  When
  48   we say "decode", it means converting some other coding system to
  49   Emacs' internal format (emacs-mule), and when we say "encode",
  50   it means converting the coding system emacs-mule to some other
  51   coding system.
  52
  53   0. Emacs' internal format (emacs-mule)
  54
  55   Emacs itself holds a multi-lingual character in buffers and strings
  56   in a special format.  Details are described in section 2.
  57
  58   1. ISO2022
  59
  60   The most famous coding system for multiple character sets.  X's
  61   Compound Text, various EUCs (Extended Unix Code), and coding
  62   systems used in Internet communication such as ISO-2022-JP are
  63   all variants of ISO2022.  Details are described in section 3.
  64
  65   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  66
  67   A coding system to encode character sets: ASCII, JISX0201, and
  68   JISX0208.  Widely used for PC's in Japan.  Details are described in
  69   section 4.
  70
  71   3. BIG5
  72
  73   A coding system to encode the character sets ASCII and Big5.  Widely
  74   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  75   described in section 4.  In this file, when we write "BIG5"
  76   (all uppercase), we mean the coding system, and when we write
  77   "Big5" (capitalized), we mean the character set.
  78
  79   4. Raw text
  80
  81   A coding system for text containing random 8-bit code.  Emacs does
  82   no code conversion on such text except for end-of-line format.
  83
  84   5. Other
  85
  86   If a user wants to read/write text encoded in a coding system not
  87   listed above, he can supply a decoder and an encoder for it as CCL
  88   (Code Conversion Language) programs.  Emacs executes the CCL program
  89   while reading/writing.
  90
  91   Emacs represents a coding system by a Lisp symbol that has a property
  92   `coding-system'.  But, before actually using the coding system, the
  93   information about it is set in a structure of type `struct
  94   coding_system' for rapid processing.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  99
 100   How end-of-line of text is encoded depends on the operating system.
 101   For instance, Unix's format is just one byte of `line-feed' code,
 102   whereas DOS's format is two-byte sequence of `carriage-return' and
 103   `line-feed' codes.  MacOS's format is usually one byte of
 104   `carriage-return'.
 105
 106   Since text character encoding and end-of-line encoding are
 107   independent, any coding system described above can have any
 108   end-of-line format.  So Emacs has information about end-of-line
 109   format in each coding-system.  See section 6 for more details.
 110
 111 */
 112
 113 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 114
 115   These functions check if a text between SRC and SRC_END is encoded
 116   in the coding system category XXX.  Each returns an integer value in
 117   which appropriate flag bits for the category XXX are set.  The flag
 118   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 119   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 120   of the range 0x80..0x9F are in multibyte form.  */
 121 #if 0
 122 int
 123 detect_coding_emacs_mule (src, src_end, multibytep)
 124      unsigned char *src, *src_end;
 125      int multibytep;
 126 {
 127   ...
 128 }
 129 #endif
 130
 131 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 132
 133   These functions decode SRC_BYTES length of unibyte text at SOURCE
 134   encoded in CODING to Emacs' internal format.  The resulting
 135   multibyte text goes to a place pointed to by DESTINATION, the length
 136   of which should not exceed DST_BYTES.
 137
 138   These functions set the information about original and decoded texts
 139   in the members `produced', `produced_char', `consumed', and
 140   `consumed_char' of the structure *CODING.  They also set the member
 141   `result' to one of CODING_FINISH_XXX indicating how the decoding
 142   finished.
 143
 144   DST_BYTES zero means that the source area and destination area are
 145   overlapped, which means that we can produce a decoded text until it
 146   reaches the head of the not-yet-decoded source text.
 147
 148   Below is a template for these functions.  */
 149 #if 0
 150 static void
 151 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 152      struct coding_system *coding;
 153      const unsigned char *source;
 154      unsigned char *destination;
 155      int src_bytes, dst_bytes;
 156 {
 157   ...
 158 }
 159 #endif
 160
 161 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 162
 163   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 164   internal multibyte format to CODING.  The resulting unibyte text
 165   goes to a place pointed to by DESTINATION, the length of which
 166   should not exceed DST_BYTES.
 167
 168   These functions set the information about original and encoded texts
 169   in the members `produced', `produced_char', `consumed', and
 170   `consumed_char' of the structure *CODING.  They also set the member
 171   `result' to one of CODING_FINISH_XXX indicating how the encoding
 172   finished.
 173
 174   DST_BYTES zero means that the source area and destination area are
 175   overlapped, which means that we can produce encoded text until it
 176   reaches the head of the not-yet-encoded source text.
 177
 178   Below is a template for these functions.  */
 179 #if 0
 180 static void
 181 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 182      struct coding_system *coding;
 183      unsigned char *source, *destination;
 184      int src_bytes, dst_bytes;
 185 {
 186   ...
 187 }
 188 #endif
 189
 190 /*** COMMONLY USED MACROS ***/
 191
 192 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 193    get one and two bytes from the source text, respectively.
 194    If there are not enough bytes in the source, they jump to
 195    `label_end_of_loop'.  The caller should set variables `coding',
 196    `src' and `src_end' to appropriate pointer in advance.  These
 197    macros are called from decoding routines `decode_coding_XXX', thus
 198    it is assumed that the source text is unibyte.  */
 199
 200 #define ONE_MORE_BYTE(c1)                                       \
 201   do {                                                          \
 202     if (src < src_end)                                          \
 203       c1 = *src++;                                              \
 204     else                                                        \
 205       {                                                         \
             /* Source exhausted: flag it and leave the loop.  */    \
 206         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 207         goto label_end_of_loop;                                 \
 208       }                                                         \
       } while (0)
 209
 210 #define TWO_MORE_BYTES(c1, c2)                                  \
 211   do {                                                          \
 212     if (src_end - src < 2)                                      \
 213       {                                                         \
             /* Fewer than two bytes remain: consume neither.  */    \
 214         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 215         goto label_end_of_loop;                                 \
 216       }                                                         \
 217     c1 = *src++, c2 = *src++;                                   \
 218   } while (0)
 220
 221
 222 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 223    form if MULTIBYTEP is nonzero: a LEADING_CODE_8_BIT_CONTROL byte is
 224    then followed by a byte holding the 8-bit code + 0x20, and C1 is
        set to that 8-bit code.  In addition, if SRC is not less than
        SRC_END, set coding->result to CODING_FINISH_INSUFFICIENT_SRC and
        return RET from the enclosing function.  The same happens when the
        source ends right after the leading code, so that the trailing
        byte is never read from beyond SRC_END.  The caller should set
        variables `coding', `src' and `src_end' in advance.  */
 225
 226 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)      \
 227   do {                                                          \
 228     if (src >= src_end)                                         \
 229       {                                                         \
 230         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 231         return ret;                                             \
 232       }                                                         \
 233     c1 = *src++;                                                \
 234     if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)       \
 235       {                                                         \
             /* Truncated two-byte form: don't read past SRC_END.  */ \
             if (src >= src_end)                                     \
               {                                                     \
                 coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
                 return ret;                                         \
               }                                                     \
             c1 = *src++ - 0x20;                                     \
           }                                                         \
 236   } while (0)
 237
 238 /* Set C to the next character at the source text pointed by `src'.
 239    If there are not enough characters in the source, jump to
 240    `label_end_of_loop'.  The caller should set variables `coding',
 241    `src', `src_end', and `translation_table' to appropriate pointers
 242    in advance.  This macro is used in encoding routines
 243    `encode_coding_XXX', thus it assumes that the source text is in
 244    multibyte form except for 8-bit characters.  8-bit characters are
 245    in multibyte form if coding->src_multibyte is nonzero, else they
 246    are represented by a single byte.
     
        If `translation_table' is non-nil, C is passed through
        translate_char before being returned.  `src' is advanced past the
        bytes of the character that was read.  */
 247
 248 #define ONE_MORE_CHAR(c)                                        \
 249   do {                                                          \
 250     int len = src_end - src;                                    \
 251     int bytes;                                                  \
         /* Nothing left to read: report it and leave the loop.  */  \
 252     if (len <= 0)                                               \
 253       {                                                         \
 254         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 255         goto label_end_of_loop;                                 \
 256       }                                                         \
         /* BYTES is not initialized above; it is expected to be     \
            set through the macro arguments below, or to 1 in the    \
            single-byte branch.  */                                  \
 257     if (coding->src_multibyte                                   \
 258         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 259       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 260     else                                                        \
 261       c = *src, bytes = 1;                                      \
 262     if (!NILP (translation_table))                              \
 263       c = translate_char (translation_table, c, -1, 0, 0);      \
 264     src += bytes;                                               \
 265   } while (0)
 266
 267
 268 /* Produce a multibyte form of character C to `dst'.  Jump to
 269    `label_end_of_loop' if there's not enough space at `dst'.
 270
 271    If we are now in the middle of a composition sequence, the decoded
 272    character may be ALTCHAR (for the current composition).  In that
 273    case, the character goes to coding->cmp_data->data instead of
 274    `dst'.
 275
 276    This macro is used in decoding routines.  */
 277
 278 #define EMIT_CHAR(c)                                                    \
 279   do {                                                                  \
         /* C is written to `dst' when no composition is in progress,        \
            or when the composition is relative or rule-based.  */           \
 280     if (! COMPOSING_P (coding)                                          \
 281         || coding->composing == COMPOSITION_RELATIVE                    \
 282         || coding->composing == COMPOSITION_WITH_RULE)                  \
 283       {                                                                 \
 284         int bytes = CHAR_BYTES (c);                                     \
             /* With dst_bytes zero the limit is `src' itself.  */           \
 285         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 286           {                                                             \
 287             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 288             goto label_end_of_loop;                                     \
 289           }                                                             \
 290         dst += CHAR_STRING (c, dst);                                    \
 291         coding->produced_char++;                                        \
 292       }                                                                 \
 293                                                                         \
         /* For every composition other than a relative one, C is also       \
            recorded as a component.  Recording may end the composition      \
            (CODING_ADD_COMPOSITION_COMPONENT can reset coding->composing),  \
            so the flag below is computed from the updated state.  */        \
 294     if (COMPOSING_P (coding)                                            \
 295         && coding->composing != COMPOSITION_RELATIVE)                   \
 296       {                                                                 \
 297         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 298         coding->composition_rule_follows                                \
 299           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 300       }                                                                 \
 301   } while (0)
 302
 303
     /* Store the single byte C at `dst' and advance `dst'.  If there is
        no room left, set coding->result to CODING_FINISH_INSUFFICIENT_DST
        and jump to `label_end_of_loop'.  The destination ends at `dst_end'
        when `dst_bytes' is nonzero, and at `src' otherwise.  */
 304 #define EMIT_ONE_BYTE(c)                                        \
 305   do {                                                          \
 306     if (dst < (dst_bytes ? dst_end : src))                      \
 307       *dst++ = c;                                               \
 308     else                                                        \
 309       {                                                         \
 310         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 311         goto label_end_of_loop;                                 \
 312       }                                                         \
       } while (0)
 313
     /* Store the bytes C1 and C2, in this order, at `dst' and advance
        `dst' by two.  If fewer than two bytes of room are left, store
        nothing, set coding->result to CODING_FINISH_INSUFFICIENT_DST and
        jump to `label_end_of_loop'.  The destination ends at `dst_end'
        when `dst_bytes' is nonzero, and at `src' otherwise.  */
 314 #define EMIT_TWO_BYTES(c1, c2)                                  \
 315   do {                                                          \
 316     if ((dst_bytes ? dst_end : src) < dst + 2)                  \
 317       {                                                         \
 318         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 319         goto label_end_of_loop;                                 \
 320       }                                                         \
 321     *dst++ = c1;                                                \
 322     *dst++ = c2;                                                \
       } while (0)
 323
     /* Copy the bytes in the range FROM (inclusive) to TO (exclusive) to
        `dst', advancing both `dst' and FROM; FROM is therefore modified
        and equals TO afterwards.  If the whole range does not fit, copy
        nothing, set coding->result to CODING_FINISH_INSUFFICIENT_DST and
        jump to `label_end_of_loop'.  The destination ends at `dst_end'
        when `dst_bytes' is nonzero, and at `src' otherwise.  */
 324 #define EMIT_BYTES(from, to)                                    \
 325   do {                                                          \
 326     if ((dst_bytes ? dst_end : src) < dst + (to - from))        \
 327       {                                                         \
 328         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 329         goto label_end_of_loop;                                 \
 330       }                                                         \
 331     for (; from < to; from++)                                   \
 332       *dst++ = *from;                                           \
 333   } while (0)
 334
 335 \f
 336 /*** 1. Preamble ***/
 337
 338 #ifdef emacs
 339 #include <config.h>
 340 #endif
 341
 342 #include <stdio.h>
 343
 344 #ifdef emacs
 345
 346 #include "lisp.h"
 347 #include "buffer.h"
 348 #include "charset.h"
 349 #include "composite.h"
 350 #include "ccl.h"
 351 #include "coding.h"
 352 #include "window.h"
 353 #include "intervals.h"
 354 #include "frame.h"
 355 #include "termhooks.h"
 356
 357 #else  /* not emacs */
 358
 359 #include "mulelib.h"
 360
 361 #endif /* not emacs */
 362
     /* Qcoding_system is the property that coding_safe_chars fetches from
        a coding system symbol, and Qsafe_chars is the key it then looks up
        in that coding system's property list.
        NOTE(review): the other `Q' objects below are presumably interned
        symbols too; their initialization is not visible in this part of
        the file -- confirm.  */
 363 Lisp_Object Qcoding_system, Qeol_type;
 364 Lisp_Object Qbuffer_file_coding_system;
 365 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 366 Lisp_Object Qno_conversion, Qundecided;
 367 Lisp_Object Qcoding_system_history;
 368 Lisp_Object Qsafe_chars;
 369 Lisp_Object Qvalid_codes;
 370 Lisp_Object Qascii_incompatible;
 371
     /* The first line below only declares objects defined in another
        file; the following lines define objects here.  */
 372 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 373 Lisp_Object Qcall_process, Qcall_process_region;
 374 Lisp_Object Qstart_process, Qopen_network_stream;
 375 Lisp_Object Qtarget_idx;
 376
     /* Defined in another file.  */
 377 extern Lisp_Object Qcompletion_ignore_case;
 378
 379 /* If a symbol has this property, evaluate the value to define the
 380    symbol as a coding system.  */
 381 Lisp_Object Qcoding_system_define_form;
 382
     /* NOTE(review): presumably holds a Lisp function used to select a
        safe coding system; its users are outside this part of the file
        -- confirm.  */
 383 Lisp_Object Vselect_safe_coding_system_function;
 384
     /* NOTE(review): undocumented flag; its users are outside this part
        of the file.  */
 385 int coding_system_require_warning;
 386
 387 /* Mnemonic string for each format of end-of-line.  */
 388 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 389 /* Mnemonic string to indicate format of end-of-line is not yet
 390    decided.  */
 391 Lisp_Object eol_mnemonic_undecided;
 392
 393 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 394    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
 395    This has an effect only for external encoding (i.e. for output to
 396    file and process), not for in-buffer or Lisp string encoding.  */
 397 int system_eol_type;
 398
 399 #ifdef emacs
     /* Everything down to the matching #endif is compiled only when
        `emacs' is defined.  */
 400
 401 /* Information about which coding system is safe for which chars.
 402    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 403
 404    GENERIC-LIST is a list of generic coding systems which can encode
 405    any characters.
 406
 407    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 408    corresponding char table that contains safe chars.  */
 409 Lisp_Object Vcoding_system_safe_chars;
 410
     /* NOTE(review): presumably the list and the alist of all defined
        coding systems; they are filled in outside this part of the file
        -- confirm.  */
 411 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 412
 413 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 414
 415 /* Coding system emacs-mule and raw-text are for converting only
 416    end-of-line format.  */
 417 Lisp_Object Qemacs_mule, Qraw_text;
 418
 419 Lisp_Object Qutf_8;
 420
 421 /* Coding-systems are handed between Emacs Lisp programs and C internal
 422    routines by the following three variables.  */
 423 /* Coding-system for reading files and receiving data from process.  */
 424 Lisp_Object Vcoding_system_for_read;
 425 /* Coding-system for writing files and sending data to process.  */
 426 Lisp_Object Vcoding_system_for_write;
 427 /* Coding-system actually used in the latest I/O.  */
 428 Lisp_Object Vlast_coding_system_used;
 429
 430 /* A vector of length 256 which contains information about special
 431    Latin codes (especially for dealing with Microsoft codes).  */
 432 Lisp_Object Vlatin_extra_code_table;
 433
 434 /* Flag to inhibit code conversion of end-of-line format.  */
 435 int inhibit_eol_conversion;
 436
 437 /* Flag to inhibit ISO2022 escape sequence detection.  */
 438 int inhibit_iso_escape_detection;
 439
 440 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 441 int inherit_process_coding_system;
 442
 443 /* Coding system to be used to encode text for terminal display when
 444    terminal coding system is nil.  */
 445 struct coding_system safe_terminal_coding;
 446
 447 /* Default coding system to be used to write a file.  */
 448 struct coding_system default_buffer_file_coding;
 449
     /* NOTE(review): presumably alists that choose a coding system for
        file, process and network I/O respectively; their format is not
        visible in this part of the file -- confirm.  */
 450 Lisp_Object Vfile_coding_system_alist;
 451 Lisp_Object Vprocess_coding_system_alist;
 452 Lisp_Object Vnetwork_coding_system_alist;
 453
 454 Lisp_Object Vlocale_coding_system;
 455
 456 #endif /* emacs */
 457
     /* NOTE(review): presumably symbols for the properties that relate
        coding systems to coding categories; their users are outside this
        part of the file -- confirm.  */
 458 Lisp_Object Qcoding_category, Qcoding_category_index;
 459
 460 /* List of symbols `coding-category-xxx' ordered by priority.  */
 461 Lisp_Object Vcoding_category_list;
 462
 463 /* Table of coding categories (Lisp symbols).  */
 464 Lisp_Object Vcoding_category_table;
 465
 466 /* Table of the names of the symbols for each coding category.
        Fifteen names are listed for the CODING_CATEGORY_IDX_MAX slots.
        NOTE(review): the order presumably has to match the category
        indices declared elsewhere (coding.h) -- confirm before
        reordering or adding entries.  */
 467 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 468   "coding-category-emacs-mule",
 469   "coding-category-sjis",
 470   "coding-category-iso-7",
 471   "coding-category-iso-7-tight",
 472   "coding-category-iso-8-1",
 473   "coding-category-iso-8-2",
 474   "coding-category-iso-7-else",
 475   "coding-category-iso-8-else",
 476   "coding-category-ccl",
 477   "coding-category-big5",
 478   "coding-category-utf-8",
 479   "coding-category-utf-16-be",
 480   "coding-category-utf-16-le",
 481   "coding-category-raw-text",
 482   "coding-category-binary"
 483 };
 484
 485 /* Table of pointers to coding systems corresponding to each coding
 486    category.  */
 487 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 488
 489 /* Table of coding category masks.  Nth element is a mask for a coding
 490    category of which priority is Nth.  */
 491 static
 492 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 493
 494 /* Flag to tell if we look up translation table on character code
 495    conversion.  */
 496 Lisp_Object Venable_character_translation;
 497 /* Standard translation table to look up on decoding (reading).  */
 498 Lisp_Object Vstandard_translation_table_for_decode;
 499 /* Standard translation table to look up on encoding (writing).  */
 500 Lisp_Object Vstandard_translation_table_for_encode;
 501
     /* NOTE(review): presumably symbols related to translation tables;
        their users are outside this part of the file -- confirm.  */
 502 Lisp_Object Qtranslation_table;
 503 Lisp_Object Qtranslation_table_id;
 504 Lisp_Object Qtranslation_table_for_decode;
 505 Lisp_Object Qtranslation_table_for_encode;
 506
 507 /* Alist of charsets vs revision number.  */
 508 Lisp_Object Vcharset_revision_alist;
 509
 510 /* Default coding systems used for process I/O.  */
 511 Lisp_Object Vdefault_process_coding_system;
 512
 513 /* Char table for translating Quail and self-inserting input.  */
 514 Lisp_Object Vtranslation_table_for_input;
 515
 516 /* Global flag to tell that we can't call post-read-conversion and
 517    pre-write-conversion functions.  Usually the value is zero, but it
 518    is set to 1 temporarily while such functions are running.  This is
 519    to avoid infinite recursive call.  */
 520 static int inhibit_pre_post_conversion;
 521
 522 Lisp_Object Qchar_coding_system;
 523
 524 /* Return the `safe-chars' property of CODING_SYSTEM (a symbol): the
 525    value stored under Qsafe_chars in the property list found at index
        3 of the vector that is CODING_SYSTEM's `coding-system' property.
        If that value is not a char table, return Qt instead.  Don't check
        the validity of CODING_SYSTEM; its `coding-system' property is
        assumed to be such a vector.  */
 526
 527 Lisp_Object
 528 coding_safe_chars (coding_system)
 529      Lisp_Object coding_system;
 530 {
 531   Lisp_Object coding_spec, plist, safe_chars;
 532
 533   coding_spec = Fget (coding_system, Qcoding_system);
 534   plist = XVECTOR (coding_spec)->contents[3];
 535   safe_chars = Fplist_get (plist, Qsafe_chars);
 536   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 537 }
 538
     /* Nonzero if character C counts as safe according to SAFE_CHARS:
        either SAFE_CHARS is Qt, or its entry for C is non-nil.  The table
        is not consulted when SAFE_CHARS is Qt.  */
 539 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 540   (EQ (safe_chars, Qt) ? 1 : !NILP (CHAR_TABLE_REF (safe_chars, c)))
 541
 542 \f
 543 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 544
 545 /* Emacs' internal format for representation of multiple character
 546    sets is a kind of multi-byte encoding, i.e. characters are
 547    represented by variable-length sequences of one-byte codes.
 548
 549    ASCII characters and control characters (e.g. `tab', `newline') are
 550    represented by one-byte sequences which are their ASCII codes, in
 551    the range 0x00 through 0x7F.
 552
 553    8-bit characters of the range 0x80..0x9F are represented by
 554    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 555    code + 0x20).
 556
 557    8-bit characters of the range 0xA0..0xFF are represented by
 558    one-byte sequences which are their 8-bit code.
 559
 560    The other characters are represented by a sequence of `base
 561    leading-code', optional `extended leading-code', and one or two
 562    `position-code's.  The length of the sequence is determined by the
 563    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 564    whereas extended leading-code and position-code take the range 0xA0
 565    through 0xFF.  See `charset.h' for more details about leading-code
 566    and position-code.
 567
 568    --- CODE RANGE of Emacs' internal format ---
 569    character set        range
 570    -------------        -----
 571    ascii                0x00..0x7F
 572    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 573    eight-bit-graphic    0xA0..0xFF
 574    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 575    ---------------------------------------------
 576
 577    As this is the internal character representation, the format is
 578    usually not used externally (i.e. in a file or in a data sent to a
 579    process).  But, it is possible to have a text externally in this
 580    format (i.e. by encoding by the coding system `emacs-mule').
 581
 582    In that case, a sequence of one-byte codes has a slightly different
 583    form.
 584
 585    Firstly, all characters in eight-bit-control are represented by
 586    one-byte sequences which are their 8-bit code.
 587
 588    Next, character composition data are represented by the byte
 589    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 590    where,
 591         METHOD is 0xF0 plus one of composition method (enum
 592         composition_method),
 593
 594         BYTES is 0xA0 plus the byte length of these composition data,
 595
 596         CHARS is 0xA0 plus the number of characters composed by these
 597         data,
 598
 599         COMPONENTs are characters of multibyte form or composition
 600         rules encoded by two-byte of ASCII codes.
 601
 602    In addition, for backward compatibility, the following formats are
 603    also recognized as composition data on decoding.
 604
 605    0x80 MSEQ ...
 606    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 607
 608    Here,
 609         MSEQ is a multibyte form but in this special format:
 610           ASCII: 0xA0 ASCII_CODE+0x80,
 611           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 612         RULE is a one byte code of the range 0xA0..0xF0 that
 613         represents a composition rule.
 614   */
 615
     /* Table with one entry per byte value (256 entries).
        NOTE(review): presumably gives the emacs-mule code class of each
        byte; it is filled in outside this part of the file -- confirm.  */
 616 enum emacs_code_class_type emacs_code_class[256];
 617
 618 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 619    Check if a text is encoded in Emacs' internal format.  If it is,
 620    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.
     
        The bytes between SRC and SRC_END are scanned one by one.  The
        loop below has no exit of its own: the function returns either
        from within ONE_MORE_BYTE_CHECK_MULTIBYTE when the source is
        exhausted, or through one of the explicit `return 0' statements
        when a byte rules this format out.  */
 621
 622 static int
 623 detect_coding_emacs_mule (src, src_end, multibytep)
 624       unsigned char *src, *src_end;
 625       int multibytep;
 626 {
 627   unsigned char c;
       /* Nonzero while scanning the components that follow the old
          composition leading code 0x80.  */
 628   int composing = 0;
 629   /* Dummy target for ONE_MORE_BYTE_CHECK_MULTIBYTE, which stores its
          status in coding->result; the value is not examined here.  */
 630   struct coding_system dummy_coding;
 631   struct coding_system *coding = &dummy_coding;
 632
 633   while (1)
 634     {
           /* Reaching the end of the source here means that nothing
              contradicted the format, so the macro returns the mask.  */
 635       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
 636                                      CODING_CATEGORY_MASK_EMACS_MULE);
 637       if (composing)
 638         {
               /* Old-style components start with a byte of 0xA0 or
                  above; a smaller byte ends the composition.  */
 639           if (c < 0xA0)
 640             composing = 0;
 641           else if (c == 0xA0)
 642             {
                   /* 0xA0 is followed by one more byte, which is
                      reduced to 7 bits.  A source that ends here makes
                      the macro return 0.  */
 643               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
 644               c &= 0x7F;
 645             }
 646           else
                 /* Otherwise the byte is taken to be a leading code
                    stored with 0x20 added; undo the offset.  */
 647             c -= 0x20;
 648         }
 649
 650       if (c < 0x20)
 651         {
               /* These three control codes rule out this format.  */
 652           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 653             return 0;
 654         }
 655       else if (c >= 0x80 && c < 0xA0)
 656         {
 657           if (c == 0x80)
 658             /* Old leading code for a composite character.  */
 659             composing = 1;
 660           else
 661             {
                   /* Check the sequence that starts at the byte just
                      consumed, and skip it if it is accepted.
                      NOTE(review): SRC_BASE addresses the byte as stored
                      in the source, whereas C may have been adjusted
                      above (by the macro or by `c -= 0x20') -- confirm
                      that this is intended.  */
 662               unsigned char *src_base = src - 1;
 663               int bytes;
 664
 665               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 666                                                bytes))
 667                 return 0;
 668               src = src_base + bytes;
 669             }
 670         }
 671     }
 672 }
 673
 674
 675 /* Record the starting position START and METHOD of one composition.
     
        Four ints are reserved at the current end of CODING->cmp_data->data
        and their index is remembered in CODING->cmp_data_start:
          [0] -1 for now; CODING_ADD_COMPOSITION_END stores the length of
              the record there,
          [1] START plus cmp_data->char_offset,
          [2] not set here; CODING_ADD_COMPOSITION_END stores the end
              position there,
          [3] METHOD converted to int.
        The macro declares locals named `cmp_data' and `data', so the
        arguments must not refer to caller variables with these names.  */
 676
 677 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 678   do {                                                          \
 679     struct composition_data *cmp_data = coding->cmp_data;       \
 680     int *data = cmp_data->data + cmp_data->used;                \
 681     coding->cmp_data_start = cmp_data->used;                    \
 682     data[0] = -1;                                               \
 683     data[1] = cmp_data->char_offset + start;                    \
 684     data[3] = (int) method;                                     \
 685     cmp_data->used += 4;                                        \
 686   } while (0)
 687
 688 /* Record the ending position END of the current composition.
     
        The record that starts at index CODING->cmp_data_start is
        completed: element 0 receives the number of ints used since that
        index, and element 2 receives END plus cmp_data->char_offset.
        The macro declares locals named `cmp_data' and `data', so the
        arguments must not refer to caller variables with these names.  */
 689
 690 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 691   do {                                                          \
 692     struct composition_data *cmp_data = coding->cmp_data;       \
 693     int *data = cmp_data->data + coding->cmp_data_start;        \
 694     data[0] = cmp_data->used - coding->cmp_data_start;          \
 695     data[2] = cmp_data->char_offset + end;                      \
 696   } while (0)
 697
 698 /* Record one COMPONENT (alternate character or composition rule).
     
        COMPONENT is appended to CODING->cmp_data->data.  If the current
        record has then reached COMPOSITION_DATA_MAX_BUNCH_LENGTH ints,
        the composition is closed at CODING->produced_char and
        CODING->composing is reset to COMPOSITION_NO.  */
 699
 700 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 701   do {                                                                  \
 702     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 703     if (coding->cmp_data->used - coding->cmp_data_start                 \
 704         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 705       {                                                                 \
 706         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 707         coding->composing = COMPOSITION_NO;                             \
 708       }                                                                 \
 709   } while (0)
 710
 711
 712 /* Fetch the byte that SRC points to and advance SRC past it.  When
 713    SRC has already reached SRC_END, yield -1 and leave SRC untouched.  */
 714
 715 #define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
 716
 717
 718 /* Decode a character represented as a component of a composition
 719    sequence of Emacs 20 style at SRC.  Set C to that character, store
 720    its multibyte form sequence at P, and set P to the end of that
 721    sequence.  If no valid character is found, set C to -1.

        A component is accepted in one of two shapes:
          - the byte 0xA0 followed by a byte B >= 0xA0; the character is
            B - 0x80 and that single byte is stored at P;
          - a byte L such that L - 0x20 satisfies BASE_LEADING_CODE_P,
            followed by the remaining bytes of the character; L - 0x20
            and those bytes are stored at P.
        A first byte satisfying CHAR_HEAD_P, any other byte, and end of
        source all give -1.  Bytes already stored at P when a sequence
        proves invalid are not rewound.

        Source bytes are fetched with SAFE_ONE_MORE_BYTE and CODING->flags
        is consulted, so SRC, SRC_END and CODING must be variables of the
        enclosing function.  */
 722
 723 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 724   do {                                                          \
 725     int bytes;                                                  \
 726                                                                 \
 727     c = SAFE_ONE_MORE_BYTE ();                                  \
 728     if (c < 0)                                                  \
 729       break;                                                    \
 730     if (CHAR_HEAD_P (c))                                        \
 731       c = -1;                                                   \
 732     else if (c == 0xA0)                                         \
 733       {                                                         \
 734         c = SAFE_ONE_MORE_BYTE ();                              \
             /* This test also rejects -1, i.e. end of source.  */   \
 735         if (c < 0xA0)                                           \
 736           c = -1;                                               \
 737         else                                                    \
 738           {                                                     \
 739             c -= 0x80;                                          \
 740             *p++ = c;                                           \
 741           }                                                     \
 742       }                                                         \
 743     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 744       {                                                         \
 745         unsigned char *p0 = p;                                  \
 746                                                                 \
             /* Undo the 0x20 offset applied to the leading code.  */ \
 747         c -= 0x20;                                              \
 748         *p++ = c;                                               \
 749         bytes = BYTES_BY_CHAR_HEAD (c);                         \
             /* Copy the bytes after the leading code, stopping      \
                early if the source runs out.  */                    \
 750         while (--bytes)                                         \
 751           {                                                     \
 752             c = SAFE_ONE_MORE_BYTE ();                          \
 753             if (c < 0)                                          \
 754               break;                                            \
 755             *p++ = c;                                           \
 756           }                                                     \
             /* Validate what was copied to P0..P before turning it  \
                into a character code.  */                           \
 757         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 758             || (coding->flags /* We are recovering a file.  */  \
 759                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 760                 && ! CHAR_HEAD_P (p0[1])))                      \
 761           c = STRING_CHAR (p0, bytes);                          \
 762         else                                                    \
 763           c = -1;                                               \
 764       }                                                         \
 765     else                                                        \
 766       c = -1;                                                   \
 767   } while (0)
 768
 769
 770 /* Decode a composition rule represented as a component of composition
 771    sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
 772    valid rule is found, set C to -1.

        The rule is a single byte whose value minus 0xA0 must lie in
        0..80; that value is split into GREF (quotient by 9) and NREF
        (remainder by 9) and combined with COMPOSITION_ENCODE_RULE.  End
        of source also yields -1.  The macro assigns the variables GREF
        and NREF of the enclosing function and reads SRC and SRC_END
        through SAFE_ONE_MORE_BYTE.  */
 773
 774 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 775   do {                                                  \
 776     c = SAFE_ONE_MORE_BYTE ();                          \
 777     c -= 0xA0;                                          \
 778     if (c < 0 || c >= 81)                               \
 779       c = -1;                                           \
 780     else                                                \
 781       {                                                 \
 782         gref = c / 9, nref = c % 9;                     \
 783         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 784       }                                                 \
 785   } while (0)
 786
 787
 788 /* Decode composition sequence encoded by `emacs-mule' at the source
 789    pointed by SRC.  SRC_END is the end of source.  Store information
 790    of the composition in CODING->cmp_data.
 791
 792    For backward compatibility, decode also a composition sequence of
 793    Emacs 20 style.  In that case, the composition sequence contains
 794    characters that should be extracted into a buffer or string.  Store
 795    those characters at *DESTINATION in multibyte form.
 796
 797    If we encounter an invalid byte sequence, return 0.
 798    If we encounter an insufficient source or destination, or
 799    insufficient space in CODING->cmp_data, return -1.
 800    Otherwise, return consumed bytes in the source.
 801
        SRC must point at the byte that introduces the sequence; decoding
        starts at the byte after it.  The current format, as written by
        ENCODE_COMPOSITION_EMACS_MULE, is
          0x80  0xF0+METHOD  0xA0+BYTES  0xA0+CHARS  COMPONENT ...
        where BYTES is the length of the whole sequence counted from SRC
        and CHARS is the number of composed characters.

        DST_END is the end of the destination area.  If DST_BYTES is zero
        the destination is limited by the current source position
        instead.  *DESTINATION is advanced only when Emacs 20 style
        characters were emitted.  When -1 is returned because
        CODING->cmp_data is full, CODING->result is set to
        CODING_FINISH_INSUFFICIENT_CMP.

 802 */
 803 static INLINE int
 804 decode_composition_emacs_mule (coding, src, src_end,
 805                                destination, dst_end, dst_bytes)
 806      struct coding_system *coding;
 807      const unsigned char *src, *src_end;
 808      unsigned char **destination, *dst_end;
 809      int dst_bytes;
 810 {
 811   unsigned char *dst = *destination;
 812   int method, data_len, nchars;
       /* SRC_BASE stays on the introducing byte; SRC steps past it.  */
 813   const unsigned char *src_base = src++;
 814   /* Store components of composition.  */
 815   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 816   int ncomponent;
 817   /* Store multibyte form of characters to be composed.  This is for
 818      Emacs 20 style composition sequence.  */
 819   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 820   unsigned char *bufp = buf;
 821   int c, i, gref, nref;
 822
       /* Refuse to start unless an entry of maximum length still fits in
          CODING->cmp_data.  */
 823   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 824       >= COMPOSITION_DATA_SIZE)
 825     {
 826       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 827       return -1;
 828     }
 829
       /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
          jumps to label_end_of_loop when the source is exhausted --
          confirm.  */
 830   ONE_MORE_BYTE (c);
 831   if (c - 0xF0 >= COMPOSITION_RELATIVE
 832            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 833     {
           /* Current format: this byte is 0xF0 + METHOD.  */
 834       int with_rule;
 835
 836       method = c - 0xF0;
 837       with_rule = (method == COMPOSITION_WITH_RULE
 838                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 839       ONE_MORE_BYTE (c);
 840       data_len = c - 0xA0;
           /* DATA_LEN covers at least the four header bytes and must not
              reach beyond the available source.  */
 841       if (data_len < 4
 842           || src_base + data_len > src_end)
 843         return 0;
 844       ONE_MORE_BYTE (c);
 845       nchars = c - 0xA0;
           /* A composition must cover at least one character.  Test the
              decoded count, not the raw byte, so that a zero or negative
              count is rejected as invalid.  */
 846       if (nchars < 1)
 847         return 0;
 848       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 849         {
 850           /* If it is longer than this, it can't be valid.  */
 851           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 852             return 0;
 853
               /* With a rule-based method every odd-numbered component
                  is a rule, stored as two bytes each offset by 32.  */
 854           if (ncomponent % 2 && with_rule)
 855             {
 856               ONE_MORE_BYTE (gref);
 857               gref -= 32;
 858               ONE_MORE_BYTE (nref);
 859               nref -= 32;
 860               c = COMPOSITION_ENCODE_RULE (gref, nref);
 861             }
 862           else
 863             {
                   /* A character component.  A byte that does not start
                      an acceptable multibyte sequence is taken as a
                      one-byte component.  */
 864               int bytes;
 865               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 866                   || (coding->flags /* We are recovering a file.  */
 867                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 868                       && ! CHAR_HEAD_P (src[1])))
 869                 c = STRING_CHAR (src, bytes);
 870               else
 871                 c = *src, bytes = 1;
 872               src += bytes;
 873             }
 874           component[ncomponent] = c;
 875         }
 876     }
 877   else if (c >= 0x80)
 878     {
 879       /* This may be an old Emacs 20 style format.  See the comment at
 880          the section 2 of this file.  */
           /* The sequence extends over all following bytes that are not
              character heads.  */
 881       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
           /* If it runs to the end of this block and more blocks follow,
              its real end is not known yet.  */
 882       if (src == src_end
 883           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 884         goto label_end_of_loop;
 885
           /* Restrict decoding to the sequence just delimited and rescan
              it from the byte after the introducing byte.  */
 886       src_end = src;
 887       src = src_base + 1;
 888       if (c < 0xC0)
 889         {
               /* Relative composition: a plain run of characters.  */
 890           method = COMPOSITION_RELATIVE;
 891           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 892             {
 893               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 894               if (c < 0)
 895                 break;
 896               component[ncomponent++] = c;
 897             }
 898           if (ncomponent < 2)
 899             return 0;
 900           nchars = ncomponent;
 901         }
 902       else if (c == 0xFF)
 903         {
               /* Rule-based composition: 0xFF CHAR [ RULE CHAR ] ...  */
 904           method = COMPOSITION_WITH_RULE;
               /* Skip the 0xFF marker.  */
 905           src++;
 906           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 907           if (c < 0)
 908             return 0;
 909           component[0] = c;
 910           for (ncomponent = 1;
 911                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 912             {
 913               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 914               if (c < 0)
 915                 break;
 916               component[ncomponent++] = c;
 917               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 918               if (c < 0)
 919                 break;
 920               component[ncomponent++] = c;
 921             }
 922           if (ncomponent < 3)
 923             return 0;
               /* Components alternate CHAR RULE CHAR ..., so about half
                  of them are characters.  */
 924           nchars = (ncomponent + 1) / 2;
 925         }
 926       else
 927         return 0;
 928     }
 929   else
 930     return 0;
 931
       /* Commit only if the extracted characters, if any, fit in the
          destination.  */
 932   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 933     {
 934       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 935       for (i = 0; i < ncomponent; i++)
 936         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 937       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 938       if (buf < bufp)
 939         {
               /* Emacs 20 style: the composed characters themselves were
                  part of the sequence, so emit them now.  */
 940           unsigned char *p = buf;
 941           EMIT_BYTES (p, bufp);
 942           *destination += bufp - buf;
 943           coding->produced_char += nchars;
 944         }
 945       return (src - src_base);
 946     }
       /* Reached by the explicit goto above, by falling through when the
          destination is too small, and presumably from ONE_MORE_BYTE.  */
 947  label_end_of_loop:
 948   return -1;
 949 }
 950
 951 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode SRC_BYTES bytes of emacs-mule text at SOURCE into
        DESTINATION.  Each loop iteration handles one source unit:
          - '\r' is converted according to CODING->eol_type;
          - '\n' is copied, unless the EOL type is CR or CRLF and
            CODING_MODE_INHIBIT_INCONSISTENT_EOL is set, in which case
            decoding stops with CODING_FINISH_INCONSISTENT_EOL;
          - 0x80 is handed to decode_composition_emacs_mule when
            CODING->cmp_data is set;
          - an acceptable multibyte sequence is copied unchanged;
          - any other byte sequence is copied unchanged if it has the
            length its first byte announces, and otherwise only its
            first byte is emitted, in CHAR_STRING form.
        If DST_BYTES is nonzero the destination ends at
        DESTINATION + DST_BYTES; otherwise output is bounded by the
        current source position.  On return CODING->consumed,
        CODING->consumed_char, CODING->produced and CODING->produced_char
        are updated; CODING->result is set when decoding stops for an
        inconsistent EOL or for lack of destination space.  */
 952
 953 static void
 954 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 955      struct coding_system *coding;
 956      const unsigned char *source;
 957      unsigned char *destination;
 958      int src_bytes, dst_bytes;
 959 {
 960   const unsigned char *src = source;
 961   const unsigned char *src_end = source + src_bytes;
 962   unsigned char *dst = destination;
 963   unsigned char *dst_end = destination + dst_bytes;
 964   /* SRC_BASE remembers the start position in source in each loop.
 965      The loop will be exited when there's not enough source code, or
 966      when there's not enough destination area to produce a
 967      character.  */
 968   const unsigned char *src_base;
 969
 970   coding->produced_char = 0;
 971   while ((src_base = src) < src_end)
 972     {
           /* TMP holds the CHAR_STRING form of a byte that cannot be
              copied verbatim; P points at the BYTES bytes to emit.  */
 973       unsigned char tmp[MAX_MULTIBYTE_LENGTH];
 974       const unsigned char *p;
 975       int bytes;
 976
 977       if (*src == '\r')
 978         {
 979           int c = *src++;
 980
 981           if (coding->eol_type == CODING_EOL_CR)
 982             c = '\n';
 983           else if (coding->eol_type == CODING_EOL_CRLF)
 984             {
                   /* NOTE(review): ONE_MORE_BYTE is defined elsewhere;
                      presumably it jumps to label_end_of_loop when no
                      byte is left -- confirm.  */
 985               ONE_MORE_BYTE (c);
 986               if (c != '\n')
 987                 {
                       /* A CR not followed by LF stays a literal CR;
                          un-read the byte just fetched.  */
 988                   src--;
 989                   c = '\r';
 990                 }
 991             }
               /* NOTE(review): stored without a destination-space check;
                  presumably the spare byte kept by the `>=' test further
                  down is relied upon -- confirm.  */
 992           *dst++ = c;
 993           coding->produced_char++;
 994           continue;
 995         }
 996       else if (*src == '\n')
 997         {
 998           if ((coding->eol_type == CODING_EOL_CR
 999                || coding->eol_type == CODING_EOL_CRLF)
1000               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1001             {
1002               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1003               goto label_end_of_loop;
1004             }
1005           *dst++ = *src++;
1006           coding->produced_char++;
1007           continue;
1008         }
1009       else if (*src == 0x80 && coding->cmp_data)
1010         {
1011           /* Start of composition data.  */
1012           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1013                                                          &dst, dst_end,
1014                                                          dst_bytes);
               /* Negative: stop here.  Positive: that many source bytes
                  were consumed.  Zero: not a composition after all, so
                  emit the 0x80 byte itself as a character.  */
1015           if (consumed < 0)
1016             goto label_end_of_loop;
1017           else if (consumed > 0)
1018             {
1019               src += consumed;
1020               continue;
1021             }
1022           bytes = CHAR_STRING (*src, tmp);
1023           p = tmp;
1024           src++;
1025         }
1026       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1027                || (coding->flags /* We are recovering a file.  */
1028                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1029                    && ! CHAR_HEAD_P (src[1])))
1030         {
               /* Copy the sequence as it is.  */
1031           p = src;
1032           src += bytes;
1033         }
1034       else
1035         {
1036           int i, c;
1037
               /* Scan as many bytes as the first byte announces, stopping
                  at the first character head.  */
1038           bytes = BYTES_BY_CHAR_HEAD (*src);
1039           src++;
1040           for (i = 1; i < bytes; i++)
1041             {
1042               ONE_MORE_BYTE (c);
1043               if (CHAR_HEAD_P (c))
1044                 break;
1045             }
1046           if (i < bytes)
1047             {
                   /* Cut short: emit only the first byte, converted by
                      CHAR_STRING, and resume right after it.  */
1048               bytes = CHAR_STRING (*src_base, tmp);
1049               p = tmp;
1050               src = src_base + 1;
1051             }
1052           else
1053             {
1054               p = src_base;
1055             }
1056         }
           /* Strictly more room than BYTES is required before the
              character is stored.  */
1057       if (dst + bytes >= (dst_bytes ? dst_end : src))
1058         {
1059           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1060           break;
1061         }
1062       while (bytes--) *dst++ = *p++;
1063       coding->produced_char++;
1064     }
       /* SRC_BASE is the start of the unit that was not completed, or
          the position where the loop test failed.  */
1065  label_end_of_loop:
1066   coding->consumed = coding->consumed_char = src_base - source;
1067   coding->produced = dst - destination;
1068 }
1069
1070
1071 /* Encode composition data stored at DATA into a special byte sequence
1072    starting with 0x80.  Update CODING->cmp_data_start and maybe
1073    CODING->cmp_data for the next call.

        The macro reads DATA[0] as the length of the entry, DATA[3] as the
        composition method, DATA[2] - DATA[1] as the number of composed
        characters, and DATA[4] onwards as the components.  The sequence
        produced is
          0x80  0xF0+METHOD  0xA0+BYTES  0xA0+CHARS  COMPONENT ...
        where BYTES is the length of the whole sequence including these
        four bytes.  Each component character is written with
        CHAR_STRING; for the two rule-based methods every rule between
        two characters is written as the two bytes 0x20+GREF and
        0x20+NREF.

        The macro uses DST, DST_END, DST_BYTES and SRC of the enclosing
        function.  If the destination is too small it sets CODING->result
        to CODING_FINISH_INSUFFICIENT_DST and jumps to label_end_of_loop
        without emitting anything.

        NOTE(review): the staging buffer is a fixed 1024 bytes and writes
        to it are not bounds-checked here; presumably the entry length
        keeps that safe -- confirm.  */
1074
1075 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1076   do {                                                                  \
1077     unsigned char buf[1024], *p0 = buf, *p;                             \
1078     int len = data[0];                                                  \
1079     int i;                                                              \
1080                                                                         \
1081     buf[0] = 0x80;                                                      \
1082     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1083     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1084     p = buf + 4;                                                        \
1085     if (data[3] == COMPOSITION_WITH_RULE                                \
1086         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1087       {                                                                 \
             /* First character, then (rule, character) pairs.  */          \
1088         p += CHAR_STRING (data[4], p);                                  \
1089         for (i = 5; i < len; i += 2)                                    \
1090           {                                                             \
1091             int gref, nref;                                             \
1092              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1093             *p++ = 0x20 + gref;                                         \
1094             *p++ = 0x20 + nref;                                         \
1095             p += CHAR_STRING (data[i + 1], p);                          \
1096           }                                                             \
1097       }                                                                 \
1098     else                                                                \
1099       {                                                                 \
1100         for (i = 4; i < len; i++)                                       \
1101           p += CHAR_STRING (data[i], p);                                \
1102       }                                                                 \
         /* Filled in last, once the total length is known.  */              \
1103     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1104                                                                         \
         /* NOTE(review): 4 bytes beyond the sequence are required;          \
            presumably room for the character that follows -- confirm.  */   \
1105     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1106       {                                                                 \
1107         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1108         goto label_end_of_loop;                                         \
1109       }                                                                 \
1110     while (p0 < p)                                                      \
1111       *dst++ = *p0++;                                                   \
         /* Step to the next entry, moving on to the next block of           \
            composition data when this one is used up.  */                   \
1112     coding->cmp_data_start += data[0];                                  \
1113     if (coding->cmp_data_start == coding->cmp_data->used                \
1114         && coding->cmp_data->next)                                      \
1115       {                                                                 \
1116         coding->cmp_data = coding->cmp_data->next;                      \
1117         coding->cmp_data_start = 0;                                     \
1118       }                                                                 \
1119   } while (0)
1120
1121
     /* Forward declaration: encode_coding_emacs_mule below calls
        encode_eol, whose definition is not above this point.  */
1122 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1123                             unsigned char *, int, int));
1124
     /* Encode SRC_BYTES bytes of text at SOURCE in the emacs-mule format
        at DESTINATION.  When CODING has no composition data the work is
        handed to encode_eol.  Otherwise characters are copied one at a
        time, with '\n' replaced by "\r\n" or '\r' when CODING->eol_type
        asks for it, and each composition recorded in CODING->cmp_data is
        written by ENCODE_COMPOSITION_EMACS_MULE just before the character
        at which it starts.  On exit CODING->consumed, CODING->produced
        and CODING->produced_char are set; CODING->consumed_char is
        incremented once per character encoded.

        NOTE(review): the `while (1)' loop has no visible exit;
        ONE_MORE_CHAR and the EMIT_* macros (defined elsewhere)
        presumably jump to label_end_of_loop when source or destination
        runs out -- confirm.  */
1125 static void
1126 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1127      struct coding_system *coding;
1128      const unsigned char *source;
1129      unsigned char *destination;
1130      int src_bytes, dst_bytes;
1131 {
1132   const unsigned char *src = source;
1133   const unsigned char *src_end = source + src_bytes;
1134   unsigned char *dst = destination;
1135   unsigned char *dst_end = destination + dst_bytes;
1136   const unsigned char *src_base;
1137   int c;
1138   int char_offset;
1139   int *data;
1140
       /* Not referenced explicitly in this function; presumably needed
          by ONE_MORE_CHAR -- confirm.  */
1141   Lisp_Object translation_table;
1142
1143   translation_table = Qnil;
1144
1145   /* Optimization for the case that there's no composition.  */
1146   if (!coding->cmp_data || coding->cmp_data->used == 0)
1147     {
1148       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1149       return;
1150     }
1151
       /* DATA points at the next composition entry still to be written.  */
1152   char_offset = coding->cmp_data->char_offset;
1153   data = coding->cmp_data->data + coding->cmp_data_start;
1154   while (1)
1155     {
1156       src_base = src;
1157
1158       /* If SRC starts a composition, encode the information about the
1159          composition in advance.  */
1160       if (coding->cmp_data_start < coding->cmp_data->used
1161           && char_offset + coding->consumed_char == data[1])
1162         {
1163           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
               /* The macro may have switched CODING->cmp_data; reload.  */
1164           char_offset = coding->cmp_data->char_offset;
1165           data = coding->cmp_data->data + coding->cmp_data_start;
1166         }
1167
1168       ONE_MORE_CHAR (c);
1169       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1170                         || coding->eol_type == CODING_EOL_CR))
1171         {
1172           if (coding->eol_type == CODING_EOL_CRLF)
1173             EMIT_TWO_BYTES ('\r', c);
1174           else
1175             EMIT_ONE_BYTE ('\r');
1176         }
1177       else if (SINGLE_BYTE_CHAR_P (c))
1178         {
1179           if (coding->flags && ! ASCII_BYTE_P (c))
1180             {
1181               /* As we are auto saving, retain the multibyte form for
1182                  8-bit chars.  */
1183               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1184               int bytes = CHAR_STRING (c, buf);
1185
1186               if (bytes == 1)
1187                 EMIT_ONE_BYTE (buf[0]);
1188               else
1189                 EMIT_TWO_BYTES (buf[0], buf[1]);
1190             }
1191           else
1192             EMIT_ONE_BYTE (c);
1193         }
1194       else
             /* Copy the source bytes of the character just read.  */
1195         EMIT_BYTES (src_base, src);
1196       coding->consumed_char++;
1197     }
       /* SRC_BASE is the start of the last character attempted.  */
1198  label_end_of_loop:
1199   coding->consumed = src_base - source;
1200   coding->produced = coding->produced_char = dst - destination;
1201   return;
1202 }
1203
1204 \f
1205 /*** 3. ISO2022 handlers ***/
1206
1207 /* The following note describes the coding system ISO2022 briefly.
1208    Since the intention of this note is to help understand the
1209    functions in this file, some parts are NOT ACCURATE or are OVERLY
1210    SIMPLIFIED.  For thorough understanding, please refer to the
1211    original document of ISO2022.  This is equivalent to the standard
1212    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1213
1214    ISO2022 provides many mechanisms to encode several character sets
1215    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1216    is encoded using bytes less than 128.  This may make the encoded
1217    text a little bit longer, but the text passes more easily through
1218    several types of gateway, some of which strip off the MSB (Most
1219    Significant Bit).
1220
1221    There are two kinds of character sets: control character sets and
1222    graphic character sets.  The former contain control characters such
1223    as `newline' and `escape' to provide control functions (control
1224    functions are also provided by escape sequences).  The latter
1225    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1226    two control character sets and many graphic character sets.
1227
1228    Graphic character sets are classified into one of the following
1229    four classes, according to the number of bytes (DIMENSION) and
1230    number of characters in one dimension (CHARS) of the set:
1231    - DIMENSION1_CHARS94
1232    - DIMENSION1_CHARS96
1233    - DIMENSION2_CHARS94
1234    - DIMENSION2_CHARS96
1235
1236    In addition, each character set is assigned an identification tag,
1237    unique for each set, called the "final character" (denoted as <F>
1238    hereafter).  The <F> of each character set is decided by ECMA(*)
1239    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1240    (0x30..0x3F are for private use only).
1241
1242    Note (*): ECMA = European Computer Manufacturers Association
1243
1244    Here are examples of graphic character sets [NAME(<F>)]:
1245         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1246         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1247         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1248         o DIMENSION2_CHARS96 -- none for the moment
1249
1250    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1251         C0 [0x00..0x1F] -- control character plane 0
1252         GL [0x20..0x7F] -- graphic character plane 0
1253         C1 [0x80..0x9F] -- control character plane 1
1254         GR [0xA0..0xFF] -- graphic character plane 1
1255
1256    A control character set is directly designated and invoked to C0 or
1257    C1 by an escape sequence.  The most common case is that:
1258    - ISO646's  control character set is designated/invoked to C0, and
1259    - ISO6429's control character set is designated/invoked to C1,
1260    and usually these designations/invocations are omitted in encoded
1261    text.  In a 7-bit environment, only C0 can be used, and a control
1262    character for C1 is encoded by an appropriate escape sequence to
1263    fit into the environment.  All control characters for C1 are
1264    defined to have corresponding escape sequences.
1265
1266    A graphic character set is at first designated to one of four
1267    graphic registers (G0 through G3), then these graphic registers are
1268    invoked to GL or GR.  These designations and invocations can be
1269    done independently.  The most common case is that G0 is invoked to
1270    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1271    these invocations and designations are omitted in encoded text.
1272    In a 7-bit environment, only GL can be used.
1273
1274    When a graphic character set of CHARS94 is invoked to GL, codes
1275    0x20 and 0x7F of the GL area work as control characters SPACE and
1276    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1277    be used.
1278
1279    There are two ways of invocation: locking-shift and single-shift.
1280    With locking-shift, the invocation lasts until the next different
1281    invocation, whereas with single-shift, the invocation affects the
1282    following character only and doesn't affect the locking-shift
1283    state.  Invocations are done by the following control characters or
1284    escape sequences:
1285
1286    ----------------------------------------------------------------------
1287    abbrev  function                  cntrl escape seq   description
1288    ----------------------------------------------------------------------
1289    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1290    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1291    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1292    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1293    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1294    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1295    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
1296    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1297    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1298    ----------------------------------------------------------------------
1299    (*) These are not used by any known coding system.
1300
1301    Control characters for these functions are defined by macros
1302    ISO_CODE_XXX in `coding.h'.
1303
1304    Designations are done by the following escape sequences:
1305    ----------------------------------------------------------------------
1306    escape sequence      description
1307    ----------------------------------------------------------------------
1308    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1309    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1310    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1311    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1312    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1313    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1314    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1315    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1316    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1317    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1318    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1319    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1320    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1321    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1322    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1323    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1324    ----------------------------------------------------------------------
1325
1326    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1327    of dimension 1, chars 94, and final character <F>, etc...
1328
1329    Note (*): Although these designations are not allowed in ISO2022,
1330    Emacs accepts them on decoding, and produces them on encoding
1331    CHARS96 character sets in a coding system which is characterized as
1332    7-bit environment, non-locking-shift, and non-single-shift.
1333
1334    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1335    '(' can be omitted.  We refer to this as "short-form" hereafter.
1336
1337    Now you may notice that there are a lot of ways of encoding the
1338    same multilingual text in ISO2022.  Actually, there exist many
1339    coding systems such as Compound Text (used in X11's inter client
1340    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1341    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1342    localized platforms), and all of these are variants of ISO2022.
1343
1344    In addition to the above, Emacs handles two more kinds of escape
1345    sequences: ISO6429's direction specification and Emacs' private
1346    sequence for specifying character composition.
1347
1348    ISO6429's direction specification takes the following form:
1349         o CSI ']'      -- end of the current direction
1350         o CSI '0' ']'  -- end of the current direction
1351         o CSI '1' ']'  -- start of left-to-right text
1352         o CSI '2' ']'  -- start of right-to-left text
1353    The control character CSI (0x9B: control sequence introducer) is
1354    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1355
1356    Character composition specification takes the following form:
1357         o ESC '0' -- start relative composition
1358         o ESC '1' -- end composition
1359         o ESC '2' -- start rule-base composition (*)
1360         o ESC '3' -- start relative composition with alternate chars  (**)
1361         o ESC '4' -- start rule-base composition with alternate chars  (**)
1362   Since these are not standard escape sequences of any ISO standard,
1363   the use of them with these meanings is restricted to Emacs only.
1364
1365   (*) This form is used only in Emacs 20.5 and older versions,
1366   but the newer versions can safely decode it.
1367   (**) This form is used only in Emacs 21.1 and newer versions,
1368   and the older versions can't decode it.
1369
1370   Here's a list of example usages of these composition escape
1371   sequences (categorized by `enum composition_method').
1372
1373   COMPOSITION_RELATIVE:
1374         ESC 0 CHAR [ CHAR ] ESC 1
1375   COMPOSITION_WITH_RULE:
1376         ESC 2 CHAR [ RULE CHAR ] ESC 1
1377   COMPOSITION_WITH_ALTCHARS:
1378         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1379   COMPOSITION_WITH_RULE_ALTCHARS:
1380         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1381
     /* Table of 256 entries of type `enum iso_code_class_type';
        presumably indexed by byte value and filled in elsewhere --
        confirm.  */
1382 enum iso_code_class_type iso_code_class[256];
1383
     /* Nonzero if coding_system_table[IDX] is set, character C is
        acceptable to it (CHARSET is CHARSET_ASCII, or C passes
        CODING_SAFE_CHAR_P for that coding system's safe chars), and that
        coding system has a requested designation for CHARSET.  As a side
        effect the macro may assign the variable `safe_chars' of the
        enclosing function.  */
1384 #define CHARSET_OK(idx, charset, c)                                     \
1385   (coding_system_table[idx]                                             \
1386    && (charset == CHARSET_ASCII                                         \
1387        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1388            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1389    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1390                                               charset)                  \
1391        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1392
     /* Nonzero if CODING_SPEC_ISO_INITIAL_DESIGNATION for
        coding_system_table[IDX] and register number 1 is non-negative.
        The table entry is passed on without a null check.  */
1393 #define SHIFT_OUT_OK(idx) \
1394   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1395
     /* Nonzero unless the `composing' member of coding_system_table[IDX]
        is COMPOSITION_DISABLED.  The table entry is dereferenced without
        a null check.  */
1396 #define COMPOSITION_OK(idx)     \
1397   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1398
1399 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1400    Check if a text is encoded in ISO2022.  If it is, return an
1401    integer in which the appropriate flag bits, any of:
1402         CODING_CATEGORY_MASK_ISO_7
1403         CODING_CATEGORY_MASK_ISO_7_TIGHT
1404         CODING_CATEGORY_MASK_ISO_8_1
1405         CODING_CATEGORY_MASK_ISO_8_2
1406         CODING_CATEGORY_MASK_ISO_7_ELSE
1407         CODING_CATEGORY_MASK_ISO_8_ELSE
1408    are set.  If a code which should never appear in ISO2022 is found,
1409    returns 0.
1410
1411    If *latin_extra_code_state is zero and Latin extra codes are found,
1412    set *latin_extra_code_state to 1 and return 0.  If it is nonzero,
1413    accept Latin extra codes.  */
1414
1415 static int
1416 detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
1417      unsigned char *src, *src_end;
1418      int multibytep;
1419      int *latin_extra_code_state;
1420 {
1421   int mask = CODING_CATEGORY_MASK_ISO;
1422   int mask_found = 0;
1423   int reg[4], shift_out = 0, single_shifting = 0;
1424   int c, c1, charset;
1425   /* Dummy for ONE_MORE_BYTE.  */
1426   struct coding_system dummy_coding;
1427   struct coding_system *coding = &dummy_coding;
1428   Lisp_Object safe_chars;
1429
1430   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1431   while (mask)
1432     {
1433       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1434     retry:
1435       switch (c)
1436         {
1437         case ISO_CODE_ESC:
1438           if (inhibit_iso_escape_detection)
1439             break;
1440           single_shifting = 0;
1441           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1442           if (c >= '(' && c <= '/')
1443             {
1444               /* Designation sequence for a charset of dimension 1.  */
1445               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
1446               if (c1 < ' ' || c1 >= 0x80
1447                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1448                 /* Invalid designation sequence.  Just ignore.  */
1449                 break;
1450               reg[(c - '(') % 4] = charset;
1451             }
1452           else if (c == '$')
1453             {
1454               /* Designation sequence for a charset of dimension 2.  */
1455               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1456               if (c >= '@' && c <= 'B')
1457                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1458                 reg[0] = charset = iso_charset_table[1][0][c];
1459               else if (c >= '(' && c <= '/')
1460                 {
1461                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1462                                                  mask & mask_found);
1463                   if (c1 < ' ' || c1 >= 0x80
1464                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1465                     /* Invalid designation sequence.  Just ignore.  */
1466                     break;
1467                   reg[(c - '(') % 4] = charset;
1468                 }
1469               else
1470                 /* Invalid designation sequence.  Just ignore.  */
1471                 break;
1472             }
1473           else if (c == 'N' || c == 'O')
1474             {
1475               /* ESC <Fe> for SS2 or SS3.  */
1476               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1477               break;
1478             }
1479           else if (c >= '0' && c <= '4')
1480             {
1481               /* ESC <Fp> for start/end composition.  */
1482               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1483                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1484               else
1485                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1486               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1487                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1488               else
1489                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1490               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1491                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1492               else
1493                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1494               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1495                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1496               else
1497                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1498               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1499                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1500               else
1501                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1502               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1503                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1504               else
1505                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1506               break;
1507             }
1508           else
1509             /* Invalid escape sequence.  Just ignore.  */
1510             break;
1511
1512           /* We found a valid designation sequence for CHARSET.  */
1513           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1514           c = MAKE_CHAR (charset, 0, 0);
1515           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1516             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1517           else
1518             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1519           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1520             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1521           else
1522             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1523           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1524             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1525           else
1526             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1527           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1528             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1529           else
1530             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1531           break;
1532
1533         case ISO_CODE_SO:
1534           if (inhibit_iso_escape_detection)
1535             break;
1536           single_shifting = 0;
1537           if (shift_out == 0
1538               && (reg[1] >= 0
1539                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1540                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1541             {
1542               /* Locking shift out.  */
1543               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1544               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1545             }
1546           break;
1547
1548         case ISO_CODE_SI:
1549           if (inhibit_iso_escape_detection)
1550             break;
1551           single_shifting = 0;
1552           if (shift_out == 1)
1553             {
1554               /* Locking shift in.  */
1555               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1556               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1557             }
1558           break;
1559
1560         case ISO_CODE_CSI:
1561           single_shifting = 0;
1562         case ISO_CODE_SS2:
1563         case ISO_CODE_SS3:
1564           {
1565             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1566
1567             if (inhibit_iso_escape_detection)
1568               break;
1569             if (c != ISO_CODE_CSI)
1570               {
1571                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1572                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1573                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1574                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1575                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1576                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1577                 single_shifting = 1;
1578               }
1579             if (VECTORP (Vlatin_extra_code_table)
1580                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1581               {
1582                 if (! *latin_extra_code_state)
1583                   {
1584                     *latin_extra_code_state = 1;
1585                     return 0;
1586                   }
1587                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1588                     & CODING_FLAG_ISO_LATIN_EXTRA)
1589                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1590                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1591                     & CODING_FLAG_ISO_LATIN_EXTRA)
1592                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1593               }
1594             mask &= newmask;
1595             mask_found |= newmask;
1596           }
1597           break;
1598
1599         default:
1600           if (c < 0x80)
1601             {
1602               single_shifting = 0;
1603               break;
1604             }
1605           else if (c < 0xA0)
1606             {
1607               single_shifting = 0;
1608               if (VECTORP (Vlatin_extra_code_table)
1609                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1610                 {
1611                   int newmask = 0;
1612
1613                   if (! *latin_extra_code_state)
1614                     {
1615                       *latin_extra_code_state = 1;
1616                       return 0;
1617                     }
1618                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1619                       & CODING_FLAG_ISO_LATIN_EXTRA)
1620                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1621                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1622                       & CODING_FLAG_ISO_LATIN_EXTRA)
1623                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1624                   mask &= newmask;
1625                   mask_found |= newmask;
1626                 }
1627               else
1628                 return 0;
1629             }
1630           else
1631             {
1632               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1633                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1634               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1635               /* Check the length of succeeding codes of the range
1636                  0xA0..0FF.  If the byte length is odd, we exclude
1637                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1638                  when we are not single shifting.  */
1639               if (!single_shifting
1640                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1641                 {
1642                   int i = 1;
1643
1644                   c = -1;
1645                   while (src < src_end)
1646                     {
1647                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1648                                                      mask & mask_found);
1649                       if (c < 0xA0)
1650                         break;
1651                       i++;
1652                     }
1653
1654                   if (i & 1 && src < src_end)
1655                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1656                   else
1657                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1658                   if (c >= 0)
1659                     /* This means that we have read one extra byte.  */
1660                     goto retry;
1661                 }
1662             }
1663           break;
1664         }
1665     }
1666   return (mask & mask_found);
1667 }
1668
1669 /* Decode a character of which charset is CHARSET, the 1st position
1670    code is C1, the 2nd position code is C2, and return the decoded
1671    character code.  If the variable `translation_table' is non-nil,
1672    return the code produced by translate_char with that table instead
        of the plain MAKE_CHAR result.  `translation_table' is taken from
        the scope in which the macro is expanded.  */
1673
1674 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1675   (NILP (translation_table)                     \
1676    ? MAKE_CHAR (charset, c1, c2)                \
1677    : translate_char (translation_table, -1, charset, c1, c2))
1678
1679 /* Set designation state into CODING: designate to graphic register
        REG the charset that ISO_CHARSET_TABLE gives for DIMENSION, CHARS
        and FINAL_CHAR.

        Control jumps to `label_invalid_code' when FINAL_CHAR is below
        '0' or not below 128, when no charset is found, or when the
        charset is neither requested for REG by CODING nor safe according
        to `safe_chars'.  In the last two cases REG is first recorded in
        last_invalid_designation_register.

        The macro uses `coding', `safe_chars' and the label
        `label_invalid_code' from the scope where it is expanded, and
        evaluates REG and FINAL_CHAR more than once.  */
1680 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1681   do {                                                                     \
1682     int charset, c;                                                        \
1683                                                                            \
1684     if ((final_char) < '0' || (final_char) >= 128)                         \
1685       goto label_invalid_code;                                             \
1686     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1687                                  make_number (chars),                      \
1688                                  make_number (final_char));                \
1689     c = MAKE_CHAR (charset, 0, 0);                                         \
1690     if (charset >= 0                                                       \
1691         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
1692             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1693       {                                                                    \
1694         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1695             && (reg) == 0                                                  \
1696             && charset == CHARSET_ASCII)                                   \
1697           {                                                                \
1698             /* We should insert this designation sequence as is so         \
1699                that it is surely written back to a file.  */               \
1700             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1701             goto label_invalid_code;                                       \
1702           }                                                                \
1703         coding->spec.iso2022.last_invalid_designation_register = -1;       \
             /* With the direction bit set in coding->mode, designate the       \
                reverse charset when one exists.  */                            \
1704         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1705             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1706           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1707         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1708       }                                                                    \
1709     else                                                                   \
1710       {                                                                    \
1711         coding->spec.iso2022.last_invalid_designation_register = (reg);    \
1712         goto label_invalid_code;                                           \
1713       }                                                                    \
1714   } while (0)
1715
1716 /* Allocate a new block for composition information, record
1717    CHAR_OFFSET in it, and append it to CODING's chain of blocks,
        making it the current one.  Only the char_offset, used, prev and
        next members of the new block are initialized here.  CODING's
        cmp_data_start is reset to 0 and its composing state to
        COMPOSITION_NO.  */
1718
1719 void
1720 coding_allocate_composition_data (coding, char_offset)
1721      struct coding_system *coding;
1722      int char_offset;
1723 {
1724   struct composition_data *block
1725     = (struct composition_data *) xmalloc (sizeof (struct composition_data));
1726
1727   block->used = 0;
1728   block->char_offset = char_offset;
1729   block->next = NULL;
1730   block->prev = coding->cmp_data;
1731   if (block->prev != NULL)
1732     block->prev->next = block;
1733   coding->composing = COMPOSITION_NO;
1734   coding->cmp_data_start = 0;
1735   coding->cmp_data = block;
1736 }
1737
1738 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1739    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1740    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1741    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1742    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

        C1 is the byte that followed ESC; it is evaluated more than once.
        If composition is disabled for CODING, ESC and the low seven bits
        of C1 are copied to DST and counted as two produced characters.
        If no composition is in progress, a new one is started with the
        method selected by C1, unless coding->cmp_data lacks room, in
        which case coding->result is set to CODING_FINISH_INSUFFICIENT_CMP
        and control jumps to `label_end_of_loop'.  If a composition with
        alternate characters is already in progress, the method is
        switched to COMPOSITION_RELATIVE.

        Uses `coding', `dst' and the label `label_end_of_loop' from the
        scope where it is expanded.
1743   */
1744
1745 #define DECODE_COMPOSITION_START(c1)                                       \
1746   do {                                                                     \
1747     if (coding->composing == COMPOSITION_DISABLED)                         \
1748       {                                                                    \
1749         *dst++ = ISO_CODE_ESC;                                             \
1750         *dst++ = (c1) & 0x7f;                                              \
1751         coding->produced_char += 2;                                        \
1752       }                                                                    \
1753     else if (!COMPOSING_P (coding))                                        \
1754       {                                                                    \
1755         /* This is surely the start of a composition.  We must be sure     \
1756            that coding->cmp_data has enough space to store the             \
1757            information about the composition.  If not, terminate the       \
1758            current decoding loop, allocate one more memory block for       \
1759            coding->cmp_data in the caller, then start the decoding         \
1760            loop again.  We can't allocate memory here directly because     \
1761            it may cause buffer/string relocation.  */                      \
1762         if (!coding->cmp_data                                              \
1763             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1764                 >= COMPOSITION_DATA_SIZE))                                 \
1765           {                                                                \
1766             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1767             goto label_end_of_loop;                                        \
1768           }                                                                \
1769         coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE            \
1770                              : (c1) == '2' ? COMPOSITION_WITH_RULE         \
1771                              : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS     \
1772                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1773         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1774                                       coding->composing);                  \
1775         coding->composition_rule_follows = 0;                              \
1776       }                                                                    \
1777     else                                                                   \
1778       {                                                                    \
1779         /* We are already handling a composition.  If the method is        \
1780            the following two, the codes following the current escape       \
1781            sequence are actual characters stored in a buffer.  */          \
1782         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1783             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1784           {                                                                \
1785             coding->composing = COMPOSITION_RELATIVE;                      \
1786             coding->composition_rule_follows = 0;                          \
1787           }                                                                \
1788       }                                                                    \
1789   } while (0)
1790
1791 /* Handle composition end sequence ESC 1.  If CODING is not in the
        middle of a composition, copy ESC and C1 to DST as they are and
        count two produced characters; otherwise record the end of the
        composition at the current produced-character count and set the
        composing state to COMPOSITION_NO.  Uses `coding' and `dst' from
        the scope where it is expanded.  */
1792
1793 #define DECODE_COMPOSITION_END(c1)                                      \
1794   do {                                                                  \
1795     if (! COMPOSING_P (coding))                                         \
1796       {                                                                 \
1797         *dst++ = ISO_CODE_ESC;                                          \
1798         *dst++ = c1;                                                    \
1799         coding->produced_char += 2;                                     \
1800       }                                                                 \
1801     else                                                                \
1802       {                                                                 \
1803         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1804         coding->composing = COMPOSITION_NO;                             \
1805       }                                                                 \
1806   } while (0)
1807
1808 /* Decode a composition rule from the byte C1 (and maybe one more byte
1809    from SRC) and store one encoded composition rule in
1810    coding->cmp_data.  C1 must be a modifiable lvalue: 32 is subtracted
        from it in place.  After that, a value below 81 is taken as the
        old one-byte format (C1 / 9 and C1 % 9, with 4 replaced by 10 in
        either), and a value from 81 to 92 as the new format, whose second
        byte is read with ONE_MORE_BYTE into the variable `c2' of the
        enclosing scope.  For any larger value the rule recorded is 0.
        Finally coding->composition_rule_follows is cleared.  */
1811
1812 #define DECODE_COMPOSITION_RULE(c1)                                     \
1813   do {                                                                  \
1814     int rule = 0;                                                       \
1815     (c1) -= 32;                                                         \
1816     if (c1 < 81)                /* old format (before ver.21) */        \
1817       {                                                                 \
1818         int gref = (c1) / 9;                                            \
1819         int nref = (c1) % 9;                                            \
1820         if (gref == 4) gref = 10;                                       \
1821         if (nref == 4) nref = 10;                                       \
1822         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1823       }                                                                 \
1824     else if (c1 < 93)           /* new format (after ver.21) */         \
1825       {                                                                 \
1826         ONE_MORE_BYTE (c2);                                             \
1827         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1828       }                                                                 \
1829     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1830     coding->composition_rule_follows = 0;                               \
1831   } while (0)
1832
1833
1834 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1835
1836 static void
1837 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1838      struct coding_system *coding;
1839      const unsigned char *source;
1840      unsigned char *destination;
1841      int src_bytes, dst_bytes;
1842 {
1843   const unsigned char *src = source;
1844   const unsigned char *src_end = source + src_bytes;
1845   unsigned char *dst = destination;
1846   unsigned char *dst_end = destination + dst_bytes;
1847   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1848   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1849   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1850   /* SRC_BASE remembers the start position in source in each loop.
1851      The loop will be exited when there's not enough source code
1852      (within macro ONE_MORE_BYTE), or when there's not enough
1853      destination area to produce a character (within macro
1854      EMIT_CHAR).  */
1855   const unsigned char *src_base;
1856   int c, charset;
1857   Lisp_Object translation_table;
1858   Lisp_Object safe_chars;
1859
1860   safe_chars = coding_safe_chars (coding->symbol);
1861
1862   if (NILP (Venable_character_translation))
1863     translation_table = Qnil;
1864   else
1865     {
1866       translation_table = coding->translation_table_for_decode;
1867       if (NILP (translation_table))
1868         translation_table = Vstandard_translation_table_for_decode;
1869     }
1870
1871   coding->result = CODING_FINISH_NORMAL;
1872
1873   while (1)
1874     {
1875       int c1, c2 = 0;
1876
1877       src_base = src;
1878       ONE_MORE_BYTE (c1);
1879
1880       /* We produce no character or one character.  */
1881       switch (iso_code_class [c1])
1882         {
1883         case ISO_0x20_or_0x7F:
1884           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1885             {
1886               DECODE_COMPOSITION_RULE (c1);
1887               continue;
1888             }
1889           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1890             {
1891               /* This is SPACE or DEL.  */
1892               charset = CHARSET_ASCII;
1893               break;
1894             }
1895           /* This is a graphic character, we fall down ...  */
1896
1897         case ISO_graphic_plane_0:
1898           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1899             {
1900               DECODE_COMPOSITION_RULE (c1);
1901               continue;
1902             }
1903           charset = charset0;
1904           break;
1905
1906         case ISO_0xA0_or_0xFF:
1907           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1908               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1909             goto label_invalid_code;
1910           /* This is a graphic character, we fall down ... */
1911
1912         case ISO_graphic_plane_1:
1913           if (charset1 < 0)
1914             goto label_invalid_code;
1915           charset = charset1;
1916           break;
1917
1918         case ISO_control_0:
1919           if (COMPOSING_P (coding))
1920             DECODE_COMPOSITION_END ('1');
1921
1922           /* All ISO2022 control characters in this class have the
1923              same representation in Emacs internal format.  */
1924           if (c1 == '\n'
1925               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1926               && (coding->eol_type == CODING_EOL_CR
1927                   || coding->eol_type == CODING_EOL_CRLF))
1928             {
1929               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1930               goto label_end_of_loop;
1931             }
1932           charset = CHARSET_ASCII;
1933           break;
1934
1935         case ISO_control_1:
1936           if (COMPOSING_P (coding))
1937             DECODE_COMPOSITION_END ('1');
1938           goto label_invalid_code;
1939
1940         case ISO_carriage_return:
1941           if (COMPOSING_P (coding))
1942             DECODE_COMPOSITION_END ('1');
1943
1944           if (coding->eol_type == CODING_EOL_CR)
1945             c1 = '\n';
1946           else if (coding->eol_type == CODING_EOL_CRLF)
1947             {
1948               ONE_MORE_BYTE (c1);
1949               if (c1 != ISO_CODE_LF)
1950                 {
1951                   src--;
1952                   c1 = '\r';
1953                 }
1954             }
1955           charset = CHARSET_ASCII;
1956           break;
1957
1958         case ISO_shift_out:
1959           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1960               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1961             goto label_invalid_code;
1962           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1963           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1964           continue;
1965
1966         case ISO_shift_in:
1967           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1968             goto label_invalid_code;
1969           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1970           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1971           continue;
1972
1973         case ISO_single_shift_2_7:
1974         case ISO_single_shift_2:
1975           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1976             goto label_invalid_code;
1977           /* SS2 is handled as an escape sequence of ESC 'N' */
1978           c1 = 'N';
1979           goto label_escape_sequence;
1980
1981         case ISO_single_shift_3:
1982           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1983             goto label_invalid_code;
1984           /* SS2 is handled as an escape sequence of ESC 'O' */
1985           c1 = 'O';
1986           goto label_escape_sequence;
1987
1988         case ISO_control_sequence_introducer:
1989           /* CSI is handled as an escape sequence of ESC '[' ...  */
1990           c1 = '[';
1991           goto label_escape_sequence;
1992
1993         case ISO_escape:
1994           ONE_MORE_BYTE (c1);
1995         label_escape_sequence:
1996           /* Escape sequences handled by Emacs are invocation,
1997              designation, direction specification, and character
1998              composition specification.  */
1999           switch (c1)
2000             {
2001             case '&':           /* revision of following character set */
2002               ONE_MORE_BYTE (c1);
2003               if (!(c1 >= '@' && c1 <= '~'))
2004                 goto label_invalid_code;
2005               ONE_MORE_BYTE (c1);
2006               if (c1 != ISO_CODE_ESC)
2007                 goto label_invalid_code;
2008               ONE_MORE_BYTE (c1);
2009               goto label_escape_sequence;
2010
2011             case '$':           /* designation of 2-byte character set */
2012               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2013                 goto label_invalid_code;
2014               ONE_MORE_BYTE (c1);
2015               if (c1 >= '@' && c1 <= 'B')
2016                 {       /* designation of JISX0208.1978, GB2312.1980,
2017                            or JISX0208.1980 */
2018                   DECODE_DESIGNATION (0, 2, 94, c1);
2019                 }
2020               else if (c1 >= 0x28 && c1 <= 0x2B)
2021                 {       /* designation of DIMENSION2_CHARS94 character set */
2022                   ONE_MORE_BYTE (c2);
2023                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2024                 }
2025               else if (c1 >= 0x2C && c1 <= 0x2F)
2026                 {       /* designation of DIMENSION2_CHARS96 character set */
2027                   ONE_MORE_BYTE (c2);
2028                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2029                 }
2030               else
2031                 goto label_invalid_code;
2032               /* We must update these variables now.  */
2033               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2034               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2035               continue;
2036
2037             case 'n':           /* invocation of locking-shift-2 */
2038               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2039                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2040                 goto label_invalid_code;
2041               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2042               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2043               continue;
2044
2045             case 'o':           /* invocation of locking-shift-3 */
2046               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2047                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2048                 goto label_invalid_code;
2049               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2050               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2051               continue;
2052
2053             case 'N':           /* invocation of single-shift-2 */
2054               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2055                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2056                 goto label_invalid_code;
2057               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2058               ONE_MORE_BYTE (c1);
2059               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2060                 goto label_invalid_code;
2061               break;
2062
2063             case 'O':           /* invocation of single-shift-3 */
2064               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2065                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2066                 goto label_invalid_code;
2067               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2068               ONE_MORE_BYTE (c1);
2069               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2070                 goto label_invalid_code;
2071               break;
2072
2073             case '0': case '2': case '3': case '4': /* start composition */
2074               DECODE_COMPOSITION_START (c1);
2075               continue;
2076
2077             case '1':           /* end composition */
2078               DECODE_COMPOSITION_END (c1);
2079               continue;
2080
2081             case '[':           /* specification of direction */
2082               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2083                 goto label_invalid_code;
2084               /* For the moment, nested direction is not supported.
2085                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2086                  left-to-right, and nonzero means right-to-left.  */
2087               ONE_MORE_BYTE (c1);
2088               switch (c1)
2089                 {
2090                 case ']':       /* end of the current direction */
2091                   coding->mode &= ~CODING_MODE_DIRECTION;
2092
2093                 case '0':       /* end of the current direction */
2094                 case '1':       /* start of left-to-right direction */
2095                   ONE_MORE_BYTE (c1);
2096                   if (c1 == ']')
2097                     coding->mode &= ~CODING_MODE_DIRECTION;
2098                   else
2099                     goto label_invalid_code;
2100                   break;
2101
2102                 case '2':       /* start of right-to-left direction */
2103                   ONE_MORE_BYTE (c1);
2104                   if (c1 == ']')
2105                     coding->mode |= CODING_MODE_DIRECTION;
2106                   else
2107                     goto label_invalid_code;
2108                   break;
2109
2110                 default:
2111                   goto label_invalid_code;
2112                 }
2113               continue;
2114
2115             case '%':
2116               if (COMPOSING_P (coding))
2117                 DECODE_COMPOSITION_END ('1');
2118               ONE_MORE_BYTE (c1);
2119               if (c1 == '/')
2120                 {
2121                   /* CTEXT extended segment:
2122                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2123                      We keep these bytes as is for the moment.
2124                      They may be decoded by post-read-conversion.  */
2125                   int dim, M, L;
2126                   int size, required;
2127                   int produced_chars;
2128
2129                   ONE_MORE_BYTE (dim);
2130                   ONE_MORE_BYTE (M);
2131                   ONE_MORE_BYTE (L);
2132                   size = ((M - 128) * 128) + (L - 128);
2133                   required = 8 + size * 2;
2134                   if (dst + required > (dst_bytes ? dst_end : src))
2135                     goto label_end_of_loop;
2136                   *dst++ = ISO_CODE_ESC;
2137                   *dst++ = '%';
2138                   *dst++ = '/';
2139                   *dst++ = dim;
2140                   produced_chars = 4;
2141                   dst += CHAR_STRING (M, dst), produced_chars++;
2142                   dst += CHAR_STRING (L, dst), produced_chars++;
2143                   while (size-- > 0)
2144                     {
2145                       ONE_MORE_BYTE (c1);
2146                       dst += CHAR_STRING (c1, dst), produced_chars++;
2147                     }
2148                   coding->produced_char += produced_chars;
2149                 }
2150               else if (c1 == 'G')
2151                 {
2152                   unsigned char *d = dst;
2153                   int produced_chars;
2154
2155                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2156                      ESC % G --UTF-8-BYTES-- ESC % @
2157                      We keep these bytes as is for the moment.
2158                      They may be decoded by post-read-conversion.  */
2159                   if (d + 6 > (dst_bytes ? dst_end : src))
2160                     goto label_end_of_loop;
2161                   *d++ = ISO_CODE_ESC;
2162                   *d++ = '%';
2163                   *d++ = 'G';
2164                   produced_chars = 3;
2165                   while (d + 1 < (dst_bytes ? dst_end : src))
2166                     {
2167                       ONE_MORE_BYTE (c1);
2168                       if (c1 == ISO_CODE_ESC
2169                           && src + 1 < src_end
2170                           && src[0] == '%'
2171                           && src[1] == '@')
2172                         {
2173                           src += 2;
2174                           break;
2175                         }
2176                       d += CHAR_STRING (c1, d), produced_chars++;
2177                     }
2178                   if (d + 3 > (dst_bytes ? dst_end : src))
2179                     goto label_end_of_loop;
2180                   *d++ = ISO_CODE_ESC;
2181                   *d++ = '%';
2182                   *d++ = '@';
2183                   dst = d;
2184                   coding->produced_char += produced_chars + 3;
2185                 }
2186               else
2187                 goto label_invalid_code;
2188               continue;
2189
2190             default:
2191               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2192                 goto label_invalid_code;
2193               if (c1 >= 0x28 && c1 <= 0x2B)
2194                 {       /* designation of DIMENSION1_CHARS94 character set */
2195                   ONE_MORE_BYTE (c2);
2196                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2197                 }
2198               else if (c1 >= 0x2C && c1 <= 0x2F)
2199                 {       /* designation of DIMENSION1_CHARS96 character set */
2200                   ONE_MORE_BYTE (c2);
2201                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2202                 }
2203               else
2204                 goto label_invalid_code;
2205               /* We must update these variables now.  */
2206               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2207               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2208               continue;
2209             }
2210         }
2211
2212       /* Now we know CHARSET and 1st position code C1 of a character.
2213          Produce a multibyte sequence for that character while getting
2214          2nd position code C2 if necessary.  */
2215       if (CHARSET_DIMENSION (charset) == 2)
2216         {
2217           ONE_MORE_BYTE (c2);
2218           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2219             /* C2 is not in a valid range.  */
2220             goto label_invalid_code;
2221         }
2222       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2223       EMIT_CHAR (c);
2224       continue;
2225
2226     label_invalid_code:
2227       coding->errors++;
2228       if (COMPOSING_P (coding))
2229         DECODE_COMPOSITION_END ('1');
2230       src = src_base;
2231       c = *src++;
2232       if (! NILP (translation_table))
2233         c = translate_char (translation_table, c, 0, 0, 0);
2234       EMIT_CHAR (c);
2235     }
2236
2237  label_end_of_loop:
2238   coding->consumed = coding->consumed_char = src_base - source;
2239   coding->produced = dst - destination;
2240   return;
2241 }
2242
2243
2244 /* ISO2022 encoding stuff.  */
2245
2246 /*
2247    Saying just "ISO2022" is not enough for encoding; we have to
2248    specify more details.  In Emacs, each ISO2022 coding system
2249    variant has the following specifications:
2250         1. Initial designation to G0 through G3.
2251         2. Allows short-form designation?
2252         3. ASCII should be designated to G0 before control characters?
2253         4. ASCII should be designated to G0 at end of line?
2254         5. 7-bit environment or 8-bit environment?
2255         6. Use locking-shift?
2256         7. Use Single-shift?
2257    And the following two are only for Japanese:
2258         8. Use ASCII in place of JIS0201-1976-Roman?
2259         9. Use JISX0208-1983 in place of JISX0208-1978?
2260    These specifications are encoded in `coding->flags' as flag bits
2261    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2262    details.
2263 */
2264
2265 /* Write at DST the escape sequence designating CHARSET to graphic
2266    register REG, advance DST, and record the designation in CODING.  A
2267    94-char CHARSET of dimension other than 1, going to register 0 with
2268    <final-char> '@', 'A', or 'B', gets the short form if CODING allows.  */
2269
2270 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
2271   do {                                                                  \
2272     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
2273     const char *intermediate_char_94 = "()*+";                          \
2274     const char *intermediate_char_96 = ",-./";                          \
2275     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
2276                                                                         \
2277     if (revision < 255) /* first: ESC & ('@' + revision) */             \
2278       {                                                                 \
2279         *dst++ = ISO_CODE_ESC;                                          \
2280         *dst++ = '&';                                                   \
2281         *dst++ = '@' + revision;                                        \
2282       }                                                                 \
2283     *dst++ = ISO_CODE_ESC;                                              \
2284     if (CHARSET_DIMENSION (charset) == 1)                               \
2285       {                                                                 \
2286         if (CHARSET_CHARS (charset) == 94)                              \
2287           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
2288         else                                                            \
2289           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2290       }                                                                 \
2291     else                                                                \
2292       {                                                                 \
2293         *dst++ = '$';                                                   \
2294         if (CHARSET_CHARS (charset) == 94)                              \
2295           {                                                             \
2296             if (! ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)        \
2297                 || (reg) != 0                                           \
2298                 || final_char < '@' || final_char > 'B')                \
2299               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
2300           }                                                             \
2301         else                                                            \
2302           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2303       }                                                                 \
2304     *dst++ = final_char;                                                \
2305     CODING_SPEC_ISO_DESIGNATION (coding, reg) = (charset);              \
2306   } while (0)
2307
2308 /* Single-shift macros.  Each of the following two writes a single-shift
2309    function at DST (ESC N or ESC O when CODING is 7-bit, the SS2 or SS3
2310    control code otherwise) and marks CODING as single-shifting.  */
2311
2312 #define ENCODE_SINGLE_SHIFT_2                           \
2313   do {                                                  \
2314     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
2315       *dst++ = ISO_CODE_SS2;                            \
2316     else                                                \
2317       { *dst++ = ISO_CODE_ESC; *dst++ = 'N'; }          \
2318     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2319   } while (0)
2320
2321 #define ENCODE_SINGLE_SHIFT_3                           \
2322   do { /* ESC O if CODING is 7-bit, else SS3.  */       \
2323     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
2324       *dst++ = ISO_CODE_SS3;                            \
2325     else                                                \
2326       { *dst++ = ISO_CODE_ESC; *dst++ = 'O'; }          \
2327     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2328   } while (0)
2329
2330 /* Locking-shift macros.  Each of the following four writes one
2331    function at DST (shift-in, shift-out, locking-shift-2, or
2332    locking-shift-3) and notes in CODING the register it invokes.  */
2333
2334 #define ENCODE_SHIFT_IN                         \
2335   do {                                          \
2336     *dst = ISO_CODE_SI; dst++;                  \
2337     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
2338   } while (0)
2339
2340 #define ENCODE_SHIFT_OUT                        \
2341   do { /* SO: register 1 is invoked.  */        \
2342     *dst = ISO_CODE_SO; dst++;                  \
2343     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
2344   } while (0)
2345
2346 #define ENCODE_LOCKING_SHIFT_2                  \
2347   do { /* ESC n: register 2 is invoked.  */     \
2348     *dst++ = ISO_CODE_ESC; *dst++ = 'n';        \
2349     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
2350   } while (0)
2351
2352 #define ENCODE_LOCKING_SHIFT_3                  \
2353   do { /* ESC o: register 3 is invoked.  */     \
2354     *dst++ = ISO_CODE_ESC; *dst++ = 'o';        \
2355     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
2356   } while (0)
2357
2358 /* Produce the code for a DIMENSION1 character of charset CHARSET whose
2359    position-code is C1.  If CHARSET is not invoked to plane 0 or 1 yet,
2360    designation and invocation sequences are produced first.  */
2361
2362 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
2363   do {                                                                  \
2364     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) /* shift pending */   \
2365       {                                                                 \
2366         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2367           *dst++ = c1 & 0x7F;                                           \
2368         else                                                            \
2369           *dst++ = c1 | 0x80;                                           \
2370         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2371         break;                                                          \
2372       }                                                                 \
2373     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2374       {                                                                 \
2375         *dst++ = c1 & 0x7F;     /* plane 0: high bit cleared */         \
2376         break;                                                          \
2377       }                                                                 \
2378     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2379       {                                                                 \
2380         *dst++ = c1 | 0x80;     /* plane 1: high bit set */             \
2381         break;                                                          \
2382       }                                                                 \
2383     else                                                                \
2384       /* CHARSET is invoked to neither graphic plane.  Write the codes  \
2385          that invoke it (designating it to a graphic register first     \
2386          if needed), then go around the loop again to produce the       \
2387          character itself.  */                                          \
2388       dst = encode_invocation_designation (charset, coding, dst);       \
2389   } while (1)
2390
2391 /* Produce the codes for a DIMENSION2 character of charset CHARSET whose
2392    position-codes are C1 and C2.  If CHARSET is not invoked to plane 0 or
2393    1 yet, designation and invocation codes are produced first.  */
2394
2395 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
2396   do {                                                                  \
2397     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) /* shift pending */   \
2398       {                                                                 \
2399         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2400           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
2401         else                                                            \
2402           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
2403         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2404         break;                                                          \
2405       }                                                                 \
2406     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2407       {                                                                 \
2408         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
2409         break;                                                          \
2410       }                                                                 \
2411     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2412       {                                                                 \
2413         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
2414         break;                                                          \
2415       }                                                                 \
2416     else                                                                \
2417       /* CHARSET is invoked to neither graphic plane.  Write the codes  \
2418          that invoke it (designating it to a graphic register first     \
2419          if needed), then go around the loop again to produce the       \
2420          character itself.  */                                          \
2421       dst = encode_invocation_designation (charset, coding, dst);       \
2422   } while (1)
2423 /* Write the ISO2022 codes for character C at DST.  */
2424 #define ENCODE_ISO_CHARACTER(c)                                 \
2425   do {                                                          \
2426     int charset, c1, c2;                                        \
2427                                                                 \
2428     SPLIT_CHAR (c, charset, c1, c2);                            \
2429     if (! (CHARSET_DEFINED_P (charset)))                        \
2430       {                                                         \
2431         /* No such charset: write the position codes as is.  */ \
2432         *dst++ = c1;                                            \
2433         if (c2 >= 0)                                            \
2434           *dst++ = c2;                                          \
2435       }                                                         \
2436     else if (CHARSET_DIMENSION (charset) == 1)                  \
2437       {                                                         \
2438         /* CODING may ask for JISX0201 Roman, not ASCII.  */    \
2439         if (charset == CHARSET_ASCII                            \
2440             && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))     \
2441           charset = charset_latin_jisx0201;                     \
2442         ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);          \
2443       }                                                         \
2444     else                                                        \
2445       {                                                         \
2446         /* CODING may ask for JISX0208-1978, not JISX0208.  */  \
2447         if (charset == charset_jisx0208                         \
2448             && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))    \
2449           charset = charset_jisx0208_1978;                      \
2450         ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);      \
2451       }                                                         \
2452   } while (0)
2453
2454
2455 /* Emit CODING_REPLACEMENT_CHARACTER in place of C; twice if C is wide.  */
2456
2457 #define ENCODE_UNSAFE_CHARACTER(c)                              \
2458   do {                                                          \
2459     ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
2460     if (CHARSET_WIDTH (CHAR_CHARSET (c)) >= 2)                  \
2461       ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);      \
2462   } while (0)
2463
2464
2465 /* Make CHARSET usable for output: write at DST whichever designation
2466    and invocation codes are still missing, and bring the element
2467    `spec.iso2022' of *CODING up to date.  Return the advanced DST.  */
2468
2469 unsigned char *
2470 encode_invocation_designation (charset, coding, dst)
2471      int charset;
2472      struct coding_system *coding;
2473      unsigned char *dst;
2474 {
2475   int reg = 0;                  /* graphic register number */
2476
2477   /* Step 1: look for a graphic register CHARSET is designated to.  */
2478   while (reg < 4
2479          && charset != CODING_SPEC_ISO_DESIGNATION (coding, reg))
2480     reg++;
2481
2482   if (reg == 4)
2483     {
2484       /* There is none.  Designate CHARSET to the register it
2485          requests, or to graphic register 0 when it requests no
2486          special designation.  */
2487       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2488       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2489         reg = 0;
2490
2491       ENCODE_DESIGNATION (charset, reg, coding);
2492     }
2493
2494   /* Step 2: nothing more to write if the graphic register REG is
2495      already invoked to graphic plane 0 or 1.  */
2496   if (CODING_SPEC_ISO_INVOCATION (coding, 0) == reg
2497       || CODING_SPEC_ISO_INVOCATION (coding, 1) == reg)
2498     return dst;
2499
2500   /* Step 3: write the shift function that gives access to REG.  */
2501   switch (reg)
2502     {
2503     case 0:                     /* graphic register 0 */
2504       ENCODE_SHIFT_IN;
2505       break;
2506
2507     case 1:                     /* graphic register 1 */
2508       ENCODE_SHIFT_OUT;
2509       break;
2510
2511     case 2:                     /* graphic register 2 */
2512       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2513         ENCODE_SINGLE_SHIFT_2;
2514       else
2515         ENCODE_LOCKING_SHIFT_2;
2516       break;
2517
2518     case 3:                     /* graphic register 3 */
2519       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2520         ENCODE_SINGLE_SHIFT_3;
2521       else
2522         ENCODE_LOCKING_SHIFT_3;
2523       break;
2524     }
2525
2526   return dst;
2527 }
2528
2529 /* Write the encoded composition rule RULE at DST as a 2-byte code.  */
2530 #define ENCODE_COMPOSITION_RULE(rule)           \
2531   do {                                          \
2532     int gref, nref;                             \
2533     COMPOSITION_DECODE_RULE (rule, gref, nref); \
2534     dst[0] = 32 + 81 + gref;                    \
2535     dst[1] = 32 + nref;                         \
2536     dst += 2;                                   \
2537   } while (0)
2538
2539 /* Write at DST the escape sequence that opens a composition: ESC 0
2540    when DATA[3] is COMPOSITION_RELATIVE, ESC 3 when it is
2541    COMPOSITION_WITH_ALTCHARS, ESC 4 for any other value.  DATA points to
2542    the integers describing the composition; see the comment in coding.h.  */
2543
2544 #define ENCODE_COMPOSITION_START(coding, data)                          \
2545   do {                                                                  \
2546     coding->composing = data[3];                                        \
2547     *dst++ = ISO_CODE_ESC;                                              \
2548     if (coding->composing != COMPOSITION_RELATIVE)                      \
2549       {                                                                 \
2550         *dst++ = (coding->composing != COMPOSITION_WITH_ALTCHARS        \
2551                   ? '4' : '3');                                         \
2552         coding->cmp_data_index = coding->cmp_data_start + 4;            \
2553         coding->composition_rule_follows = 0;                           \
2554       }                                                                 \
2555     else                                                                \
2556       *dst++ = '0';                                                     \
2557   } while (0)
2558
2559 /* Close the current composition: write ESC 1 at DST and advance the
2560    composition data position of CODING by DATA[0].  */
2561 #define ENCODE_COMPOSITION_END(coding, data)                    \
2562   do {                                                          \
2563     *dst++ = ISO_CODE_ESC, *dst++ = '1';                        \
2564     coding->cmp_data_start += data[0];                          \
2565     coding->composing = COMPOSITION_NO;                         \
2566     /* This data block is used up: go on to the next, if any.  */ \
2567     if (coding->cmp_data->next                                  \
2568         && coding->cmp_data_start == coding->cmp_data->used)    \
2569       {                                                         \
2570         coding->cmp_data = coding->cmp_data->next;              \
2571         coding->cmp_data_start = 0;                             \
2572       }                                                         \
2573   } while (0)
2574
2575 /* Write the composition start sequence ESC 0 at DST.  Here it does
2576    not open a new composition: it says that the components (alternate
2577    chars and composition rules) of the composition have just been
2578    produced and that the actual text follows in SRC.  */
2579
2580 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
2581   do {                                          \
2582     *dst++ = ISO_CODE_ESC, *dst++ = '0';        \
2583     /* From here on, encode as relative.  */    \
2584     coding->composing = COMPOSITION_RELATIVE;   \
2585   } while (0)
2586
2587 /* The following three macros produce codes for indicating direction of
2588    text.  This one writes CSI: ESC [ if CODING is 7-bit, else ISO_CODE_CSI.  */
2589 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
2590   do { /* Test the 7-bit flag bit, not equality.  */    \
2591     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2592       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
2593     else                                                \
2594       *dst++ = ISO_CODE_CSI;                            \
2595   } while (0)
2596
2597 #define ENCODE_DIRECTION_R2L    \
2598   ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst), *dst++ = '2', *dst++ = ']'
2599
2600 #define ENCODE_DIRECTION_L2R    \
2601   ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst), *dst++ = '0', *dst++ = ']'
2602
2603 /* Return the graphic planes and registers to their initial state:
2604    shift in if needed, then redo each changed initial designation.  */
2605 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
2606   do {                                                                      \
2607     int reg = 0;                                                            \
2608     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
2609       ENCODE_SHIFT_IN;                                                      \
2610     for (; reg < 4; reg++)                                                  \
2611       if (! (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0          \
2612              || (CODING_SPEC_ISO_DESIGNATION (coding, reg)                  \
2613                  == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))))    \
2614         ENCODE_DESIGNATION                                                  \
2615           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
2616   } while (0)
2617
2618 /* Scan the line of text starting at SRC and write at DST a designation
2619    sequence for each charset in it that requests a graphic register not
2620    holding that charset yet.  Return the updated DST.
2621    If the current block ends before any end-of-line, we may fail to
2622    find all the necessary designations.  */
2623
2624 static unsigned char *
2625 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2626      struct coding_system *coding;
2627      Lisp_Object translation_table;
2628      const unsigned char *src, *src_end;
2629      unsigned char *dst;
2630 {
2631   int charset, c, reg, n_found = 0;
2632   /* wanted[REG]: charset to designate to graphic register REG, or -1.  */
2633   int wanted[4];
2634
2635   wanted[0] = wanted[1] = wanted[2] = wanted[3] = -1;
2636
2637   while (n_found < 4)
2638     {
2639       ONE_MORE_CHAR (c);
2640       if (c == '\n')
2641         break;
2642
2643       charset = CHAR_CHARSET (c);
2644       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2645       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
2646           || wanted[reg] >= 0)
2647         continue;
2648       wanted[reg] = charset;
2649       n_found++;
2650     }
2651
2652  label_end_of_loop:
2653   if (n_found != 0)
2654     for (reg = 0; reg < 4; reg++)
2655       {
2656         if (wanted[reg] < 0
2657             || CODING_SPEC_ISO_DESIGNATION (coding, reg) == wanted[reg])
2658           continue;
2659         ENCODE_DESIGNATION (wanted[reg], reg, coding);
2660       }
2661
2662   return dst;
2663 }
2664
2665 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2666
2667 static void
2668 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2669      struct coding_system *coding;
2670      const unsigned char *source;
2671      unsigned char *destination;
2672      int src_bytes, dst_bytes;
2673 {
2674   const unsigned char *src = source;
2675   const unsigned char *src_end = source + src_bytes;
2676   unsigned char *dst = destination;
2677   unsigned char *dst_end = destination + dst_bytes;
2678   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2679      from DST_END to assure overflow checking is necessary only at the
2680      head of loop.  */
2681   unsigned char *adjusted_dst_end = dst_end - 19;
2682   /* SRC_BASE remembers the start position in source in each loop.
2683      The loop will be exited when there's not enough source text to
2684      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2685      there's not enough destination area to produce encoded codes
2686      (within macro EMIT_BYTES).  */
2687   const unsigned char *src_base;
2688   int c;
2689   Lisp_Object translation_table;
2690   Lisp_Object safe_chars;
2691
2692   if (coding->flags & CODING_FLAG_ISO_SAFE)
2693     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2694
2695   safe_chars = coding_safe_chars (coding->symbol);
2696
2697   if (NILP (Venable_character_translation))
2698     translation_table = Qnil;
2699   else
2700     {
2701       translation_table = coding->translation_table_for_encode;
2702       if (NILP (translation_table))
2703         translation_table = Vstandard_translation_table_for_encode;
2704     }
2705
2706   coding->consumed_char = 0;
2707   coding->errors = 0;
2708   while (1)
2709     {
2710       src_base = src;
2711
2712       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2713         {
2714           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2715           break;
2716         }
2717
2718       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2719           && CODING_SPEC_ISO_BOL (coding))
2720         {
2721           /* We have to produce designation sequences if any now.  */
2722           dst = encode_designation_at_bol (coding, translation_table,
2723                                            src, src_end, dst);
2724           CODING_SPEC_ISO_BOL (coding) = 0;
2725         }
2726
2727       /* Check composition start and end.  */
2728       if (coding->composing != COMPOSITION_DISABLED
2729           && coding->cmp_data_start < coding->cmp_data->used)
2730         {
2731           struct composition_data *cmp_data = coding->cmp_data;
2732           int *data = cmp_data->data + coding->cmp_data_start;
2733           int this_pos = cmp_data->char_offset + coding->consumed_char;
2734
2735           if (coding->composing == COMPOSITION_RELATIVE)
2736             {
2737               if (this_pos == data[2])
2738                 {
2739                   ENCODE_COMPOSITION_END (coding, data);
2740                   cmp_data = coding->cmp_data;
2741                   data = cmp_data->data + coding->cmp_data_start;
2742                 }
2743             }
2744           else if (COMPOSING_P (coding))
2745             {
2746               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2747               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2748                 /* We have consumed components of the composition.
2749                    What follows in SRC is the composition's base
2750                    text.  */
2751                 ENCODE_COMPOSITION_FAKE_START (coding);
2752               else
2753                 {
2754                   int c = cmp_data->data[coding->cmp_data_index++];
2755                   if (coding->composition_rule_follows)
2756                     {
2757                       ENCODE_COMPOSITION_RULE (c);
2758                       coding->composition_rule_follows = 0;
2759                     }
2760                   else
2761                     {
2762                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2763                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2764                         ENCODE_UNSAFE_CHARACTER (c);
2765                       else
2766                         ENCODE_ISO_CHARACTER (c);
2767                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2768                         coding->composition_rule_follows = 1;
2769                     }
2770                   continue;
2771                 }
2772             }
2773           if (!COMPOSING_P (coding))
2774             {
2775               if (this_pos == data[1])
2776                 {
2777                   ENCODE_COMPOSITION_START (coding, data);
2778                   continue;
2779                 }
2780             }
2781         }
2782
2783       ONE_MORE_CHAR (c);
2784
2785       /* Now encode the character C.  */
2786       if (c < 0x20 || c == 0x7F)
2787         {
2788           if (c == '\r')
2789             {
2790               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2791                 {
2792                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2793                     ENCODE_RESET_PLANE_AND_REGISTER;
2794                   *dst++ = c;
2795                   continue;
2796                 }
2797               /* fall down to treat '\r' as '\n' ...  */
2798               c = '\n';
2799             }
2800           if (c == '\n')
2801             {
2802               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2803                 ENCODE_RESET_PLANE_AND_REGISTER;
2804               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2805                 bcopy (coding->spec.iso2022.initial_designation,
2806                        coding->spec.iso2022.current_designation,
2807                        sizeof coding->spec.iso2022.initial_designation);
2808               if (coding->eol_type == CODING_EOL_LF
2809                   || coding->eol_type == CODING_EOL_UNDECIDED)
2810                 *dst++ = ISO_CODE_LF;
2811               else if (coding->eol_type == CODING_EOL_CRLF)
2812                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2813               else
2814                 *dst++ = ISO_CODE_CR;
2815               CODING_SPEC_ISO_BOL (coding) = 1;
2816             }
2817           else
2818             {
2819               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2820                 ENCODE_RESET_PLANE_AND_REGISTER;
2821               *dst++ = c;
2822             }
2823         }
2824       else if (ASCII_BYTE_P (c))
2825         ENCODE_ISO_CHARACTER (c);
2826       else if (SINGLE_BYTE_CHAR_P (c))
2827         {
2828           *dst++ = c;
2829           coding->errors++;
2830         }
2831       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2832                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2833         ENCODE_UNSAFE_CHARACTER (c);
2834       else
2835         ENCODE_ISO_CHARACTER (c);
2836
2837       coding->consumed_char++;
2838     }
2839
2840  label_end_of_loop:
2841   coding->consumed = src_base - source;
2842   coding->produced = coding->produced_char = dst - destination;
2843 }
2844
2845 \f
2846 /*** 4. SJIS and BIG5 handlers ***/
2847
2848 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2849    quite widely.  So, for the moment, Emacs supports them in the bare
2850    C code.  But, in the future, they may be supported only by CCL.  */
2851
2852 /* SJIS is a coding system encoding three character sets: ASCII, right
2853    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2854    as is.  A character of charset katakana-jisx0201 is encoded by
2855    "position-code + 0x80".  A character of charset japanese-jisx0208
2856    is encoded in two bytes, but its two position-codes are divided and
2857    shifted so that they fit in the ranges below.
2858
2859    --- CODE RANGE of SJIS ---
2860    (character set)      (range)
2861    ASCII                0x00 .. 0x7F
2862    KATAKANA-JISX0201    0xA1 .. 0xDF
2863    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2864             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2865    -------------------------------
2866
2867 */
2868
2869 /* BIG5 is a coding system encoding two character sets: ASCII and
2870    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2871    character set and is encoded in two bytes.
2872
2873    --- CODE RANGE of BIG5 ---
2874    (character set)      (range)
2875    ASCII                0x00 .. 0x7F
2876    Big5 (1st byte)      0xA1 .. 0xFE
2877         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2878    --------------------------
2879
2880    Since the number of characters in Big5 is larger than the maximum
2881    size of an Emacs charset (96x96), it can't be handled as one
2882    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2883    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2884    contains frequently used characters and the latter contains less
2885    frequently used characters.  */
2886
/* Macros to convert one Big5 character between its byte pair in the
   BIG5 coding system and Emacs' internal position-codes.  B1 and B2
   are the 1st and 2nd bytes in BIG5.  C1 and C2 are the 1st and 2nd
   position-codes of Emacs' internal format.  CHARSET is
   `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized in the expansion, so an arbitrary
   expression may be passed.  All inputs are read before any output is
   stored, so an output may be the same variable as an input, as in
   DECODE_BIG5 (c1, c2, charset, c1, c2).  Arguments are evaluated
   more than once, so they must not have side effects.  */

/* Number of Big5 characters which have the same code in 1st byte:
   94 2nd bytes in 0xA1..0xFE plus 63 2nd bytes in 0x40..0x7E.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1, and C2 to the charset and the position-codes of
   the character whose BIG5 code is B1 B2.  A 1st byte below 0xC9
   selects `charset_big5_1', any other one `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Set B1 and B2 to the BIG5 code of the character of CHARSET whose
   position-codes are C1 and C2.  This is the inverse of
   DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2919
2920 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2921    Check if a text is encoded in SJIS.  If it is, return
2922    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2923
2924 static int
2925 detect_coding_sjis (src, src_end, multibytep)
2926      unsigned char *src, *src_end;
2927      int multibytep;
2928 {
2929   int c;
2930   /* Dummy for ONE_MORE_BYTE.  */
2931   struct coding_system dummy_coding;
2932   struct coding_system *coding = &dummy_coding;
2933
2934   while (1)
2935     {
2936       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2937       if (c < 0x80)
2938         continue;
2939       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2940         return 0;
2941       if (c <= 0x9F || c >= 0xE0)
2942         {
2943           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2944           if (c < 0x40 || c == 0x7F || c > 0xFC)
2945             return 0;
2946         }
2947     }
2948 }
2949
2950 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2951    Check if a text is encoded in BIG5.  If it is, return
2952    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2953
2954 static int
2955 detect_coding_big5 (src, src_end, multibytep)
2956      unsigned char *src, *src_end;
2957      int multibytep;
2958 {
2959   int c;
2960   /* Dummy for ONE_MORE_BYTE.  */
2961   struct coding_system dummy_coding;
2962   struct coding_system *coding = &dummy_coding;
2963
2964   while (1)
2965     {
2966       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2967       if (c < 0x80)
2968         continue;
2969       if (c < 0xA1 || c > 0xFE)
2970         return 0;
2971       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2972       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2973         return 0;
2974     }
2975 }
2976
2977 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2978    Check if a text is encoded in UTF-8.  If it is, return
2979    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2980
/* Predicates on one octet C of a UTF-8 sequence.  An octet below
   0x80 is a sequence by itself.  The other octets are classified by
   their high bits: 10xxxxxx is an extra (trailing) octet, and
   110xxxxx, 1110xxxx, 11110xxx, 111110xx, 1111110x are the leading
   octets of 2- to 6-octet sequences.  */

/* Non-zero if the bits of C selected by MASK are equal to BITS.  */
#define UTF_8_OCTET_BITS_P(c, mask, bits) (((c) & (mask)) == (bits))

#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_BITS_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xFE, 0xFC)
2988
2989 static int
2990 detect_coding_utf_8 (src, src_end, multibytep)
2991      unsigned char *src, *src_end;
2992      int multibytep;
2993 {
2994   unsigned char c;
2995   int seq_maybe_bytes;
2996   /* Dummy for ONE_MORE_BYTE.  */
2997   struct coding_system dummy_coding;
2998   struct coding_system *coding = &dummy_coding;
2999
3000   while (1)
3001     {
3002       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
3003       if (UTF_8_1_OCTET_P (c))
3004         continue;
3005       else if (UTF_8_2_OCTET_LEADING_P (c))
3006         seq_maybe_bytes = 1;
3007       else if (UTF_8_3_OCTET_LEADING_P (c))
3008         seq_maybe_bytes = 2;
3009       else if (UTF_8_4_OCTET_LEADING_P (c))
3010         seq_maybe_bytes = 3;
3011       else if (UTF_8_5_OCTET_LEADING_P (c))
3012         seq_maybe_bytes = 4;
3013       else if (UTF_8_6_OCTET_LEADING_P (c))
3014         seq_maybe_bytes = 5;
3015       else
3016         return 0;
3017
3018       do
3019         {
3020           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3021           if (!UTF_8_EXTRA_OCTET_P (c))
3022             return 0;
3023           seq_maybe_bytes--;
3024         }
3025       while (seq_maybe_bytes > 0);
3026     }
3027 }
3028
3029 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3030    Check if a text starts with a UTF-16 byte order mark.  Return
3031    CODING_CATEGORY_MASK_UTF_16_BE if the first two bytes are FE FF,
3032    CODING_CATEGORY_MASK_UTF_16_LE if they are FF FE,
3033    else return 0.  */
3034
/* Non-zero if the 16-bit value VAL is one of the two values that
   this file treats as invalid in UTF-16 (0xFFFE and 0xFFFF).  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Non-zero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   The top six bits tell the two kinds of surrogates apart, so they
   all must be included in the mask.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Non-zero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3044
3045 static int
3046 detect_coding_utf_16 (src, src_end, multibytep)
3047      unsigned char *src, *src_end;
3048      int multibytep;
3049 {
3050   unsigned char c1, c2;
3051   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3052   struct coding_system dummy_coding;
3053   struct coding_system *coding = &dummy_coding;
3054
3055   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3056   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3057
3058   if ((c1 == 0xFF) && (c2 == 0xFE))
3059     return CODING_CATEGORY_MASK_UTF_16_LE;
3060   else if ((c1 == 0xFE) && (c2 == 0xFF))
3061     return CODING_CATEGORY_MASK_UTF_16_BE;
3062   return 0;
3063 }
3064
3065 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3066    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3067
3068 static void
3069 decode_coding_sjis_big5 (coding, source, destination,
3070                          src_bytes, dst_bytes, sjis_p)
3071      struct coding_system *coding;
3072      const unsigned char *source;
3073      unsigned char  *destination;
3074      int src_bytes, dst_bytes;
3075      int sjis_p;
3076 {
3077   const unsigned char *src = source;
3078   const unsigned char *src_end = source + src_bytes;
3079   unsigned char *dst = destination;
3080   unsigned char *dst_end = destination + dst_bytes;
3081   /* SRC_BASE remembers the start position in source in each loop.
3082      The loop will be exited when there's not enough source code
3083      (within macro ONE_MORE_BYTE), or when there's not enough
3084      destination area to produce a character (within macro
3085      EMIT_CHAR).  */
3086   const unsigned char *src_base;
3087   Lisp_Object translation_table;
3088
3089   if (NILP (Venable_character_translation))
3090     translation_table = Qnil;
3091   else
3092     {
3093       translation_table = coding->translation_table_for_decode;
3094       if (NILP (translation_table))
3095         translation_table = Vstandard_translation_table_for_decode;
3096     }
3097
3098   coding->produced_char = 0;
3099   while (1)
3100     {
3101       int c, charset, c1, c2 = 0;
3102
3103       src_base = src;
3104       ONE_MORE_BYTE (c1);
3105
3106       if (c1 < 0x80)
3107         {
3108           charset = CHARSET_ASCII;
3109           if (c1 < 0x20)
3110             {
3111               if (c1 == '\r')
3112                 {
3113                   if (coding->eol_type == CODING_EOL_CRLF)
3114                     {
3115                       ONE_MORE_BYTE (c2);
3116                       if (c2 == '\n')
3117                         c1 = c2;
3118                       else
3119                         /* To process C2 again, SRC is subtracted by 1.  */
3120                         src--;
3121                     }
3122                   else if (coding->eol_type == CODING_EOL_CR)
3123                     c1 = '\n';
3124                 }
3125               else if (c1 == '\n'
3126                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3127                        && (coding->eol_type == CODING_EOL_CR
3128                            || coding->eol_type == CODING_EOL_CRLF))
3129                 {
3130                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3131                   goto label_end_of_loop;
3132                 }
3133             }
3134         }
3135       else
3136         {
3137           if (sjis_p)
3138             {
3139               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3140                 goto label_invalid_code;
3141               if (c1 <= 0x9F || c1 >= 0xE0)
3142                 {
3143                   /* SJIS -> JISX0208 */
3144                   ONE_MORE_BYTE (c2);
3145                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3146                     goto label_invalid_code;
3147                   DECODE_SJIS (c1, c2, c1, c2);
3148                   charset = charset_jisx0208;
3149                 }
3150               else
3151                 /* SJIS -> JISX0201-Kana */
3152                 charset = charset_katakana_jisx0201;
3153             }
3154           else
3155             {
3156               /* BIG5 -> Big5 */
3157               if (c1 < 0xA0 || c1 > 0xFE)
3158                 goto label_invalid_code;
3159               ONE_MORE_BYTE (c2);
3160               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3161                 goto label_invalid_code;
3162               DECODE_BIG5 (c1, c2, charset, c1, c2);
3163             }
3164         }
3165
3166       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3167       EMIT_CHAR (c);
3168       continue;
3169
3170     label_invalid_code:
3171       coding->errors++;
3172       src = src_base;
3173       c = *src++;
3174       EMIT_CHAR (c);
3175     }
3176
3177  label_end_of_loop:
3178   coding->consumed = coding->consumed_char = src_base - source;
3179   coding->produced = dst - destination;
3180   return;
3181 }
3182
3183 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3184    This function can encode charsets `ascii', `katakana-jisx0201',
3185    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3186    are sure that all these charsets are registered as official charset
3187    (i.e. do not have extended leading-codes).  Characters of other
3188    charsets are produced without any encoding.  If SJIS_P is 1, encode
3189    SJIS text, else encode BIG5 text.  */
3190
3191 static void
3192 encode_coding_sjis_big5 (coding, source, destination,
3193                          src_bytes, dst_bytes, sjis_p)
3194      struct coding_system *coding;
3195      unsigned char *source, *destination;
3196      int src_bytes, dst_bytes;
3197      int sjis_p;
3198 {
3199   unsigned char *src = source;
3200   unsigned char *src_end = source + src_bytes;
3201   unsigned char *dst = destination;
3202   unsigned char *dst_end = destination + dst_bytes;
3203   /* SRC_BASE remembers the start position in source in each loop.
3204      The loop will be exited when there's not enough source text to
3205      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3206      there's not enough destination area to produce encoded codes
3207      (within macro EMIT_BYTES).  */
3208   unsigned char *src_base;
3209   Lisp_Object translation_table;
3210
3211   if (NILP (Venable_character_translation))
3212     translation_table = Qnil;
3213   else
3214     {
3215       translation_table = coding->translation_table_for_encode;
3216       if (NILP (translation_table))
3217         translation_table = Vstandard_translation_table_for_encode;
3218     }
3219
3220   while (1)
3221     {
3222       int c, charset, c1, c2;
3223
3224       src_base = src;
3225       ONE_MORE_CHAR (c);
3226
3227       /* Now encode the character C.  */
3228       if (SINGLE_BYTE_CHAR_P (c))
3229         {
3230           switch (c)
3231             {
3232             case '\r':
3233               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3234                 {
3235                   EMIT_ONE_BYTE (c);
3236                   break;
3237                 }
3238               c = '\n';
3239             case '\n':
3240               if (coding->eol_type == CODING_EOL_CRLF)
3241                 {
3242                   EMIT_TWO_BYTES ('\r', c);
3243                   break;
3244                 }
3245               else if (coding->eol_type == CODING_EOL_CR)
3246                 c = '\r';
3247             default:
3248               EMIT_ONE_BYTE (c);
3249             }
3250         }
3251       else
3252         {
3253           SPLIT_CHAR (c, charset, c1, c2);
3254           if (sjis_p)
3255             {
3256               if (charset == charset_jisx0208
3257                   || charset == charset_jisx0208_1978)
3258                 {
3259                   ENCODE_SJIS (c1, c2, c1, c2);
3260                   EMIT_TWO_BYTES (c1, c2);
3261                 }
3262               else if (charset == charset_katakana_jisx0201)
3263                 EMIT_ONE_BYTE (c1 | 0x80);
3264               else if (charset == charset_latin_jisx0201)
3265                 EMIT_ONE_BYTE (c1);
3266               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3267                 {
3268                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3269                   if (CHARSET_WIDTH (charset) > 1)
3270                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3271                 }
3272               else
3273                 /* There's no way other than producing the internal
3274                    codes as is.  */
3275                 EMIT_BYTES (src_base, src);
3276             }
3277           else
3278             {
3279               if (charset == charset_big5_1 || charset == charset_big5_2)
3280                 {
3281                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3282                   EMIT_TWO_BYTES (c1, c2);
3283                 }
3284               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3285                 {
3286                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3287                   if (CHARSET_WIDTH (charset) > 1)
3288                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3289                 }
3290               else
3291                 /* There's no way other than producing the internal
3292                    codes as is.  */
3293                 EMIT_BYTES (src_base, src);
3294             }
3295         }
3296       coding->consumed_char++;
3297     }
3298
3299  label_end_of_loop:
3300   coding->consumed = src_base - source;
3301   coding->produced = coding->produced_char = dst - destination;
3302 }
3303
3304 \f
3305 /*** 5. CCL handlers ***/
3306
3307 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3308    Check if a text is encoded in a coding system of which
3309    encoder/decoder are written in CCL program.  If it is, return
3310    CODING_CATEGORY_MASK_CCL, else return 0.  */
3311
3312 static int
3313 detect_coding_ccl (src, src_end, multibytep)
3314      unsigned char *src, *src_end;
3315      int multibytep;
3316 {
3317   unsigned char *valid;
3318   int c;
3319   /* Dummy for ONE_MORE_BYTE.  */
3320   struct coding_system dummy_coding;
3321   struct coding_system *coding = &dummy_coding;
3322
3323   /* No coding system is assigned to coding-category-ccl.  */
3324   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3325     return 0;
3326
3327   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3328   while (1)
3329     {
3330       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3331       if (! valid[c])
3332         return 0;
3333     }
3334 }
3335
3336 \f
3337 /*** 6. End-of-line handlers ***/
3338
3339 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3340
3341 static void
3342 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3343      struct coding_system *coding;
3344      const unsigned char *source;
3345      unsigned char *destination;
3346      int src_bytes, dst_bytes;
3347 {
3348   const unsigned char *src = source;
3349   unsigned char *dst = destination;
3350   const unsigned char *src_end = src + src_bytes;
3351   unsigned char *dst_end = dst + dst_bytes;
3352   Lisp_Object translation_table;
3353   /* SRC_BASE remembers the start position in source in each loop.
3354      The loop will be exited when there's not enough source code
3355      (within macro ONE_MORE_BYTE), or when there's not enough
3356      destination area to produce a character (within macro
3357      EMIT_CHAR).  */
3358   const unsigned char *src_base;
3359   int c;
3360
3361   translation_table = Qnil;
3362   switch (coding->eol_type)
3363     {
3364     case CODING_EOL_CRLF:
3365       while (1)
3366         {
3367           src_base = src;
3368           ONE_MORE_BYTE (c);
3369           if (c == '\r')
3370             {
3371               ONE_MORE_BYTE (c);
3372               if (c != '\n')
3373                 {
3374                   src--;
3375                   c = '\r';
3376                 }
3377             }
3378           else if (c == '\n'
3379                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3380             {
3381               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3382               goto label_end_of_loop;
3383             }
3384           EMIT_CHAR (c);
3385         }
3386       break;
3387
3388     case CODING_EOL_CR:
3389       while (1)
3390         {
3391           src_base = src;
3392           ONE_MORE_BYTE (c);
3393           if (c == '\n')
3394             {
3395               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3396                 {
3397                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3398                   goto label_end_of_loop;
3399                 }
3400             }
3401           else if (c == '\r')
3402             c = '\n';
3403           EMIT_CHAR (c);
3404         }
3405       break;
3406
3407     default:                    /* no need for EOL handling */
3408       while (1)
3409         {
3410           src_base = src;
3411           ONE_MORE_BYTE (c);
3412           EMIT_CHAR (c);
3413         }
3414     }
3415
3416  label_end_of_loop:
3417   coding->consumed = coding->consumed_char = src_base - source;
3418   coding->produced = dst - destination;
3419   return;
3420 }
3421
3422 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3423    format of end-of-line according to `coding->eol_type'.  It also
3424    convert multibyte form 8-bit characters to unibyte if
3425    CODING->src_multibyte is nonzero.  If `coding->mode &
3426    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3427    also means end-of-line.  */
3428
3429 static void
3430 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3431      struct coding_system *coding;
3432      const unsigned char *source;
3433      unsigned char *destination;
3434      int src_bytes, dst_bytes;
3435 {
3436   const unsigned char *src = source;
3437   unsigned char *dst = destination;
3438   const unsigned char *src_end = src + src_bytes;
3439   unsigned char *dst_end = dst + dst_bytes;
3440   Lisp_Object translation_table;
3441   /* SRC_BASE remembers the start position in source in each loop.
3442      The loop will be exited when there's not enough source text to
3443      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3444      there's not enough destination area to produce encoded codes
3445      (within macro EMIT_BYTES).  */
3446   const unsigned char *src_base;
3447   unsigned char *tmp;
3448   int c;
3449   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3450
3451   translation_table = Qnil;
3452   if (coding->src_multibyte
3453       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3454     {
3455       src_end--;
3456       src_bytes--;
3457       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3458     }
3459
3460   if (coding->eol_type == CODING_EOL_CRLF)
3461     {
3462       while (src < src_end)
3463         {
3464           src_base = src;
3465           c = *src++;
3466           if (c >= 0x20)
3467             EMIT_ONE_BYTE (c);
3468           else if (c == '\n' || (c == '\r' && selective_display))
3469             EMIT_TWO_BYTES ('\r', '\n');
3470           else
3471             EMIT_ONE_BYTE (c);
3472         }
3473       src_base = src;
3474     label_end_of_loop:
3475       ;
3476     }
3477   else
3478     {
3479       if (!dst_bytes || src_bytes <= dst_bytes)
3480         {
3481           safe_bcopy (src, dst, src_bytes);
3482           src_base = src_end;
3483           dst += src_bytes;
3484         }
3485       else
3486         {
3487           if (coding->src_multibyte
3488               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3489             dst_bytes--;
3490           safe_bcopy (src, dst, dst_bytes);
3491           src_base = src + dst_bytes;
3492           dst = destination + dst_bytes;
3493           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3494         }
3495       if (coding->eol_type == CODING_EOL_CR)
3496         {
3497           for (tmp = destination; tmp < dst; tmp++)
3498             if (*tmp == '\n') *tmp = '\r';
3499         }
3500       else if (selective_display)
3501         {
3502           for (tmp = destination; tmp < dst; tmp++)
3503             if (*tmp == '\r') *tmp = '\n';
3504         }
3505     }
3506   if (coding->src_multibyte)
3507     dst = destination + str_as_unibyte (destination, dst - destination);
3508
3509   coding->consumed = src_base - source;
3510   coding->produced = dst - destination;
3511   coding->produced_char = coding->produced;
3512 }
3513
3514 \f
3515 /*** 7. C library functions ***/
3516
3517 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3518    has a property `coding-system'.  The value of this property is a
3519    vector of length 5 (called the coding-vector).  Among elements of
3520    this vector, the first (element[0]) and the fifth (element[4])
3521    carry important information for decoding/encoding.  Before
3522    decoding/encoding, this information should be set in fields of a
3523    structure of type `coding_system'.
3524
3525    The value of the property `coding-system' can be a symbol of another
3526    subsidiary coding-system.  In that case, Emacs gets coding-vector
3527    from that symbol.
3528
3529    `element[0]' contains information to be set in `coding->type'.  The
3530    value and its meaning is as follows:
3531
3532    0 -- coding_type_emacs_mule
3533    1 -- coding_type_sjis
3534    2 -- coding_type_iso2022
3535    3 -- coding_type_big5
3536    4 -- coding_type_ccl encoder/decoder written in CCL
3537    nil -- coding_type_no_conversion
3538    t -- coding_type_undecided (automatic conversion on decoding,
3539                                no-conversion on encoding)
3540
3541    `element[4]' contains information to be set in `coding->flags' and
3542    `coding->spec'.  The meaning varies by `coding->type'.
3543
3544    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3545    of length 32 (of which the first 13 sub-elements are used now).
3546    Meanings of these sub-elements are:
3547
3548    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3549         If the value is an integer of valid charset, the charset is
3550         assumed to be designated to graphic register N initially.
3551
3552         If the value is minus, it is a minus value of charset which
3553         reserves graphic register N, which means that the charset is
3554         not designated initially but should be designated to graphic
3555         register N just before encoding a character in that charset.
3556
3557         If the value is nil, graphic register N is never used on
3558         encoding.
3559
3560    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3561         Each value takes t or nil.  See the section ISO2022 of
3562         `coding.h' for more information.
3563
3564    If `coding->type' is `coding_type_big5', element[4] is t to denote
3565    BIG5-ETen or nil to denote BIG5-HKU.
3566
3567    If `coding->type' takes the other value, element[4] is ignored.
3568
3569    Emacs Lisp's coding systems also carry information about format of
3570    end-of-line in a value of property `eol-type'.  If the value is
3571    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3572    means CODING_EOL_CR.  If it is not integer, it should be a vector
3573    of subsidiary coding systems of which property `eol-type' has one
3574    of the above values.
3575
3576 */
3577
3578 /* Extract information for decoding/encoding from the symbol CODING_SYSTEM
3579    and set it in CODING.  If CODING_SYSTEM is nil or invalid, CODING is
3580    setup so that no conversion is necessary.  Return -1 if CODING_SYSTEM
3581    is non-nil but invalid, else return 0.  */
3582
3583 int
3584 setup_coding_system (coding_system, coding)
3585      Lisp_Object coding_system;
3586      struct coding_system *coding;
3587 {
3588   Lisp_Object coding_spec, coding_type, eol_type, plist;
3589   Lisp_Object val;
3590
3591   /* At first, zero clear all members.  */
3592   bzero (coding, sizeof (struct coding_system));
3593
3594   /* Initialize some fields required for all kinds of coding systems.  */
3595   coding->symbol = coding_system;
3596   coding->heading_ascii = -1;
3597   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3598   coding->composing = COMPOSITION_DISABLED;
3599   coding->cmp_data = NULL;
3600
3601   if (NILP (coding_system))
3602     goto label_invalid_coding_system;  /* Not an error: 0 is returned for nil.  */
3603
3604   coding_spec = Fget (coding_system, Qcoding_system);
3605   /* The spec must be a vector of length 5 whose element[3] is a cons.  */
3606   if (!VECTORP (coding_spec)
3607       || XVECTOR (coding_spec)->size != 5
3608       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3609     goto label_invalid_coding_system;
3610   /* NOTE(review): XFASTINT below runs even if eol_type is nil -- confirm.  */
3611   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3612   if (VECTORP (eol_type))
3613     {
3614       coding->eol_type = CODING_EOL_UNDECIDED;
3615       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3616       if (system_eol_type != CODING_EOL_LF)
3617         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3618     }
3619   else if (XFASTINT (eol_type) == 1)
3620     {
3621       coding->eol_type = CODING_EOL_CRLF;
3622       coding->common_flags
3623         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3624     }
3625   else if (XFASTINT (eol_type) == 2)
3626     {
3627       coding->eol_type = CODING_EOL_CR;
3628       coding->common_flags
3629         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3630     }
3631   else
3632     {
3633       coding->common_flags = 0;
3634       coding->eol_type = CODING_EOL_LF;
3635     }
3636
3637   coding_type = XVECTOR (coding_spec)->contents[0];
3638   /* Try short cut: type t means undecided, any other symbol no-conversion.  */
3639   if (SYMBOLP (coding_type))
3640     {
3641       if (EQ (coding_type, Qt))
3642         {
3643           coding->type = coding_type_undecided;
3644           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3645         }
3646       else
3647         coding->type = coding_type_no_conversion;
3648       /* Initialize this member.  Anything other than
3649          CODING_CATEGORY_IDX_UTF_16_BE and
3650          CODING_CATEGORY_IDX_UTF_16_LE is ok because they have
3651          special treatment in detect_eol.  */
3652       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3653
3654       return 0;
3655     }
3656
3657   /* Get values of coding system properties:
3658      `post-read-conversion', `pre-write-conversion',
3659      `translation-table-for-decode', `translation-table-for-encode'.  */
3660   plist = XVECTOR (coding_spec)->contents[3];
3661   /* Pre & post conversion functions should be disabled if
3662      inhibit_pre_post_conversion is nonzero.  This is the case when
3663      a code conversion function is called while those are running.  */
3664   if (! inhibit_pre_post_conversion)
3665     {
3666       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3667       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3668     }
3669   val = Fplist_get (plist, Qtranslation_table_for_decode);
3670   if (SYMBOLP (val))
3671     val = Fget (val, Qtranslation_table_for_decode);
3672   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3673   val = Fplist_get (plist, Qtranslation_table_for_encode);
3674   if (SYMBOLP (val))
3675     val = Fget (val, Qtranslation_table_for_encode);
3676   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3677   val = Fplist_get (plist, Qcoding_category);  /* An integer category index is mandatory.  */
3678   if (!NILP (val))
3679     {
3680       val = Fget (val, Qcoding_category_index);
3681       if (INTEGERP (val))
3682         coding->category_idx = XINT (val);
3683       else
3684         goto label_invalid_coding_system;
3685     }
3686   else
3687     goto label_invalid_coding_system;
3688
3689   /* If the coding system has non-nil `composition' property, enable
3690      composition handling.  */
3691   val = Fplist_get (plist, Qcomposition);
3692   if (!NILP (val))
3693     coding->composing = COMPOSITION_NO;
3694
3695   /* If the coding system is ascii-incompatible, record it in
3696      common_flags.   */
3697   val = Fplist_get (plist, Qascii_incompatible);
3698   if (! NILP (val))
3699     coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3700
3701   switch (XFASTINT (coding_type))
3702     {
3703     case 0:
3704       coding->type = coding_type_emacs_mule;
3705       coding->common_flags
3706         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3707       if (!NILP (coding->post_read_conversion))  /* NOTE(review): both masks are already set above.  */
3708         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3709       if (!NILP (coding->pre_write_conversion))
3710         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3711       break;
3712
3713     case 1:
3714       coding->type = coding_type_sjis;
3715       coding->common_flags
3716         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3717       break;
3718
3719     case 2:
3720       coding->type = coding_type_iso2022;
3721       coding->common_flags
3722         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3723       {
3724         Lisp_Object val, temp;
3725         Lisp_Object *flags;
3726         int i, charset, reg_bits = 0;
3727
3728         val = XVECTOR (coding_spec)->contents[4];
3729
3730         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3731           goto label_invalid_coding_system;
3732
3733         flags = XVECTOR (val)->contents;  /* [0..3]: designations, [4..16]: option flags.  */
3734         coding->flags
3735           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3736              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3737              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3738              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3739              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3740              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3741              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3742              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3743              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3744              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3745              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3746              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3747              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3748              );
3749
3750         /* Invoke graphic register 0 to plane 0.  */
3751         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3752         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3753         CODING_SPEC_ISO_INVOCATION (coding, 1)
3754           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3755         /* Not single shifting at first.  */
3756         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3757         /* Beginning of buffer should also be regarded as bol. */
3758         CODING_SPEC_ISO_BOL (coding) = 1;
3759
3760         for (charset = 0; charset <= MAX_CHARSET; charset++)
3761           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3762         val = Vcharset_revision_alist;
3763         while (CONSP (val))
3764           {
3765             charset = get_charset_id (Fcar_safe (XCAR (val)));
3766             if (charset >= 0
3767                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3768                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3769               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3770             val = XCDR (val);
3771           }
3772
3773         /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
3774            FLAGS[REG] can be one of below:
3775                 integer CHARSET: CHARSET occupies register REG,
3776                 t: designate nothing to REG initially, but can be used
3777                   by any charsets,
3778                 list of integer, nil, or t: designate the first
3779                   element (if integer) to REG initially, the remaining
3780                   elements (if integer) are designated to REG on request,
3781                   if an element is t, REG can be used by any charsets,
3782                 nil: REG is never used.  */
3783         for (charset = 0; charset <= MAX_CHARSET; charset++)
3784           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3785             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3786         for (i = 0; i < 4; i++)
3787           {
3788             if ((INTEGERP (flags[i])
3789                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3790                 || (charset = get_charset_id (flags[i])) >= 0)
3791               {
3792                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3793                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3794               }
3795             else if (EQ (flags[i], Qt))
3796               {
3797                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3798                 reg_bits |= 1 << i;
3799                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3800               }
3801             else if (CONSP (flags[i]))
3802               {
3803                 Lisp_Object tail;
3804                 tail = flags[i];
3805
3806                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3807                 if ((INTEGERP (XCAR (tail))
3808                      && (charset = XINT (XCAR (tail)),
3809                          CHARSET_VALID_P (charset)))
3810                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3811                   {
3812                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3813                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3814                   }
3815                 else
3816                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3817                 tail = XCDR (tail);
3818                 while (CONSP (tail))
3819                   {
3820                     if ((INTEGERP (XCAR (tail))
3821                          && (charset = XINT (XCAR (tail)),
3822                              CHARSET_VALID_P (charset)))
3823                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3824                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3825                         = i;
3826                     else if (EQ (XCAR (tail), Qt))
3827                       reg_bits |= 1 << i;
3828                     tail = XCDR (tail);
3829                   }
3830               }
3831             else
3832               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3833
3834             CODING_SPEC_ISO_DESIGNATION (coding, i)
3835               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3836           }
3837
3838         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3839           {
3840             /* REG 1 can be used only by locking shift in 7-bit env.  */
3841             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3842               reg_bits &= ~2;
3843             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3844               /* Without any shifting, only REG 0 and 1 can be used.  */
3845               reg_bits &= 3;
3846           }
3847
3848         if (reg_bits)
3849           for (charset = 0; charset <= MAX_CHARSET; charset++)
3850             {
3851               if (CHARSET_DEFINED_P (charset)
3852                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3853                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3854                 {
3855                   /* There exist some default graphic registers to be
3856                      used by CHARSET.  */
3857
3858                   /* We had better avoid designating a charset of
3859                      CHARS96 to REG 0 as far as possible.  */
3860                   if (CHARSET_CHARS (charset) == 96)
3861                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3862                       = (reg_bits & 2
3863                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3864                   else
3865                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3866                       = (reg_bits & 1
3867                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3868                 }
3869             }
3870       }
3871       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3872       coding->spec.iso2022.last_invalid_designation_register = -1;
3873       break;
3874
3875     case 3:
3876       coding->type = coding_type_big5;
3877       coding->common_flags
3878         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3879       coding->flags
3880         = (NILP (XVECTOR (coding_spec)->contents[4])
3881            ? CODING_FLAG_BIG5_HKU
3882            : CODING_FLAG_BIG5_ETEN);
3883       break;
3884
3885     case 4:
3886       coding->type = coding_type_ccl;
3887       coding->common_flags
3888         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3889       {
3890         val = XVECTOR (coding_spec)->contents[4];  /* Cons of (DECODER . ENCODER) CCL programs.  */
3891         if (! CONSP (val)
3892             || setup_ccl_program (&(coding->spec.ccl.decoder),
3893                                   XCAR (val)) < 0
3894             || setup_ccl_program (&(coding->spec.ccl.encoder),
3895                                   XCDR (val)) < 0)
3896           goto label_invalid_coding_system;
3897
3898         bzero (coding->spec.ccl.valid_codes, 256);
3899         val = Fplist_get (plist, Qvalid_codes);  /* Elements: CODE or (START . END) within 0..255.  */
3900         if (CONSP (val))
3901           {
3902             Lisp_Object this;
3903
3904             for (; CONSP (val); val = XCDR (val))
3905               {
3906                 this = XCAR (val);
3907                 if (INTEGERP (this)
3908                     && XINT (this) >= 0 && XINT (this) < 256)
3909                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3910                 else if (CONSP (this)
3911                          && INTEGERP (XCAR (this))
3912                          && INTEGERP (XCDR (this)))
3913                   {
3914                     int start = XINT (XCAR (this));
3915                     int end = XINT (XCDR (this));
3916
3917                     if (start >= 0 && start <= end && end < 256)
3918                       while (start <= end)
3919                         coding->spec.ccl.valid_codes[start++] = 1;
3920                   }
3921               }
3922           }
3923       }
3924       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3925       coding->spec.ccl.cr_carryover = 0;
3926       coding->spec.ccl.eight_bit_carryover[0] = 0;
3927       break;
3928
3929     case 5:
3930       coding->type = coding_type_raw_text;
3931       break;
3932
3933     default:
3934       goto label_invalid_coding_system;
3935     }
3936   return 0;
3937
3938  label_invalid_coding_system:  /* Fall back to a no-conversion setup.  */
3939   coding->type = coding_type_no_conversion;
3940   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3941   coding->common_flags = 0;
3942   coding->eol_type = CODING_EOL_UNDECIDED;
3943   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3944   return NILP (coding_system) ? 0 : -1;  /* nil is not reported as an error.  */
3945 }
3946
3947 /* Free memory blocks allocated for storing composition information.  */
3948 /* CODING->cmp_data is set to NULL afterwards; no-op if it is already NULL.  */
3949 void
3950 coding_free_composition_data (coding)
3951      struct coding_system *coding;
3952 {
3953   struct composition_data *cmp_data = coding->cmp_data, *next;
3954
3955   if (!cmp_data)
3956     return;
3957   /* Memory blocks are chained.  At first, rewind to the first, then,
3958      free blocks one by one.  */
3959   while (cmp_data->prev)
3960     cmp_data = cmp_data->prev;
3961   while (cmp_data)
3962     {
3963       next = cmp_data->next;  /* Read the link before the block is freed.  */
3964       xfree (cmp_data);
3965       cmp_data = next;
3966     }
3967   coding->cmp_data = NULL;
3968 }
3969
3970 /* Set the `char_offset' member of CODING->cmp_data and of every block
3971    that follows it in the `next' chain to POS.  */
3972 /* Blocks reachable only through `prev' are not visited.  */
3973 void
3974 coding_adjust_composition_offset (coding, pos)
3975      struct coding_system *coding;
3976      int pos;
3977 {
3978   struct composition_data *cmp_data;
3979
3980   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3981     cmp_data->char_offset = pos;
3982 }
3983
3984 /* Set up CODING again as raw-text or, when CODING->eol_type is already
3985    decided, as the matching eol-type subsidiary of raw-text.  CODING
3986    should be setup for some coding system in advance.  Do nothing if
3987    CODING->type is already coding_type_raw_text.  */
3988
3989 void
3990 setup_raw_text_coding_system (coding)
3991      struct coding_system *coding;
3992 {
3993   if (coding->type != coding_type_raw_text)
3994     {
3995       coding->symbol = Qraw_text;
3996       coding->type = coding_type_raw_text;
3997       if (coding->eol_type != CODING_EOL_UNDECIDED)
3998         {
3999           Lisp_Object subsidiaries;
4000           subsidiaries = Fget (Qraw_text, Qeol_type);
4001
4002           if (VECTORP (subsidiaries)
4003               && XVECTOR (subsidiaries)->size == 3)
4004             coding->symbol
4005               = XVECTOR (subsidiaries)->contents[coding->eol_type];  /* NOTE(review): assumes eol_type is 0..2 -- confirm.  */
4006         }
4007       setup_coding_system (coding->symbol, coding);  /* The return value is ignored.  */
4008     }
4009   return;
4010 }
4011
4012 /* Emacs has a mechanism to automatically detect a coding system if it
4013    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4014    it's impossible to distinguish some coding systems accurately
4015    because they use the same range of codes.  So, at first, coding
4016    systems are categorized into the following categories:
4017
4018    o coding-category-emacs-mule
4019
4020         The category for a coding system which has the same code range
4021         as Emacs' internal format.  Assigned the coding-system (Lisp
4022         symbol) `emacs-mule' by default.
4023
4024    o coding-category-sjis
4025
4026         The category for a coding system which has the same code range
4027         as SJIS.  Assigned the coding-system (Lisp
4028         symbol) `japanese-shift-jis' by default.
4029
4030    o coding-category-iso-7
4031
4032         The category for a coding system which has the same code range
4033         as ISO2022 of 7-bit environment.  This doesn't use any locking
4034         shift and single shift functions.  This can encode/decode all
4035         charsets.  Assigned the coding-system (Lisp symbol)
4036         `iso-2022-7bit' by default.
4037
4038    o coding-category-iso-7-tight
4039
4040         Same as coding-category-iso-7 except that this can
4041         encode/decode only the specified charsets.
4042
4043    o coding-category-iso-8-1
4044
4045         The category for a coding system which has the same code range
4046         as ISO2022 of 8-bit environment and graphic plane 1 used only
4047         for DIMENSION1 charset.  This doesn't use any locking shift
4048         and single shift functions.  Assigned the coding-system (Lisp
4049         symbol) `iso-latin-1' by default.
4050
4051    o coding-category-iso-8-2
4052
4053         The category for a coding system which has the same code range
4054         as ISO2022 of 8-bit environment and graphic plane 1 used only
4055         for DIMENSION2 charset.  This doesn't use any locking shift
4056         and single shift functions.  Assigned the coding-system (Lisp
4057         symbol) `japanese-iso-8bit' by default.
4058
4059    o coding-category-iso-7-else
4060
4061         The category for a coding system which has the same code range
4062         as ISO2022 of 7-bit environment but uses locking shift or
4063         single shift functions.  Assigned the coding-system (Lisp
4064         symbol) `iso-2022-7bit-lock' by default.
4065
4066    o coding-category-iso-8-else
4067
4068         The category for a coding system which has the same code range
4069         as ISO2022 of 8-bit environment but uses locking shift or
4070         single shift functions.  Assigned the coding-system (Lisp
4071         symbol) `iso-2022-8bit-ss2' by default.
4072
4073    o coding-category-big5
4074
4075         The category for a coding system which has the same code range
4076         as BIG5.  Assigned the coding-system (Lisp symbol)
4077         `cn-big5' by default.
4078
4079    o coding-category-utf-8
4080
4081         The category for a coding system which has the same code range
4082         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4083         symbol) `utf-8' by default.
4084
4085    o coding-category-utf-16-be
4086
4087         The category for a coding system in which a text has an
4088         Unicode signature (cf. Unicode Standard) in the order of BIG
4089         endian at the head.  Assigned the coding-system (Lisp symbol)
4090         `utf-16-be' by default.
4091
4092    o coding-category-utf-16-le
4093
4094         The category for a coding system in which a text has an
4095         Unicode signature (cf. Unicode Standard) in the order of
4096         LITTLE endian at the head.  Assigned the coding-system (Lisp
4097         symbol) `utf-16-le' by default.
4098
4099    o coding-category-ccl
4100
4101         The category for a coding system of which encoder/decoder is
4102         written in CCL programs.  The default value is nil, i.e., no
4103         coding system is assigned.
4104
4105    o coding-category-binary
4106
4107         The category for a coding system not categorized in any of the
4108         above.  Assigned the coding-system (Lisp symbol)
4109         `no-conversion' by default.
4110
4111    Each of them is a Lisp symbol and the value is an actual
4112    `coding-system' (this is also a Lisp symbol) assigned by a user.
4113    What Emacs does actually is to detect a category of coding system.
4114    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4115    decide a single possible category, it selects a category of the
4116    highest priority.  Priorities of categories are also specified by a
4117    user in a Lisp variable `coding-category-list'.
4118
4119 */
4120
4121 static
4122 int ascii_skip_code[256];  /* Nonzero at [C]: byte C is skipped by detect_coding_mask's initial scan.  */
4123
4124 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4125    If it detects possible coding systems, return an integer in which
4126    appropriate flag bits are set.  Flag bits are defined by macros
4127    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4128    it should point the table `coding_priorities'.  In that case, only
4129    the flag bit for a coding system of the highest priority is set in
4130    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4131    range 0x80..0x9F are in multibyte form.
4132
4133    How many ASCII characters are at the head is returned as *SKIP.  */
4134
4135 static int
4136 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4137      unsigned char *source;
4138      int src_bytes, *priorities, *skip;
4139      int multibytep;
4140 {
4141   register unsigned char c;
4142   unsigned char *src = source, *src_end = source + src_bytes;
4143   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4144   int i;
4145   int null_byte_found;
4146   int latin_extra_code_state = 1;
4147
4148   /* At first, skip all ASCII characters and control characters except
4149      for three ISO2022 specific control characters.  */
4150   ascii_skip_code[ISO_CODE_SO] = 0;
4151   ascii_skip_code[ISO_CODE_SI] = 0;
4152   ascii_skip_code[ISO_CODE_ESC] = 0;
4153
4154  label_loop_detect_coding:
4155   null_byte_found = 0;
4156   /* We stop this loop before the last byte because it may be a NULL
4157      anchor byte.  */
4158   while (src < src_end - 1 && ascii_skip_code[*src])
4159     null_byte_found |= (! *src++);
4160   if (ascii_skip_code[*src])
4161     src++;
4162   else if (! null_byte_found)
4163     {
4164       unsigned char *p = src + 1;
4165       while (p < src_end - 1)
4166         null_byte_found |= (! *p++);
4167     }
4168   *skip = src - source;
4169
4170   if (src >= src_end)
4171     /* We found nothing other than ASCII (and NULL byte).  There's
4172        nothing to do.  */
4173     return 0;
4174
4175   c = *src;
4176   /* The text seems to be encoded in some multilingual coding system.
4177      Now, try to find in which coding system the text is encoded.  */
4178   if (! null_byte_found && c < 0x80)
4179     {
4180       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4181       /* C is an ISO2022 specific control code of C0.  */
4182       latin_extra_code_state = 1;
4183       mask = detect_coding_iso2022 (src, src_end, multibytep,
4184                                     &latin_extra_code_state);
4185       if (mask == 0)
4186         {
4187           /* No valid ISO2022 code follows C.  Try again.  */
4188           src++;
4189           if (c == ISO_CODE_ESC)
4190             ascii_skip_code[ISO_CODE_ESC] = 1;
4191           else
4192             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4193           goto label_loop_detect_coding;
4194         }
4195       if (priorities)
4196         {
4197           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4198             {
4199               if (mask & priorities[i])
4200                 return priorities[i];
4201             }
4202           return CODING_CATEGORY_MASK_RAW_TEXT;
4203         }
4204     }
4205   else
4206     {
4207       int try;
4208
4209       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4210         c = src[1] - 0x20;
4211
4212       if (null_byte_found)
4213         {
4214           try = (CODING_CATEGORY_MASK_UTF_16_BE
4215                  | CODING_CATEGORY_MASK_UTF_16_LE);
4216         }
4217       else if (c < 0xA0)
4218         {
4219           /* C is the first byte of SJIS character code,
4220              or a leading-code of Emacs' internal format (emacs-mule),
4221              or the first byte of UTF-16.  */
4222           try = (CODING_CATEGORY_MASK_SJIS
4223                  | CODING_CATEGORY_MASK_EMACS_MULE
4224                  | CODING_CATEGORY_MASK_UTF_16_BE
4225                  | CODING_CATEGORY_MASK_UTF_16_LE);
4226
4227           /* Or, if C is a special latin extra code,
4228              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4229              or is an ISO2022 control-sequence-introducer (CSI),
4230              we should also consider the possibility of ISO2022 codings.  */
4231           if ((latin_extra_code_state
4232                && VECTORP (Vlatin_extra_code_table)
4233                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4234               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4235               || (c == ISO_CODE_CSI
4236                   && (src < src_end
4237                       && (*src == ']'
4238                           || ((*src == '0' || *src == '1' || *src == '2')
4239                               && src + 1 < src_end
4240                               && src[1] == ']')))))
4241             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4242                     | CODING_CATEGORY_MASK_ISO_8BIT);
4243         }
4244       else
4245         /* C is a character of ISO2022 in graphic plane right,
4246            or a SJIS's 1-byte character code (i.e. JISX0201),
4247            or the first byte of BIG5's 2-byte code,
4248            or the first byte of UTF-8/16.  */
4249         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4250                | CODING_CATEGORY_MASK_ISO_8BIT
4251                | CODING_CATEGORY_MASK_SJIS
4252                | CODING_CATEGORY_MASK_BIG5
4253                | CODING_CATEGORY_MASK_UTF_8
4254                | CODING_CATEGORY_MASK_UTF_16_BE
4255                | CODING_CATEGORY_MASK_UTF_16_LE);
4256
4257       /* Or, we may have to consider the possibility of CCL.  */
4258       if (! null_byte_found
4259           && coding_system_table[CODING_CATEGORY_IDX_CCL]
4260           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4261               ->spec.ccl.valid_codes)[c])
4262         try |= CODING_CATEGORY_MASK_CCL;
4263
4264       mask = 0;
4265       if (priorities)
4266         {
4267           /* At first try detection with Latin extra codes not-allowed.
4268              If no proper coding system is found because of Latin extra
4269              codes, try detection with Latin extra codes allowed.  */
4270           latin_extra_code_state = 0;
4271         label_retry:
4272           utf16_examined_p = iso2022_examined_p = 0;
4273           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4274             {
4275               if (!iso2022_examined_p
4276                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4277                 {
4278                   mask |= detect_coding_iso2022 (src, src_end, multibytep,
4279                                                  &latin_extra_code_state);
4280                   iso2022_examined_p = 1;
4281                 }
4282               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4283                 mask |= detect_coding_sjis (src, src_end, multibytep);
4284               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4285                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4286               else if (!utf16_examined_p
4287                        && (priorities[i] & try &
4288                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4289                 {
4290                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4291                   utf16_examined_p = 1;
4292                 }
4293               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4294                 mask |= detect_coding_big5 (src, src_end, multibytep);
4295               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4296                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4297               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4298                 mask |= detect_coding_ccl (src, src_end, multibytep);
4299               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4300                 {
4301                   if (latin_extra_code_state == 1)
4302                     {
4303                       /* Detection of ISO-2022 based coding system
4304                          failed because of Latin extra codes.  Before
4305                          falling back to raw-text, try again with
4306                          Latin extra codes allowed.  */
4307                       latin_extra_code_state = 2;
4308                       try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
4309                              | CODING_CATEGORY_MASK_ISO_8BIT);
4310                       goto label_retry;
4311                     }
4312                   mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4313                 }
4314               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4315                 {
4316                   if (latin_extra_code_state == 1)
4317                     {
4318                       /* See the above comment.  */
4319                       latin_extra_code_state = 2;
4320                       try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
4321                              | CODING_CATEGORY_MASK_ISO_8BIT);
4322                       goto label_retry;
4323                     }
4324                   mask |= CODING_CATEGORY_MASK_BINARY;
4325                 }
4326               if (mask & priorities[i])
4327                 return priorities[i];
4328             }
4329           return CODING_CATEGORY_MASK_RAW_TEXT;
4330         }
4331       if (try & CODING_CATEGORY_MASK_ISO)
4332         mask |= detect_coding_iso2022 (src, src_end, multibytep,
4333                                        &latin_extra_code_state);
4334       if (try & CODING_CATEGORY_MASK_SJIS)
4335         mask |= detect_coding_sjis (src, src_end, multibytep);
4336       if (try & CODING_CATEGORY_MASK_BIG5)
4337         mask |= detect_coding_big5 (src, src_end, multibytep);
4338       if (try & CODING_CATEGORY_MASK_UTF_8)
4339         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4340       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4341         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4342       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4343         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4344       if (try & CODING_CATEGORY_MASK_CCL)
4345         mask |= detect_coding_ccl (src, src_end, multibytep);
4346     }
4347   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4348 }
4349
4350 /* Detect how the SRC_BYTES bytes of text at SRC are encoded and set up
4351    CODING for it.  If nothing is detected, only CODING->heading_ascii is set.  */
4352
4353 void
4354 detect_coding (coding, src, src_bytes)
4355      struct coding_system *coding;    /* In/out: set up anew on success.  */
4356      const unsigned char *src;        /* Text to examine.  */
4357      int src_bytes;                   /* Length of SRC in bytes.  */
4358 {
4359   unsigned int idx;             /* Position of the lowest set bit of MASK.  */
4360   int skip, mask;               /* SKIP is filled in by detect_coding_mask.  */
4361   Lisp_Object val;
4362
4363   val = Vcoding_category_list;  /* Overwritten below before it is read.  */
4364   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4365                              coding->src_multibyte);
4366   coding->heading_ascii = skip;
4367
4368   if (!mask) return;            /* No category was detected.  */
4369
4370   /* We found a single coding system of the highest priority in MASK.  */
4371   idx = 0;
4372   while (mask && ! (mask & 1)) mask >>= 1, idx++;       /* Locate the lowest set bit.  */
4373   if (! mask)                   /* NOTE(review): looks unreachable, MASK is nonzero here -- confirm.  */
4374     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4375
4376   val = find_symbol_value (XVECTOR (Vcoding_category_table)->contents[idx]);
4377
4378   if (coding->eol_type != CODING_EOL_UNDECIDED)         /* EOL format is already decided.  */
4379     {
4380       Lisp_Object tmp;
4381
4382       tmp = Fget (val, Qeol_type);
4383       if (VECTORP (tmp))        /* Use the element indexed by that EOL format.  */
4384         val = XVECTOR (tmp)->contents[coding->eol_type];
4385     }
4386
4387   /* Set up this new coding system while preserving some slots.  */
4388   {
4389     int src_multibyte = coding->src_multibyte;
4390     int dst_multibyte = coding->dst_multibyte;
4391
4392     setup_coding_system (val, coding);
4393     coding->src_multibyte = src_multibyte;
4394     coding->dst_multibyte = dst_multibyte;
4395     coding->heading_ascii = skip;       /* Stored again after setup_coding_system.  */
4396   }
4397 }
4398
4399 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4400    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4401    CODING_EOL_CR, CODING_EOL_INCONSISTENT, and CODING_EOL_UNDECIDED.
4402    At most MAX_EOL_CHECK_COUNT end-of-lines are examined.
4403    How many non-eol characters are at the head is returned as *SKIP.  */
4404
4405 #define MAX_EOL_CHECK_COUNT 3
4406
4407 static int
4408 detect_eol_type (source, src_bytes, skip)
4409      const unsigned char *source;
4410      int src_bytes, *skip;
4411 {
4412   const unsigned char *src = source, *src_end = src + src_bytes;
4413   unsigned char c;
4414   int total = 0;                /* How many end-of-lines are found so far.  */
4415   int eol_type = CODING_EOL_UNDECIDED;
4416   int this_eol_type;
4417
4418   *skip = 0;
4419
4420   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4421     {
4422       c = *src++;
4423       if (c == '\n' || c == '\r')
4424         {
4425           if (total == 0)       /* First end-of-line; offset 0 is a valid answer.  */
4426             *skip = src - 1 - source;
4427           total++;
4428           if (c == '\n')
4429             this_eol_type = CODING_EOL_LF;
4430           else if (src >= src_end || *src != '\n')
4431             this_eol_type = CODING_EOL_CR;
4432           else
4433             this_eol_type = CODING_EOL_CRLF, src++;     /* Step over the LF of CRLF.  */
4434
4435           if (eol_type == CODING_EOL_UNDECIDED)
4436             /* This is the first end-of-line.  */
4437             eol_type = this_eol_type;
4438           else if (eol_type != this_eol_type)
4439             {
4440               /* The found type is different from what was found before.  */
4441               eol_type = CODING_EOL_INCONSISTENT;
4442               break;
4443             }
4444         }
4445     }
4446
4447   if (total == 0)               /* No end-of-line seen: the whole text is non-eol.  */
4448     *skip = src_end - source;
4449   return eol_type;
4450 }
4451
4452 /* Like detect_eol_type, but detect EOL type in 2-octet
4453    big-endian/little-endian format for coding systems utf-16-be and
4454    utf-16-le.  BIG_ENDIAN_P nonzero means the first octet of each unit is the high one.  */
4455
4456 static int
4457 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4458      const unsigned char *source;
4459      int src_bytes, *skip, big_endian_p;
4460 {
4461   const unsigned char *src = source, *src_end = src + src_bytes;
4462   unsigned int c1, c2;
4463   int total = 0;                /* How many end-of-lines are found so far.  */
4464   int eol_type = CODING_EOL_UNDECIDED;
4465   int this_eol_type;
4466   int msb, lsb;                 /* Indices of the high and low octet within a unit.  */
4467
4468   if (big_endian_p)
4469     msb = 0, lsb = 1;
4470   else
4471     msb = 1, lsb = 0;
4472
4473   *skip = 0;
4474
4475   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4476     {
4477       c1 = (src[msb] << 8) | (src[lsb]);
4478       src += 2;
4479
4480       if (c1 == '\n' || c1 == '\r')
4481         {
4482           if (total == 0)       /* First end-of-line; offset 0 is a valid answer.  */
4483             *skip = src - 2 - source;
4484           total++;
4485           if (c1 == '\n')
4486             {
4487               this_eol_type = CODING_EOL_LF;
4488             }
4489           else
4490             {
4491               if ((src + 1) >= src_end)         /* No complete unit follows the CR.  */
4492                 {
4493                   this_eol_type = CODING_EOL_CR;
4494                 }
4495               else
4496                 {
4497                   c2 = (src[msb] << 8) | (src[lsb]);
4498                   if (c2 == '\n')
4499                     this_eol_type = CODING_EOL_CRLF, src += 2;
4500                   else
4501                     this_eol_type = CODING_EOL_CR;
4502                 }
4503             }
4504
4505           if (eol_type == CODING_EOL_UNDECIDED)
4506             /* This is the first end-of-line.  */
4507             eol_type = this_eol_type;
4508           else if (eol_type != this_eol_type)
4509             {
4510               /* The found type is different from what was found before.  */
4511               eol_type = CODING_EOL_INCONSISTENT;
4512               break;
4513             }
4514         }
4515     }
4516
4517   if (total == 0)               /* No end-of-line seen: the whole text is non-eol.  */
4518     *skip = src_end - source;
4519   return eol_type;
4520 }
4521
4522 /* Detect the end-of-line format of the SRC_BYTES bytes of text at SRC.
4523    If a format is found and the Qeol_type property of CODING->symbol is a
4524    three-element vector, set up *CODING anew from the matching element.  */
4525
4526 void
4527 detect_eol (coding, src, src_bytes)
4528      struct coding_system *coding;
4529      const unsigned char *src;
4530      int src_bytes;
4531 {
4532   Lisp_Object val;
4533   int skip;                     /* Head length reported by the detector.  */
4534   int eol_type;
4535
4536   switch (coding->category_idx) /* The two UTF-16 categories are scanned in 2-octet units.  */
4537     {
4538     case CODING_CATEGORY_IDX_UTF_16_BE:
4539       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4540       break;
4541     case CODING_CATEGORY_IDX_UTF_16_LE:
4542       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4543       break;
4544     default:
4545       eol_type = detect_eol_type (src, src_bytes, &skip);
4546       break;
4547     }
4548
4549   if (coding->heading_ascii > skip)     /* Both end up as the smaller of the two values.  */
4550     coding->heading_ascii = skip;
4551   else
4552     skip = coding->heading_ascii;
4553
4554   if (eol_type == CODING_EOL_UNDECIDED)
4555     return;
4556   if (eol_type == CODING_EOL_INCONSISTENT)
4557     {
4558 #if 0
4559       /* This code is suppressed until we find a better way to
4560          distinguish raw text file and binary file.  */
4561
4562       /* If we have already detected that the coding is raw-text, the
4563          coding should actually be no-conversion.  */
4564       if (coding->type == coding_type_raw_text)
4565         {
4566           setup_coding_system (Qno_conversion, coding);
4567           return;
4568         }
4569       /* Else, let's decode only text code anyway.  */
4570 #endif /* 0 */
4571       eol_type = CODING_EOL_LF;         /* Inconsistent formats are handled as LF.  */
4572     }
4573
4574   val = Fget (coding->symbol, Qeol_type);
4575   if (VECTORP (val) && XVECTOR (val)->size == 3)
4576     {
4577       int src_multibyte = coding->src_multibyte;        /* Slots restored after setup_coding_system.  */
4578       int dst_multibyte = coding->dst_multibyte;
4579       struct composition_data *cmp_data = coding->cmp_data;
4580
4581       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4582       coding->src_multibyte = src_multibyte;
4583       coding->dst_multibyte = dst_multibyte;
4584       coding->heading_ascii = skip;
4585       coding->cmp_data = cmp_data;
4586     }
4587 }
4588
4589 #define CONVERSION_BUFFER_EXTRA_ROOM 256        /* Bytes added on top of each buffer size estimate.  */
4590
4591 #define DECODING_BUFFER_MAG(coding) /* Size factor for decoding.  */ \
4592   ((coding)->type == coding_type_iso2022                \
4593    ? 3                                                  \
4594    : ((coding)->type == coding_type_ccl                 \
4595       ? (coding)->spec.ccl.decoder.buf_magnification    \
4596       : 2))
4597
4598 /* Return the size in bytes of a buffer large enough for decoding
4599    SRC_BYTES of text encoded in CODING.  */
4600
4601 int
4602 decoding_buffer_size (coding, src_bytes)
4603      struct coding_system *coding;    /* Its type selects the size factor.  */
4604      int src_bytes;                   /* Number of bytes to be decoded.  */
4605 {
4606   return (src_bytes * DECODING_BUFFER_MAG (coding)
4607           + CONVERSION_BUFFER_EXTRA_ROOM);
4608 }
4609
4610 /* Return the size in bytes of a buffer large enough for encoding
4611    SRC_BYTES of text to CODING.  */
4612
4613 int
4614 encoding_buffer_size (coding, src_bytes)
4615      struct coding_system *coding;    /* Its type and eol_type select the size factor.  */
4616      int src_bytes;                   /* Number of bytes to be encoded.  */
4617 {
4618   int magnification;            /* Multiplier applied to SRC_BYTES.  */
4619
4620   if (coding->type == coding_type_ccl)
4621     {
4622       magnification = coding->spec.ccl.encoder.buf_magnification;
4623       if (coding->eol_type == CODING_EOL_CRLF)
4624         magnification *= 2;     /* Doubled when the EOL format is CRLF.  */
4625     }
4626   else if (CODING_REQUIRE_ENCODING (coding))
4627     magnification = 3;
4628   else
4629     magnification = 1;
4630
4631   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4632 }
4633
4634 /* Working buffer for code conversion.  */
4635 struct conversion_buffer
4636 {
4637   int size;                     /* Number of bytes allocated for DATA.  */
4638   int on_stack;                 /* 1 if DATA was allocated by alloca, else 0.  */
4639   unsigned char *data;          /* The buffer memory itself.  */
4640 };
4641
4642 /* Allocate LEN bytes for BUF (a struct conversion_buffer lvalue): alloca if LEN < MAX_ALLOCA, else xmalloc.  */
4643 #define allocate_conversion_buffer(buf, len)            \
4644   do {                                                  \
4645     if ((len) < MAX_ALLOCA)                             \
4646       {                                                 \
4647         (buf).data = (unsigned char *) alloca (len);    \
4648         (buf).on_stack = 1;                             \
4649       }                                                 \
4650     else                                                \
4651       {                                                 \
4652         (buf).data = (unsigned char *) xmalloc (len);   \
4653         (buf).on_stack = 0;                             \
4654       }                                                 \
4655     (buf).size = (len);                                 \
4656   } while (0)
4657
4658 /* Double the allocated memory for *BUF; a buffer on the stack is copied to xmalloc'ed memory.  */
4659 static void
4660 extend_conversion_buffer (buf)
4661      struct conversion_buffer *buf;
4662 {
4663   if (buf->on_stack)
4664     {
4665       unsigned char *save = buf->data;  /* The old block, which is not freed here.  */
4666       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4667       bcopy (save, buf->data, buf->size);       /* Carry the old contents over.  */
4668       buf->on_stack = 0;        /* From now on free_conversion_buffer must xfree it.  */
4669     }
4670   else
4671     {
4672       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4673     }
4674   buf->size *= 2;
4675 }
4676
4677 /* Free the allocated memory for BUF if it is not on the stack; otherwise do nothing.  */
4678 static void
4679 free_conversion_buffer (buf)
4680      struct conversion_buffer *buf;
4681 {
4682   if (!buf->on_stack)
4683     xfree (buf->data);
4684 }
4685
4686 int     /* Run the CCL encoder or decoder of CODING on SOURCE; return CODING->result.  */
4687 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4688      struct coding_system *coding;            /* Supplies the CCL programs; receives counts and result.  */
4689      unsigned char *source, *destination;     /* Input text and output buffer.  */
4690      int src_bytes, dst_bytes, encodep;       /* ENCODEP nonzero selects the encoder, zero the decoder.  */
4691 {
4692   struct ccl_program *ccl
4693     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4694   unsigned char *dst = destination;     /* Where ccl_driver starts writing.  */
4695
4696   ccl->suppress_error = coding->suppress_error;
4697   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4698   if (encodep)
4699     {
4700       /* On encoding, EOL format is converted within ccl_driver.  For
4701          that, set up proper information in the structure CCL.  */
4702       ccl->eol_type = coding->eol_type;
4703       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4704         ccl->eol_type = CODING_EOL_LF;
4705       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4706       ccl->eight_bit_control = coding->dst_multibyte;
4707     }
4708   else
4709     ccl->eight_bit_control = 1;
4710   ccl->multibyte = coding->src_multibyte;
4711   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4712     {
4713       /* Move carryover bytes to DESTINATION.  */
4714       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4715       while (*p)                /* The carryover is terminated by a zero byte.  */
4716         *dst++ = *p++;
4717       coding->spec.ccl.eight_bit_carryover[0] = 0;
4718       if (dst_bytes)            /* The moved bytes use up part of DST_BYTES.  */
4719         dst_bytes -= dst - destination;
4720     }
4721
4722   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4723                                   &(coding->consumed))
4724                       + dst - destination);     /* Count the moved carryover bytes, too.  */
4725
4726   if (encodep)
4727     {
4728       coding->produced_char = coding->produced; /* One character per produced byte.  */
4729       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4730     }
4731   else if (!ccl->eight_bit_control)     /* The flag set to 1 above is zero after ccl_driver.  */
4732     {
4733       /* The produced bytes form a valid multibyte sequence. */
4734       coding->produced_char
4735         = multibyte_chars_in_text (destination, coding->produced);
4736       coding->spec.ccl.eight_bit_carryover[0] = 0;
4737     }
4738   else
4739     {
4740       /* On decoding, the destination should always be multibyte.  But,
4741          CCL program might have generated an invalid multibyte
4742          sequence.  Here we make such a sequence valid as
4743          multibyte.  */
4744       int bytes
4745         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4746
4747       if ((coding->consumed < src_bytes
4748            || !ccl->last_block)
4749           && coding->produced >= 1
4750           && destination[coding->produced - 1] >= 0x80)
4751         {
4752           /* We should not convert the trailing 8-bit codes to
4753              multibyte form even if they don't form a valid
4754              multibyte sequence.  They may form a valid sequence in
4755              the next call.  */
4756           int carryover = 0;    /* Number of trailing bytes to hold back (0 to 3).  */
4757
4758           if (destination[coding->produced - 1] < 0xA0) /* Last byte is in 0x80..0x9F.  */
4759             carryover = 1;
4760           else if (coding->produced >= 2)
4761             {
4762               if (destination[coding->produced - 2] >= 0x80)
4763                 {
4764                   if (destination[coding->produced - 2] < 0xA0)
4765                     carryover = 2;
4766                   else if (coding->produced >= 3
4767                            && destination[coding->produced - 3] >= 0x80
4768                            && destination[coding->produced - 3] < 0xA0)
4769                     carryover = 3;
4770                 }
4771             }
4772           if (carryover > 0)
4773             {
4774               BCOPY_SHORT (destination + coding->produced - carryover,
4775                            coding->spec.ccl.eight_bit_carryover,
4776                            carryover);
4777               coding->spec.ccl.eight_bit_carryover[carryover] = 0;      /* Zero terminator read back on the next call.  */
4778               coding->produced -= carryover;
4779             }
4780         }
4781       coding->produced = str_as_multibyte (destination, bytes,
4782                                            coding->produced,
4783                                            &(coding->produced_char));
4784     }
4785
4786   switch (ccl->status)          /* Translate the CCL status into a CODING_FINISH_* result.  */
4787     {
4788     case CCL_STAT_SUSPEND_BY_SRC:
4789       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4790       break;
4791     case CCL_STAT_SUSPEND_BY_DST:
4792       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4793       break;
4794     case CCL_STAT_QUIT:
4795     case CCL_STAT_INVALID_CMD:
4796       coding->result = CODING_FINISH_INTERRUPT;
4797       break;
4798     default:
4799       coding->result = CODING_FINISH_NORMAL;
4800       break;
4801     }
4802   return coding->result;
4803 }
4804
4805 /* Decode EOL format of the text at PTR of BYTES length destructively
4806    according to CODING->eol_type.  This is called after the CCL
4807    program produced a decoded text at PTR.  If we do CRLF->LF
4808    conversion, update CODING->produced and CODING->produced_char.  */
4809
4810 static void
4811 decode_eol_post_ccl (coding, ptr, bytes)
4812      struct coding_system *coding;
4813      unsigned char *ptr;
4814      int bytes;
4815 {
4816   Lisp_Object val, saved_coding_symbol;
4817   unsigned char *pend = ptr + bytes;
4818   int dummy;                    /* Receives the unused *SKIP of detect_eol_type.  */
4819
4820   /* Remember the current coding system symbol.  We set it back when
4821      an inconsistent EOL is found so that `last-coding-system-used' is
4822      set to the coding system that doesn't specify EOL conversion.  */
4823   saved_coding_symbol = coding->symbol;
4824
4825   coding->spec.ccl.cr_carryover = 0;
4826   if (coding->eol_type == CODING_EOL_UNDECIDED)
4827     {
4828       /* Here, to avoid the call of setup_coding_system, we directly
4829          call detect_eol_type.  */
4830       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4831       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4832         coding->eol_type = CODING_EOL_LF;
4833       if (coding->eol_type != CODING_EOL_UNDECIDED)
4834         {
4835           val = Fget (coding->symbol, Qeol_type);
4836           if (VECTORP (val) && XVECTOR (val)->size == 3)
4837             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4838         }
4839       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4840     }
4841
4842   if (coding->eol_type == CODING_EOL_LF
4843       || coding->eol_type == CODING_EOL_UNDECIDED)
4844     {
4845       /* We have nothing to do.  */
4846       ptr = pend;
4847     }
4848   else if (coding->eol_type == CODING_EOL_CRLF)
4849     {
4850       unsigned char *pstart = ptr, *p = ptr;    /* P is the write position, PTR the read position.  */
4851
4852       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4853           && ptr < pend && *(pend - 1) == '\r') /* PTR < PEND: never read before an empty text.  */
4854         {
4855           /* If the last character is CR, we can't handle it here
4856              because LF will be in the not-yet-decoded source text.
4857              Record that the CR is not yet processed.  */
4858           coding->spec.ccl.cr_carryover = 1;
4859           coding->produced--;
4860           coding->produced_char--;
4861           pend--;
4862         }
4863       while (ptr < pend)
4864         {
4865           if (*ptr == '\r')
4866             {
4867               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4868                 {
4869                   *p++ = '\n';
4870                   ptr += 2;
4871                 }
4872               else
4873                 {
4874                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4875                     goto undo_eol_conversion;
4876                   *p++ = *ptr++;
4877                 }
4878             }
4879           else if (*ptr == '\n'
4880                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4881             goto undo_eol_conversion;
4882           else
4883             *p++ = *ptr++;
4884           continue;
4885
4886         undo_eol_conversion:
4887           /* We have met an inconsistent EOL format at PTR.
4888              Convert all LFs before PTR back to CRLFs.  */
4889           for (p--, ptr--; p >= pstart; p--)    /* Walk backwards, refilling the text before PTR.  */
4890             {
4891               if (*p == '\n')
4892                 *ptr-- = '\n', *ptr-- = '\r';
4893               else
4894                 *ptr-- = *p;
4895             }
4896           /*  If carryover is recorded, cancel it because we don't
4897               convert CRLF anymore.  */
4898           if (coding->spec.ccl.cr_carryover)
4899             {
4900               coding->spec.ccl.cr_carryover = 0;
4901               coding->produced++;
4902               coding->produced_char++;
4903               pend++;
4904             }
4905           p = ptr = pend;       /* Ends the loop and makes the adjustment below a no-op.  */
4906           coding->eol_type = CODING_EOL_LF;
4907           coding->symbol = saved_coding_symbol;
4908         }
4909       if (p < pend)
4910         {
4911           /* As each two-byte sequence CRLF was converted to LF, (PEND
4912              - P) is the number of deleted characters.  */
4913           coding->produced -= pend - p;
4914           coding->produced_char -= pend - p;
4915         }
4916     }
4917   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4918     {
4919       unsigned char *p = ptr;   /* Start of the text, kept for undoing the conversion.  */
4920
4921       for (; ptr < pend; ptr++)
4922         {
4923           if (*ptr == '\r')
4924             *ptr = '\n';
4925           else if (*ptr == '\n'
4926                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4927             {
4928               for (; p < ptr; p++)      /* Turn the LFs before PTR back into CRs.  */
4929                 {
4930                   if (*p == '\n')
4931                     *p = '\r';
4932                 }
4933               ptr = pend;       /* Ends the outer loop.  */
4934               coding->eol_type = CODING_EOL_LF;
4935               coding->symbol = saved_coding_symbol;
4936             }
4937         }
4938     }
4939 }
4940
4941 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4942    decoding, it may detect coding system and format of end-of-line if
4943    those are not yet decided.  The source should be unibyte, the
4944    result is multibyte if CODING->dst_multibyte is nonzero, else
4945    unibyte.  The return value is CODING->result.  */
4946
4947 int
4948 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4949      struct coding_system *coding;
4950      const unsigned char *source;
4951      unsigned char *destination;
4952      int src_bytes, dst_bytes;
4953 {
4954   int extra = 0;                /* Nonzero if a carried-over CR was put at DESTINATION[0].  */
4955
4956   if (coding->type == coding_type_undecided)
4957     detect_coding (coding, source, src_bytes);
4958
4959   if (coding->eol_type == CODING_EOL_UNDECIDED
4960       && coding->type != coding_type_ccl)       /* For CCL this is left to decode_eol_post_ccl below.  */
4961     {
4962       detect_eol (coding, source, src_bytes);
4963       /* We had better recover the original eol format if we
4964          encounter an inconsistent eol format while decoding.  */
4965       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4966     }
4967
4968   coding->produced = coding->produced_char = 0;
4969   coding->consumed = coding->consumed_char = 0;
4970   coding->errors = 0;
4971   coding->result = CODING_FINISH_NORMAL;
4972
4973   switch (coding->type)         /* Dispatch to the decoder for the coding type.  */
4974     {
4975     case coding_type_sjis:
4976       decode_coding_sjis_big5 (coding, source, destination,
4977                                src_bytes, dst_bytes, 1);        /* Last argument 1 for SJIS.  */
4978       break;
4979
4980     case coding_type_iso2022:
4981       decode_coding_iso2022 (coding, source, destination,
4982                              src_bytes, dst_bytes);
4983       break;
4984
4985     case coding_type_big5:
4986       decode_coding_sjis_big5 (coding, source, destination,
4987                                src_bytes, dst_bytes, 0);        /* Last argument 0 for BIG5.  */
4988       break;
4989
4990     case coding_type_emacs_mule:
4991       decode_coding_emacs_mule (coding, source, destination,
4992                                 src_bytes, dst_bytes);
4993       break;
4994
4995     case coding_type_ccl:
4996       if (coding->spec.ccl.cr_carryover)
4997         {
4998           /* Put the CR which was not processed by the previous call
4999              of decode_eol_post_ccl in DESTINATION.  It will be
5000              decoded together with the following LF by the call to
5001              decode_eol_post_ccl below.  */
5002           *destination = '\r';
5003           coding->produced++;
5004           coding->produced_char++;
5005           dst_bytes--;
5006           extra = coding->spec.ccl.cr_carryover;
5007         }
5008       ccl_coding_driver (coding, source, destination + extra,
5009                          src_bytes, dst_bytes, 0);
5010       if (coding->eol_type != CODING_EOL_LF)
5011         {
5012           coding->produced += extra;    /* Count the CR placed in front of the CCL output.  */
5013           coding->produced_char += extra;
5014           decode_eol_post_ccl (coding, destination, coding->produced);
5015         }
5016       break;
5017
5018     default:                    /* Every other type is handed to decode_eol.  */
5019       decode_eol (coding, source, destination, src_bytes, dst_bytes);
5020     }
5021
5022   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5023       && coding->mode & CODING_MODE_LAST_BLOCK
5024       && coding->consumed == src_bytes) /* The whole last block was consumed after all.  */
5025     coding->result = CODING_FINISH_NORMAL;
5026
5027   if (coding->mode & CODING_MODE_LAST_BLOCK
5028       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5029     {
5030       const unsigned char *src = source + coding->consumed;     /* First unconsumed source byte.  */
5031       unsigned char *dst = destination + coding->produced;      /* End of the output so far.  */
5032
5033       src_bytes -= coding->consumed;    /* Now the number of leftover bytes.  */
5034       coding->errors++;
5035       if (COMPOSING_P (coding))
5036         DECODE_COMPOSITION_END ('1');
5037       while (src_bytes--)       /* Output each leftover byte as one character.  */
5038         {
5039           int c = *src++;
5040           dst += CHAR_STRING (c, dst);
5041           coding->produced_char++;
5042         }
5043       coding->consumed = coding->consumed_char = src - source;
5044       coding->produced = dst - destination;
5045       coding->result = CODING_FINISH_NORMAL;
5046     }
5047
5048   if (!coding->dst_multibyte)   /* The caller wants unibyte output.  */
5049     {
5050       coding->produced = str_as_unibyte (destination, coding->produced);
5051       coding->produced_char = coding->produced;
5052     }
5053
5054   return coding->result;
5055 }
5056
5057 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
5058    multibyteness of the source is CODING->src_multibyte, the
5059    multibyteness of the result is always unibyte.  The return value is CODING->result.  */
5060
5061 int
5062 encode_coding (coding, source, destination, src_bytes, dst_bytes)
5063      struct coding_system *coding;
5064      const unsigned char *source;
5065      unsigned char *destination;
5066      int src_bytes, dst_bytes;
5067 {
5068   coding->produced = coding->produced_char = 0;
5069   coding->consumed = coding->consumed_char = 0;
5070   coding->errors = 0;
5071   coding->result = CODING_FINISH_NORMAL;
5072   if (coding->eol_type == CODING_EOL_UNDECIDED) /* An undecided EOL format is encoded as LF.  */
5073     coding->eol_type = CODING_EOL_LF;
5074
5075   switch (coding->type)         /* Dispatch to the encoder for the coding type.  */
5076     {
5077     case coding_type_sjis:
5078       encode_coding_sjis_big5 (coding, source, destination,
5079                                src_bytes, dst_bytes, 1);        /* Last argument 1 for SJIS.  */
5080       break;
5081
5082     case coding_type_iso2022:
5083       encode_coding_iso2022 (coding, source, destination,
5084                              src_bytes, dst_bytes);
5085       break;
5086
5087     case coding_type_big5:
5088       encode_coding_sjis_big5 (coding, source, destination,
5089                                src_bytes, dst_bytes, 0);        /* Last argument 0 for BIG5.  */
5090       break;
5091
5092     case coding_type_emacs_mule:
5093       encode_coding_emacs_mule (coding, source, destination,
5094                                 src_bytes, dst_bytes);
5095       break;
5096
5097     case coding_type_ccl:
5098       ccl_coding_driver (coding, source, destination,
5099                          src_bytes, dst_bytes, 1);      /* Last argument 1 selects the CCL encoder.  */
5100       break;
5101
5102     default:                    /* Every other type is handed to encode_eol.  */
5103       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5104     }
5105
5106   if (coding->mode & CODING_MODE_LAST_BLOCK
5107       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5108     {
5109       const unsigned char *src = source + coding->consumed;     /* First unconsumed source byte.  */
5110       unsigned char *dst = destination + coding->produced;      /* End of the output so far.  */
5111
5112       if (coding->type == coding_type_iso2022)
5113         ENCODE_RESET_PLANE_AND_REGISTER;
5114       if (COMPOSING_P (coding))
5115         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5116       if (coding->consumed < src_bytes)
5117         {
5118           int len = src_bytes - coding->consumed;       /* Number of leftover source bytes.  */
5119
5120           BCOPY_SHORT (src, dst, len);  /* Copy the leftover bytes without encoding them.  */
5121           if (coding->src_multibyte)
5122             len = str_as_unibyte (dst, len);
5123           dst += len;
5124           coding->consumed = src_bytes;
5125         }
5126       coding->produced = coding->produced_char = dst - destination;
5127       coding->result = CODING_FINISH_NORMAL;
5128     }
5129
5130   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5131       && coding->consumed == src_bytes) /* Everything was consumed after all.  */
5132     coding->result = CODING_FINISH_NORMAL;
5133
5134   return coding->result;
5135 }
5136
5137 /* Scan text in the region between *BEG and *END (byte positions),
5138    skip characters which we don't have to decode by coding system
5139    CODING at the head and tail, then set *BEG and *END to the region
5140    of the text we actually have to convert.  The caller should move
5141    the gap out of the region in advance if the region is from a
5142    buffer.
5143
5144    If STR is not NULL, *BEG and *END are indices into STR.  */
5145
5146 static void
5147 shrink_decoding_region (beg, end, coding, str)
5148      int *beg, *end;
5149      struct coding_system *coding;
5150      unsigned char *str;
5151 {
5152   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5153   int eol_conversion;
5154   Lisp_Object translation_table;
5155
5156   if (coding->type == coding_type_ccl
5157       || coding->type == coding_type_undecided
5158       || coding->eol_type != CODING_EOL_LF
5159       || !NILP (coding->post_read_conversion)
5160       || coding->composing != COMPOSITION_DISABLED)
5161     {
5162       /* We can't skip any data.  */
5163       return;
5164     }
5165   if (coding->type == coding_type_no_conversion
5166       || coding->type == coding_type_raw_text
5167       || coding->type == coding_type_emacs_mule)
5168     {
5169       /* We need no conversion, but don't have to skip any data here.
5170          Decoding routine handles them effectively anyway.  */
5171       return;
5172     }
5173
5174   translation_table = coding->translation_table_for_decode;
5175   if (NILP (translation_table) && !NILP (Venable_character_translation))
5176     translation_table = Vstandard_translation_table_for_decode;
5177   if (CHAR_TABLE_P (translation_table))
5178     {
5179       int i;
5180       for (i = 0; i < 128; i++)
5181         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5182           break;
5183       if (i < 128)
5184         /* Some ASCII character should be translated.  We give up
5185            shrinking.  */
5186         return;
5187     }
5188
5189   if (coding->heading_ascii >= 0)
5190     /* Detection routine has already found how much we can skip at the
5191        head.  */
5192     *beg += coding->heading_ascii;
5193
5194   if (str)
5195     {
5196       begp_orig = begp = str + *beg;
5197       endp_orig = endp = str + *end;
5198     }
5199   else
5200     {
5201       begp_orig = begp = BYTE_POS_ADDR (*beg);
5202       endp_orig = endp = begp + *end - *beg;
5203     }
5204
5205   eol_conversion = (coding->eol_type == CODING_EOL_CR
5206                     || coding->eol_type == CODING_EOL_CRLF);
5207
5208   switch (coding->type)
5209     {
5210     case coding_type_sjis:
5211     case coding_type_big5:
5212       /* We can skip all ASCII characters at the head.  */
5213       if (coding->heading_ascii < 0)
5214         {
5215           if (eol_conversion)
5216             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5217           else
5218             while (begp < endp && *begp < 0x80) begp++;
5219         }
5220       /* We can skip all ASCII characters at the tail except for the
5221          second byte of SJIS or BIG5 code.  */
5222       if (eol_conversion)
5223         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5224       else
5225         while (begp < endp && endp[-1] < 0x80) endp--;
5226       /* Do not consider LF as ascii if preceded by CR, since that
5227          confuses eol decoding. */
5228       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5229         endp++;
5230       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5231         endp++;
5232       break;
5233
5234     case coding_type_iso2022:
5235       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5236         /* We can't skip any data.  */
5237         break;
5238       if (coding->heading_ascii < 0)
5239         {
5240           /* We can skip all ASCII characters at the head except for a
5241              few control codes.  */
5242           while (begp < endp && (c = *begp) < 0x80
5243                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5244                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5245                  && (!eol_conversion || c != ISO_CODE_LF))
5246             begp++;
5247         }
5248       switch (coding->category_idx)
5249         {
5250         case CODING_CATEGORY_IDX_ISO_8_1:
5251         case CODING_CATEGORY_IDX_ISO_8_2:
5252           /* We can skip all ASCII characters at the tail.  */
5253           if (eol_conversion)
5254             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5255           else
5256             while (begp < endp && endp[-1] < 0x80) endp--;
5257           /* Do not consider LF as ascii if preceded by CR, since that
5258              confuses eol decoding. */
5259           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5260             endp++;
5261           break;
5262
5263         case CODING_CATEGORY_IDX_ISO_7:
5264         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5265           {
5266             /* We can skip all characters at the tail except for 8-bit
5267                codes and ESC and the following 2-byte at the tail.  */
5268             unsigned char *eight_bit = NULL;
5269
5270             if (eol_conversion)
5271               while (begp < endp
5272                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5273                 {
5274                   if (!eight_bit && c & 0x80) eight_bit = endp;
5275                   endp--;
5276                 }
5277             else
5278               while (begp < endp
5279                      && (c = endp[-1]) != ISO_CODE_ESC)
5280                 {
5281                   if (!eight_bit && c & 0x80) eight_bit = endp;
5282                   endp--;
5283                 }
5284             /* Do not consider LF as ascii if preceded by CR, since that
5285                confuses eol decoding. */
5286             if (begp < endp && endp < endp_orig
5287                 && endp[-1] == '\r' && endp[0] == '\n')
5288               endp++;
5289             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5290               {
5291                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5292                   /* This is an ASCII designation sequence.  We can
5293                      surely skip the tail.  But, if we have
5294                      encountered an 8-bit code, skip only the codes
5295                      after that.  */
5296                   endp = eight_bit ? eight_bit : endp + 2;
5297                 else
5298                   /* Hmmm, we can't skip the tail.  */
5299                   endp = endp_orig;
5300               }
5301             else if (eight_bit)
5302               endp = eight_bit;
5303           }
5304         }
5305       break;
5306
5307     default:
5308       abort ();
5309     }
5310   *beg += begp - begp_orig;
5311   *end += endp - endp_orig;
5312   return;
5313 }
5314
5315 /* Like shrink_decoding_region but for encoding.  */
5316
5317 static void
5318 shrink_encoding_region (beg, end, coding, str)
5319      int *beg, *end;
5320      struct coding_system *coding;
5321      unsigned char *str;
5322 {
5323   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5324   int eol_conversion;
5325   Lisp_Object translation_table;
5326
5327   if (coding->type == coding_type_ccl
5328       || coding->eol_type == CODING_EOL_CRLF
5329       || coding->eol_type == CODING_EOL_CR
5330       || (coding->cmp_data && coding->cmp_data->used > 0))
5331     {
5332       /* We can't skip any data.  */
5333       return;
5334     }
5335   if (coding->type == coding_type_no_conversion
5336       || coding->type == coding_type_raw_text
5337       || coding->type == coding_type_emacs_mule
5338       || coding->type == coding_type_undecided)
5339     {
5340       /* We need no conversion, but don't have to skip any data here.
5341          Encoding routine handles them effectively anyway.  */
5342       return;
5343     }
5344
5345   translation_table = coding->translation_table_for_encode;
5346   if (NILP (translation_table) && !NILP (Venable_character_translation))
5347     translation_table = Vstandard_translation_table_for_encode;
5348   if (CHAR_TABLE_P (translation_table))
5349     {
5350       int i;
5351       for (i = 0; i < 128; i++)
5352         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5353           break;
5354       if (i < 128)
5355         /* Some ASCII character should be translated.  We give up
5356            shrinking.  */
5357         return;
5358     }
5359
5360   if (str)
5361     {
5362       begp_orig = begp = str + *beg;
5363       endp_orig = endp = str + *end;
5364     }
5365   else
5366     {
5367       begp_orig = begp = BYTE_POS_ADDR (*beg);
5368       endp_orig = endp = begp + *end - *beg;
5369     }
5370
5371   eol_conversion = (coding->eol_type == CODING_EOL_CR
5372                     || coding->eol_type == CODING_EOL_CRLF);
5373
5374   /* Here, we don't have to check coding->pre_write_conversion because
5375      the caller is expected to have handled it already.  */
5376   switch (coding->type)
5377     {
5378     case coding_type_iso2022:
5379       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5380         /* We can't skip any data.  */
5381         break;
5382       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5383         {
5384           unsigned char *bol = begp;
5385           while (begp < endp && *begp < 0x80)
5386             {
5387               begp++;
5388               if (begp[-1] == '\n')
5389                 bol = begp;
5390             }
5391           begp = bol;
5392           goto label_skip_tail;
5393         }
5394       /* fall down ... */
5395
5396     case coding_type_sjis:
5397     case coding_type_big5:
5398       /* We can skip all ASCII characters at the head and tail.  */
5399       if (eol_conversion)
5400         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5401       else
5402         while (begp < endp && *begp < 0x80) begp++;
5403     label_skip_tail:
5404       if (eol_conversion)
5405         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5406       else
5407         while (begp < endp && *(endp - 1) < 0x80) endp--;
5408       break;
5409
5410     default:
5411       abort ();
5412     }
5413
5414   *beg += begp - begp_orig;
5415   *end += endp - endp_orig;
5416   return;
5417 }
5418
/* Shrinking the conversion region has some overhead of its own, so
   it is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* If the region between *BEG and *END is longer than the threshold
   above, shrink it with shrink_encoding_region (when ENCODEP is
   nonzero) or shrink_decoding_region (when ENCODEP is zero).  BEG
   and END are pointers to the positions; CODING and STR are passed
   through unchanged.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
5432
5433 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5434    Vlast_coding_system_used and the remaining elements are buffers to
5435    kill.  */
5436 static Lisp_Object
5437 code_convert_region_unwind (arg)
5438      Lisp_Object arg;
5439 {
5440   struct gcpro gcpro1;
5441   GCPRO1 (arg);
5442
5443   inhibit_pre_post_conversion = 0;
5444   Vlast_coding_system_used = XCAR (arg);
5445   for (arg = XCDR (arg); CONSP (arg); arg = XCDR (arg))
5446     Fkill_buffer (XCAR (arg));
5447
5448   UNGCPRO;
5449   return Qnil;
5450 }
5451
5452 /* Store information about all compositions in the range FROM and TO
5453    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5454    buffer or a string, defaults to the current buffer.  */
5455
5456 void
5457 coding_save_composition (coding, from, to, obj)
5458      struct coding_system *coding;
5459      int from, to;
5460      Lisp_Object obj;
5461 {
5462   Lisp_Object prop;
5463   int start, end;
5464
5465   if (coding->composing == COMPOSITION_DISABLED)
5466     return;
5467   if (!coding->cmp_data)
5468     coding_allocate_composition_data (coding, from);
5469   if (!find_composition (from, to, &start, &end, &prop, obj)
5470       || end > to)
5471     return;
5472   if (start < from
5473       && (!find_composition (end, to, &start, &end, &prop, obj)
5474           || end > to))
5475     return;
5476   coding->composing = COMPOSITION_NO;
5477   do
5478     {
5479       if (COMPOSITION_VALID_P (start, end, prop))
5480         {
5481           enum composition_method method = COMPOSITION_METHOD (prop);
5482           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5483               >= COMPOSITION_DATA_SIZE)
5484             coding_allocate_composition_data (coding, from);
5485           /* For relative composition, we remember start and end
5486              positions, for the other compositions, we also remember
5487              components.  */
5488           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5489           if (method != COMPOSITION_RELATIVE)
5490             {
5491               /* We must store a*/
5492               Lisp_Object val, ch;
5493
5494               val = COMPOSITION_COMPONENTS (prop);
5495               if (CONSP (val))
5496                 while (CONSP (val))
5497                   {
5498                     ch = XCAR (val), val = XCDR (val);
5499                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5500                   }
5501               else if (VECTORP (val) || STRINGP (val))
5502                 {
5503                   int len = (VECTORP (val)
5504                              ? XVECTOR (val)->size : SCHARS (val));
5505                   int i;
5506                   for (i = 0; i < len; i++)
5507                     {
5508                       ch = (STRINGP (val)
5509                             ? Faref (val, make_number (i))
5510                             : XVECTOR (val)->contents[i]);
5511                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5512                     }
5513                 }
5514               else              /* INTEGERP (val) */
5515                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5516             }
5517           CODING_ADD_COMPOSITION_END (coding, end - from);
5518         }
5519       start = end;
5520     }
5521   while (start < to
5522          && find_composition (start, to, &start, &end, &prop, obj)
5523          && end <= to);
5524
5525   /* Make coding->cmp_data point to the first memory block.  */
5526   while (coding->cmp_data->prev)
5527     coding->cmp_data = coding->cmp_data->prev;
5528   coding->cmp_data_start = 0;
5529 }
5530
5531 /* Reflect the saved information about compositions to OBJ.
5532    CODING->cmp_data points to a memory block for the information.  OBJ
5533    is a buffer or a string, defaults to the current buffer.  */
5534
5535 void
5536 coding_restore_composition (coding, obj)
5537      struct coding_system *coding;
5538      Lisp_Object obj;
5539 {
5540   struct composition_data *cmp_data = coding->cmp_data;
5541
5542   if (!cmp_data)
5543     return;
5544
5545   while (cmp_data->prev)
5546     cmp_data = cmp_data->prev;
5547
5548   while (cmp_data)
5549     {
5550       int i;
5551
5552       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5553            i += cmp_data->data[i])
5554         {
5555           int *data = cmp_data->data + i;
5556           enum composition_method method = (enum composition_method) data[3];
5557           Lisp_Object components;
5558
5559           if (data[0] < 0 || i + data[0] > cmp_data->used)
5560             /* Invalid composition data.  */
5561             break;
5562
5563           if (method == COMPOSITION_RELATIVE)
5564             components = Qnil;
5565           else
5566             {
5567               int len = data[0] - 4, j;
5568               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5569
5570               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5571                   && len % 2 == 0)
5572                 len --;
5573               if (len < 1)
5574                 /* Invalid composition data.  */
5575                 break;
5576               for (j = 0; j < len; j++)
5577                 args[j] = make_number (data[4 + j]);
5578               components = (method == COMPOSITION_WITH_ALTCHARS
5579                             ? Fstring (len, args)
5580                             : Fvector (len, args));
5581             }
5582           compose_text (data[1], data[2], components, Qnil, obj);
5583         }
5584       cmp_data = cmp_data->next;
5585     }
5586 }
5587
5588 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5589    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5590    coding system CODING, and return the status code of code conversion
5591    (currently, this value has no meaning).
5592
5593    How many characters (and bytes) are converted to how many
5594    characters (and bytes) are recorded in members of the structure
5595    CODING.
5596
5597    If REPLACE is nonzero, we do various things as if the original text
5598    is deleted and a new text is inserted.  See the comments in
5599    replace_range (insdel.c) to know what we are doing.
5600
5601    If REPLACE is zero, it is assumed that the source text is unibyte.
5602    Otherwise, it is assumed that the source text is multibyte.  */
5603
5604 int
5605 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5606      int from, from_byte, to, to_byte, encodep, replace;
5607      struct coding_system *coding;
5608 {
5609   int len = to - from, len_byte = to_byte - from_byte;
5610   int nchars_del = 0, nbytes_del = 0;
5611   int require, inserted, inserted_byte;
5612   int head_skip, tail_skip, total_skip = 0;
5613   Lisp_Object saved_coding_symbol;
5614   int first = 1;
5615   unsigned char *src, *dst;
5616   Lisp_Object deletion;
5617   int orig_point = PT, orig_len = len;
5618   int prev_Z;
5619   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5620
5621   deletion = Qnil;
5622   saved_coding_symbol = coding->symbol;
5623
5624   if (from < PT && PT < to)
5625     {
5626       TEMP_SET_PT_BOTH (from, from_byte);
5627       orig_point = from;
5628     }
5629
5630   if (replace)
5631     {
5632       int saved_from = from;
5633       int saved_inhibit_modification_hooks;
5634
5635       prepare_to_modify_buffer (from, to, &from);
5636       if (saved_from != from)
5637         {
5638           to = from + len;
5639           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5640           len_byte = to_byte - from_byte;
5641         }
5642
5643       /* The code conversion routine can not preserve text properties
5644          for now.  So, we must remove all text properties in the
5645          region.  Here, we must suppress all modification hooks.  */
5646       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5647       inhibit_modification_hooks = 1;
5648       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5649       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5650     }
5651
5652   coding->heading_ascii = 0;
5653
5654   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5655     {
5656       /* We must detect encoding of text and eol format.  */
5657
5658       if (from < GPT && to > GPT)
5659         move_gap_both (from, from_byte);
5660       if (coding->type == coding_type_undecided)
5661         {
5662           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5663           if (coding->type == coding_type_undecided)
5664             {
5665               /* It seems that the text contains only ASCII, but we
5666                  should not leave it undecided because the deeper
5667                  decoding routine (decode_coding) tries to detect the
5668                  encodings again in vain.  */
5669               coding->type = coding_type_emacs_mule;
5670               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5671               /* As emacs-mule decoder will handle composition, we
5672                  need this setting to allocate coding->cmp_data
5673                  later.  */
5674               coding->composing = COMPOSITION_NO;
5675             }
5676         }
5677       if (coding->eol_type == CODING_EOL_UNDECIDED
5678           && coding->type != coding_type_ccl)
5679         {
5680           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5681           if (coding->eol_type == CODING_EOL_UNDECIDED)
5682             coding->eol_type = CODING_EOL_LF;
5683           /* We had better recover the original eol format if we
5684              encounter an inconsistent eol format while decoding.  */
5685           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5686         }
5687     }
5688
5689   /* Now we convert the text.  */
5690
5691   /* For encoding, we must process pre-write-conversion in advance.  */
5692   if (! inhibit_pre_post_conversion
5693       && encodep
5694       && SYMBOLP (coding->pre_write_conversion)
5695       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5696     {
5697       /* The function in pre-write-conversion may put a new text in a
5698          new buffer.  */
5699       struct buffer *prev = current_buffer;
5700       Lisp_Object new;
5701
5702       record_unwind_protect (code_convert_region_unwind,
5703                              Fcons (Vlast_coding_system_used, Qnil));
5704       /* We should not call any more pre-write/post-read-conversion
5705          functions while this pre-write-conversion is running.  */
5706       inhibit_pre_post_conversion = 1;
5707       call2 (coding->pre_write_conversion,
5708              make_number (from), make_number (to));
5709       inhibit_pre_post_conversion = 0;
5710       /* Discard the unwind protect.  */
5711       specpdl_ptr--;
5712
5713       if (current_buffer != prev)
5714         {
5715           len = ZV - BEGV;
5716           new = Fcurrent_buffer ();
5717           set_buffer_internal_1 (prev);
5718           del_range_2 (from, from_byte, to, to_byte, 0);
5719           TEMP_SET_PT_BOTH (from, from_byte);
5720           insert_from_buffer (XBUFFER (new), 1, len, 0);
5721           Fkill_buffer (new);
5722           if (orig_point >= to)
5723             orig_point += len - orig_len;
5724           else if (orig_point > from)
5725             orig_point = from;
5726           orig_len = len;
5727           to = from + len;
5728           from_byte = CHAR_TO_BYTE (from);
5729           to_byte = CHAR_TO_BYTE (to);
5730           len_byte = to_byte - from_byte;
5731           TEMP_SET_PT_BOTH (from, from_byte);
5732         }
5733     }
5734
5735   if (replace)
5736     {
5737       if (! EQ (current_buffer->undo_list, Qt))
5738         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5739       else
5740         {
5741           nchars_del = to - from;
5742           nbytes_del = to_byte - from_byte;
5743         }
5744     }
5745
5746   if (coding->composing != COMPOSITION_DISABLED)
5747     {
5748       if (encodep)
5749         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5750       else
5751         coding_allocate_composition_data (coding, from);
5752     }
5753
5754   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
5755      if we must run CCL program or there are compositions to
5756      encode.  */
5757   if (coding->type != coding_type_ccl
5758       && (! coding->cmp_data || coding->cmp_data->used == 0))
5759     {
5760       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5761
5762       if (from < GPT && GPT < to)
5763         move_gap_both (from, from_byte);
5764       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5765       if (from_byte == to_byte
5766           && (encodep || NILP (coding->post_read_conversion))
5767           && ! CODING_REQUIRE_FLUSHING (coding))
5768         {
5769           coding->produced = len_byte;
5770           coding->produced_char = len;
5771           if (!replace)
5772             /* We must record and adjust for this new text now.  */
5773             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5774           coding_free_composition_data (coding);
5775           return 0;
5776         }
5777
5778       head_skip = from_byte - from_byte_orig;
5779       tail_skip = to_byte_orig - to_byte;
5780       total_skip = head_skip + tail_skip;
5781       from += head_skip;
5782       to -= tail_skip;
5783       len -= total_skip; len_byte -= total_skip;
5784     }
5785
5786   /* For conversion, we must put the gap before the text in addition to
5787      making the gap larger for efficient decoding.  The required gap
5788      size starts from 2000 which is the magic number used in make_gap.
5789      But, after one batch of conversion, it will be incremented if we
     find that it is not enough.  */
5791   require = 2000;
5792
5793   if (GAP_SIZE  < require)
5794     make_gap (require - GAP_SIZE);
5795   move_gap_both (from, from_byte);
5796
5797   inserted = inserted_byte = 0;
5798
5799   GAP_SIZE += len_byte;
5800   ZV -= len;
5801   Z -= len;
5802   ZV_BYTE -= len_byte;
5803   Z_BYTE -= len_byte;
5804
5805   if (GPT - BEG < BEG_UNCHANGED)
5806     BEG_UNCHANGED = GPT - BEG;
5807   if (Z - GPT < END_UNCHANGED)
5808     END_UNCHANGED = Z - GPT;
5809
5810   if (!encodep && coding->src_multibyte)
5811     {
5812       /* Decoding routines expects that the source text is unibyte.
5813          We must convert 8-bit characters of multibyte form to
5814          unibyte.  */
5815       int len_byte_orig = len_byte;
5816       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5817       if (len_byte < len_byte_orig)
5818         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5819                     len_byte);
5820       coding->src_multibyte = 0;
5821     }
5822
5823   for (;;)
5824     {
5825       int result;
5826
5827       /* The buffer memory is now:
5828          +--------+converted-text+---------+-------original-text-------+---+
5829          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5830                   |<---------------------- GAP ----------------------->|  */
5831       src = GAP_END_ADDR - len_byte;
5832       dst = GPT_ADDR + inserted_byte;
5833
5834       if (encodep)
5835         result = encode_coding (coding, src, dst, len_byte, 0);
5836       else
5837         {
5838           if (coding->composing != COMPOSITION_DISABLED)
5839             coding->cmp_data->char_offset = from + inserted;
5840           result = decode_coding (coding, src, dst, len_byte, 0);
5841         }
5842
5843       /* The buffer memory is now:
5844          +--------+-------converted-text----+--+------original-text----+---+
5845          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5846                   |<---------------------- GAP ----------------------->|  */
5847
5848       inserted += coding->produced_char;
5849       inserted_byte += coding->produced;
5850       len_byte -= coding->consumed;
5851
5852       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5853         {
5854           coding_allocate_composition_data (coding, from + inserted);
5855           continue;
5856         }
5857
5858       src += coding->consumed;
5859       dst += coding->produced;
5860
5861       if (result == CODING_FINISH_NORMAL)
5862         {
5863           src += len_byte;
5864           break;
5865         }
5866       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5867         {
5868           unsigned char *pend = dst, *p = pend - inserted_byte;
5869           Lisp_Object eol_type;
5870
5871           /* Encode LFs back to the original eol format (CR or CRLF).  */
5872           if (coding->eol_type == CODING_EOL_CR)
5873             {
5874               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5875             }
5876           else
5877             {
5878               int count = 0;
5879
5880               while (p < pend) if (*p++ == '\n') count++;
5881               if (src - dst < count)
5882                 {
5883                   /* We don't have sufficient room for encoding LFs
5884                      back to CRLF.  We must record converted and
5885                      not-yet-converted text back to the buffer
5886                      content, enlarge the gap, then record them out of
5887                      the buffer contents again.  */
5888                   int add = len_byte + inserted_byte;
5889
5890                   GAP_SIZE -= add;
5891                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5892                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5893                   make_gap (count - GAP_SIZE);
5894                   GAP_SIZE += add;
5895                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5896                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5897                   /* Don't forget to update SRC, DST, and PEND.  */
5898                   src = GAP_END_ADDR - len_byte;
5899                   dst = GPT_ADDR + inserted_byte;
5900                   pend = dst;
5901                 }
5902               inserted += count;
5903               inserted_byte += count;
5904               coding->produced += count;
5905               p = dst = pend + count;
5906               while (count)
5907                 {
5908                   *--p = *--pend;
5909                   if (*p == '\n') count--, *--p = '\r';
5910                 }
5911             }
5912
5913           /* Suppress eol-format conversion in the further conversion.  */
5914           coding->eol_type = CODING_EOL_LF;
5915
5916           /* Set the coding system symbol to that for Unix-like EOL.  */
5917           eol_type = Fget (saved_coding_symbol, Qeol_type);
5918           if (VECTORP (eol_type)
5919               && XVECTOR (eol_type)->size == 3
5920               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5921             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5922           else
5923             coding->symbol = saved_coding_symbol;
5924
5925           continue;
5926         }
5927       if (len_byte <= 0)
5928         {
5929           if (coding->type != coding_type_ccl
5930               || coding->mode & CODING_MODE_LAST_BLOCK)
5931             break;
5932           coding->mode |= CODING_MODE_LAST_BLOCK;
5933           continue;
5934         }
5935       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5936         {
5937           /* The source text ends in invalid codes.  Let's just
5938              make them valid buffer contents, and finish conversion.  */
5939           if (multibyte_p)
5940             {
5941               unsigned char *start = dst;
5942
5943               inserted += len_byte;
5944               while (len_byte--)
5945                 {
5946                   int c = *src++;
5947                   dst += CHAR_STRING (c, dst);
5948                 }
5949
5950               inserted_byte += dst - start;
5951             }
5952           else
5953             {
5954               inserted += len_byte;
5955               inserted_byte += len_byte;
5956               while (len_byte--)
5957                 *dst++ = *src++;
5958             }
5959           break;
5960         }
5961       if (result == CODING_FINISH_INTERRUPT)
5962         {
5963           /* The conversion procedure was interrupted by a user.  */
5964           break;
5965         }
5966       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5967       if (coding->consumed < 1)
5968         {
5969           /* It's quite strange to require more memory without
5970              consuming any bytes.  Perhaps CCL program bug.  */
5971           break;
5972         }
5973       if (first)
5974         {
5975           /* We have just done the first batch of conversion which was
5976              stopped because of insufficient gap.  Let's reconsider the
             required gap size (i.e. SRC - DST) now.
5978
5979              We have converted ORIG bytes (== coding->consumed) into
5980              NEW bytes (coding->produced).  To convert the remaining
5981              LEN bytes, we may need REQUIRE bytes of gap, where:
5982                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5983                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5984              Here, we are sure that NEW >= ORIG.  */
5985
5986           if (coding->produced <= coding->consumed)
5987             {
5988               /* This happens because of CCL-based coding system with
5989                  eol-type CRLF.  */
5990               require = 0;
5991             }
5992           else
5993             {
5994               float ratio = coding->produced - coding->consumed;
5995               ratio /= coding->consumed;
5996               require = len_byte * ratio;
5997             }
5998           first = 0;
5999         }
6000       if ((src - dst) < (require + 2000))
6001         {
6002           /* See the comment above the previous call of make_gap.  */
6003           int add = len_byte + inserted_byte;
6004
6005           GAP_SIZE -= add;
6006           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
6007           GPT += inserted_byte; GPT_BYTE += inserted_byte;
6008           make_gap (require + 2000);
6009           GAP_SIZE += add;
6010           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
6011           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
6012         }
6013     }
6014   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
6015
6016   if (encodep && coding->dst_multibyte)
6017     {
6018       /* The output is unibyte.  We must convert 8-bit characters to
6019          multibyte form.  */
6020       if (inserted_byte * 2 > GAP_SIZE)
6021         {
6022           GAP_SIZE -= inserted_byte;
6023           ZV += inserted_byte; Z += inserted_byte;
6024           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
6025           GPT += inserted_byte; GPT_BYTE += inserted_byte;
6026           make_gap (inserted_byte - GAP_SIZE);
6027           GAP_SIZE += inserted_byte;
6028           ZV -= inserted_byte; Z -= inserted_byte;
6029           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
6030           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
6031         }
6032       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
6033     }
6034
6035   /* If we shrank the conversion area, adjust it now.  */
6036   if (total_skip > 0)
6037     {
6038       if (tail_skip > 0)
6039         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
6040       inserted += total_skip; inserted_byte += total_skip;
6041       GAP_SIZE += total_skip;
6042       GPT -= head_skip; GPT_BYTE -= head_skip;
6043       ZV -= total_skip; ZV_BYTE -= total_skip;
6044       Z -= total_skip; Z_BYTE -= total_skip;
6045       from -= head_skip; from_byte -= head_skip;
6046       to += tail_skip; to_byte += tail_skip;
6047     }
6048
6049   prev_Z = Z;
6050   if (! EQ (current_buffer->undo_list, Qt))
6051     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6052   else
6053     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
6054                                  inserted, inserted_byte);
6055   inserted = Z - prev_Z;
6056
6057   if (!encodep && coding->cmp_data && coding->cmp_data->used)
6058     coding_restore_composition (coding, Fcurrent_buffer ());
6059   coding_free_composition_data (coding);
6060
6061   if (! inhibit_pre_post_conversion
6062       && ! encodep && ! NILP (coding->post_read_conversion))
6063     {
6064       Lisp_Object val;
6065       Lisp_Object saved_coding_system;
6066
6067       if (from != PT)
6068         TEMP_SET_PT_BOTH (from, from_byte);
6069       prev_Z = Z;
6070       record_unwind_protect (code_convert_region_unwind,
6071                              Fcons (Vlast_coding_system_used, Qnil));
6072       saved_coding_system = Vlast_coding_system_used;
6073       Vlast_coding_system_used = coding->symbol;
6074       /* We should not call any more pre-write/post-read-conversion
6075          functions while this post-read-conversion is running.  */
6076       inhibit_pre_post_conversion = 1;
6077       val = call1 (coding->post_read_conversion, make_number (inserted));
6078       inhibit_pre_post_conversion = 0;
6079       coding->symbol = Vlast_coding_system_used;
6080       Vlast_coding_system_used = saved_coding_system;
6081       /* Discard the unwind protect.  */
6082       specpdl_ptr--;
6083       CHECK_NUMBER (val);
6084       inserted += Z - prev_Z;
6085     }
6086
6087   if (orig_point >= from)
6088     {
6089       if (orig_point >= from + orig_len)
6090         orig_point += inserted - orig_len;
6091       else
6092         orig_point = from;
6093       TEMP_SET_PT (orig_point);
6094     }
6095
6096   if (replace)
6097     {
6098       signal_after_change (from, to - from, inserted);
6099       update_compositions (from, from + inserted, CHECK_BORDER);
6100     }
6101
6102   {
6103     coding->consumed = to_byte - from_byte;
6104     coding->consumed_char = to - from;
6105     coding->produced = inserted_byte;
6106     coding->produced_char = inserted;
6107   }
6108
6109   return 0;
6110 }
6111
6112 /* Name (or base name) of work buffer for code conversion.  */
6113 static Lisp_Object Vcode_conversion_workbuf_name;
6114
6115 /* Set the current buffer to the working buffer prepared for
6116    code-conversion.  MULTIBYTE specifies the multibyteness of the
6117    buffer.  Return the buffer we set if it must be killed after use.
6118    Otherwise return Qnil.  */
6119
6120 static Lisp_Object
6121 set_conversion_work_buffer (multibyte)
6122      int multibyte;
6123 {
6124   Lisp_Object buffer, buffer_to_kill;
6125   struct buffer *buf;
6126
6127   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6128   buf = XBUFFER (buffer);
6129   if (buf == current_buffer)
6130     {
6131       /* As we are already in the work buffer, we must generate a new
6132          buffer for the work.  */
6133       Lisp_Object name;
6134
6135       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6136       buffer = buffer_to_kill = Fget_buffer_create (name);
6137       buf = XBUFFER (buffer);
6138     }
6139   else
6140     buffer_to_kill = Qnil;
6141
6142   delete_all_overlays (buf);
6143   buf->directory = current_buffer->directory;
6144   buf->read_only = Qnil;
6145   buf->filename = Qnil;
6146   buf->undo_list = Qt;
6147   eassert (buf->overlays_before == NULL);
6148   eassert (buf->overlays_after == NULL);
6149   set_buffer_internal (buf);
6150   if (BEG != BEGV || Z != ZV)
6151     Fwiden ();
6152   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6153   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6154   return buffer_to_kill;
6155 }
6156
6157 Lisp_Object
6158 run_pre_post_conversion_on_str (str, coding, encodep)
6159      Lisp_Object str;
6160      struct coding_system *coding;
6161      int encodep;
6162 {
6163   int count = SPECPDL_INDEX ();
6164   struct gcpro gcpro1, gcpro2;
6165   int multibyte = STRING_MULTIBYTE (str);
6166   Lisp_Object old_deactivate_mark;
6167   Lisp_Object buffer_to_kill;
6168   Lisp_Object unwind_arg;
6169
6170   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6171   /* It is not crucial to specbind this.  */
6172   old_deactivate_mark = Vdeactivate_mark;
6173   GCPRO2 (str, old_deactivate_mark);
6174
6175   /* We must insert the contents of STR as is without
6176      unibyte<->multibyte conversion.  For that, we adjust the
6177      multibyteness of the working buffer to that of STR.  */
6178   buffer_to_kill = set_conversion_work_buffer (multibyte);
6179   if (NILP (buffer_to_kill))
6180     unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6181   else
6182     unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6183   record_unwind_protect (code_convert_region_unwind, unwind_arg);
6184
6185   insert_from_string (str, 0, 0,
6186                       SCHARS (str), SBYTES (str), 0);
6187   UNGCPRO;
6188   inhibit_pre_post_conversion = 1;
6189   if (encodep)
6190     {
6191       struct buffer *prev = current_buffer;
6192
6193       call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6194       if (prev != current_buffer)
6195         /* We must kill the current buffer too.  */
6196         Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6197     }
6198   else
6199     {
6200       Vlast_coding_system_used = coding->symbol;
6201       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6202       call1 (coding->post_read_conversion, make_number (Z - BEG));
6203       coding->symbol = Vlast_coding_system_used;
6204     }
6205   inhibit_pre_post_conversion = 0;
6206   Vdeactivate_mark = old_deactivate_mark;
6207   str = make_buffer_string (BEG, Z, 1);
6208   return unbind_to (count, str);
6209 }
6210
6211
6212 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6213    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6214    is intended that this function is called from encode_terminal_code,
6215    the pre-write-conversion function is run by safe_call and thus
6216    "Error during redisplay: ..." is logged when an error occurs.
6217
6218    Store the resulting text in *STR and set CODING->produced_char and
6219    CODING->produced to the number of characters and bytes
6220    respectively.  If the size of *STR is too small, enlarge it by
6221    xrealloc and update *STR and *SIZE.  */
6222
6223 void
6224 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6225      unsigned char **str;
6226      int *size, nchars, nbytes;
6227      struct coding_system *coding;
6228 {
6229   struct gcpro gcpro1, gcpro2;
6230   struct buffer *cur = current_buffer;
6231   struct buffer *prev;
6232   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6233   Lisp_Object args[3];
6234   Lisp_Object buffer_to_kill;
6235
6236   /* It is not crucial to specbind this.  */
6237   old_deactivate_mark = Vdeactivate_mark;
6238   old_last_coding_system_used = Vlast_coding_system_used;
6239   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6240
6241   /* We must insert the contents of STR as is without
6242      unibyte<->multibyte conversion.  For that, we adjust the
6243      multibyteness of the working buffer to that of STR.  */
6244   buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6245   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6246   UNGCPRO;
6247   inhibit_pre_post_conversion = 1;
6248   prev = current_buffer;
6249   args[0] = coding->pre_write_conversion;
6250   args[1] = make_number (BEG);
6251   args[2] = make_number (Z);
6252   safe_call (3, args);
6253   inhibit_pre_post_conversion = 0;
6254   Vdeactivate_mark = old_deactivate_mark;
6255   Vlast_coding_system_used = old_last_coding_system_used;
6256   coding->produced_char = Z - BEG;
6257   coding->produced = Z_BYTE - BEG_BYTE;
6258   if (coding->produced > *size)
6259     {
6260       *size = coding->produced;
6261       *str = xrealloc (*str, *size);
6262     }
6263   if (BEG < GPT && GPT < Z)
6264     move_gap (BEG);
6265   bcopy (BEG_ADDR, *str, coding->produced);
6266   coding->src_multibyte
6267     = ! NILP (current_buffer->enable_multibyte_characters);
6268   if (prev != current_buffer)
6269     Fkill_buffer (Fcurrent_buffer ());
6270   set_buffer_internal (cur);
6271   if (! NILP (buffer_to_kill))
6272     Fkill_buffer (buffer_to_kill);
6273 }
6274
6275
6276 Lisp_Object
6277 decode_coding_string (str, coding, nocopy)
6278      Lisp_Object str;
6279      struct coding_system *coding;
6280      int nocopy;
6281 {
6282   int len;
6283   struct conversion_buffer buf;
6284   int from, to_byte;
6285   Lisp_Object saved_coding_symbol;
6286   int result;
6287   int require_decoding;
6288   int shrinked_bytes = 0;
6289   Lisp_Object newstr;
6290   int consumed, consumed_char, produced, produced_char;
6291
6292   from = 0;
6293   to_byte = SBYTES (str);
6294
6295   saved_coding_symbol = coding->symbol;
6296   coding->src_multibyte = STRING_MULTIBYTE (str);
6297   coding->dst_multibyte = 1;
6298   coding->heading_ascii = 0;
6299
6300   if (CODING_REQUIRE_DETECTION (coding))
6301     {
6302       /* See the comments in code_convert_region.  */
6303       if (coding->type == coding_type_undecided)
6304         {
6305           detect_coding (coding, SDATA (str), to_byte);
6306           if (coding->type == coding_type_undecided)
6307             {
6308               coding->type = coding_type_emacs_mule;
6309               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6310               /* As emacs-mule decoder will handle composition, we
6311                  need this setting to allocate coding->cmp_data
6312                  later.  */
6313               coding->composing = COMPOSITION_NO;
6314             }
6315         }
6316       if (coding->eol_type == CODING_EOL_UNDECIDED
6317           && coding->type != coding_type_ccl)
6318         {
6319           saved_coding_symbol = coding->symbol;
6320           detect_eol (coding, SDATA (str), to_byte);
6321           if (coding->eol_type == CODING_EOL_UNDECIDED)
6322             coding->eol_type = CODING_EOL_LF;
6323           /* We had better recover the original eol format if we
6324              encounter an inconsistent eol format while decoding.  */
6325           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6326         }
6327     }
6328
6329   if (coding->type == coding_type_no_conversion
6330       || coding->type == coding_type_raw_text)
6331     coding->dst_multibyte = 0;
6332
6333   require_decoding = CODING_REQUIRE_DECODING (coding);
6334
6335   if (STRING_MULTIBYTE (str))
6336     {
6337       /* Decoding routines expect the source text to be unibyte.  */
6338       str = Fstring_as_unibyte (str);
6339       to_byte = SBYTES (str);
6340       nocopy = 1;
6341       coding->src_multibyte = 0;
6342     }
6343
6344   /* Try to skip the heading and tailing ASCIIs.  */
6345   if (require_decoding && coding->type != coding_type_ccl)
6346     {
6347       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6348                                 0);
6349       if (from == to_byte)
6350         require_decoding = 0;
6351       shrinked_bytes = from + (SBYTES (str) - to_byte);
6352     }
6353
6354   if (!require_decoding
6355       && !(SYMBOLP (coding->post_read_conversion)
6356            && !NILP (Ffboundp (coding->post_read_conversion))))
6357     {
6358       coding->consumed = SBYTES (str);
6359       coding->consumed_char = SCHARS (str);
6360       if (coding->dst_multibyte)
6361         {
6362           str = Fstring_as_multibyte (str);
6363           nocopy = 1;
6364         }
6365       coding->produced = SBYTES (str);
6366       coding->produced_char = SCHARS (str);
6367       return (nocopy ? str : Fcopy_sequence (str));
6368     }
6369
6370   if (coding->composing != COMPOSITION_DISABLED)
6371     coding_allocate_composition_data (coding, from);
6372   len = decoding_buffer_size (coding, to_byte - from);
6373   allocate_conversion_buffer (buf, len);
6374
6375   consumed = consumed_char = produced = produced_char = 0;
6376   while (1)
6377     {
6378       result = decode_coding (coding, SDATA (str) + from + consumed,
6379                               buf.data + produced, to_byte - from - consumed,
6380                               buf.size - produced);
6381       consumed += coding->consumed;
6382       consumed_char += coding->consumed_char;
6383       produced += coding->produced;
6384       produced_char += coding->produced_char;
6385       if (result == CODING_FINISH_NORMAL
6386           || result == CODING_FINISH_INTERRUPT
6387           || (result == CODING_FINISH_INSUFFICIENT_SRC
6388               && coding->consumed == 0))
6389         break;
6390       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6391         coding_allocate_composition_data (coding, from + produced_char);
6392       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6393         extend_conversion_buffer (&buf);
6394       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6395         {
6396           Lisp_Object eol_type;
6397
6398           /* Recover the original EOL format.  */
6399           if (coding->eol_type == CODING_EOL_CR)
6400             {
6401               unsigned char *p;
6402               for (p = buf.data; p < buf.data + produced; p++)
6403                 if (*p == '\n') *p = '\r';
6404             }
6405           else if (coding->eol_type == CODING_EOL_CRLF)
6406             {
6407               int num_eol = 0;
6408               unsigned char *p0, *p1;
6409               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6410                 if (*p0 == '\n') num_eol++;
6411               if (produced + num_eol >= buf.size)
6412                 extend_conversion_buffer (&buf);
6413               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6414                 {
6415                   *--p1 = *--p0;
6416                   if (*p0 == '\n') *--p1 = '\r';
6417                 }
6418               produced += num_eol;
6419               produced_char += num_eol;
6420             }
6421           /* Suppress eol-format conversion in the further conversion.  */
6422           coding->eol_type = CODING_EOL_LF;
6423
6424           /* Set the coding system symbol to that for Unix-like EOL.  */
6425           eol_type = Fget (saved_coding_symbol, Qeol_type);
6426           if (VECTORP (eol_type)
6427               && XVECTOR (eol_type)->size == 3
6428               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6429             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6430           else
6431             coding->symbol = saved_coding_symbol;
6432
6433
6434         }
6435     }
6436
6437   coding->consumed = consumed;
6438   coding->consumed_char = consumed_char;
6439   coding->produced = produced;
6440   coding->produced_char = produced_char;
6441
6442   if (coding->dst_multibyte)
6443     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6444                                            produced + shrinked_bytes);
6445   else
6446     newstr = make_uninit_string (produced + shrinked_bytes);
6447   if (from > 0)
6448     STRING_COPYIN (newstr, 0, SDATA (str), from);
6449   STRING_COPYIN (newstr, from, buf.data, produced);
6450   if (shrinked_bytes > from)
6451     STRING_COPYIN (newstr, from + produced,
6452                    SDATA (str) + to_byte,
6453                    shrinked_bytes - from);
6454   free_conversion_buffer (&buf);
6455
6456   coding->consumed += shrinked_bytes;
6457   coding->consumed_char += shrinked_bytes;
6458   coding->produced += shrinked_bytes;
6459   coding->produced_char += shrinked_bytes;
6460
6461   if (coding->cmp_data && coding->cmp_data->used)
6462     coding_restore_composition (coding, newstr);
6463   coding_free_composition_data (coding);
6464
6465   if (SYMBOLP (coding->post_read_conversion)
6466       && !NILP (Ffboundp (coding->post_read_conversion)))
6467     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6468
6469   return newstr;
6470 }
6471
6472 Lisp_Object
6473 encode_coding_string (str, coding, nocopy)
6474      Lisp_Object str;
6475      struct coding_system *coding;
6476      int nocopy;
6477 {
6478   int len;
6479   struct conversion_buffer buf;
6480   int from, to, to_byte;
6481   int result;
6482   int shrinked_bytes = 0;
6483   Lisp_Object newstr;
6484   int consumed, consumed_char, produced, produced_char;
6485
6486   if (SYMBOLP (coding->pre_write_conversion)
6487       && !NILP (Ffboundp (coding->pre_write_conversion)))
6488     {
6489       str = run_pre_post_conversion_on_str (str, coding, 1);
6490       /* As STR is just newly generated, we don't have to copy it
6491          anymore.  */
6492       nocopy = 1;
6493     }
6494
6495   from = 0;
6496   to = SCHARS (str);
6497   to_byte = SBYTES (str);
6498
6499   /* Encoding routines determine the multibyteness of the source text
6500      by coding->src_multibyte.  */
6501   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6502   coding->dst_multibyte = 0;
6503   if (! CODING_REQUIRE_ENCODING (coding))
6504     goto no_need_of_encoding;
6505
6506   if (coding->composing != COMPOSITION_DISABLED)
6507     coding_save_composition (coding, from, to, str);
6508
6509   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6510      if we must run CCL program or there are compositions to
6511      encode.  */
6512   coding->heading_ascii = 0;
6513   if (coding->type != coding_type_ccl
6514       && (! coding->cmp_data || coding->cmp_data->used == 0))
6515     {
6516       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6517                                 1);
6518       if (from == to_byte)
6519         {
6520           coding_free_composition_data (coding);
6521           goto no_need_of_encoding;
6522         }
6523       shrinked_bytes = from + (SBYTES (str) - to_byte);
6524     }
6525
6526   len = encoding_buffer_size (coding, to_byte - from);
6527   allocate_conversion_buffer (buf, len);
6528
6529   consumed = consumed_char = produced = produced_char = 0;
6530   while (1)
6531     {
6532       result = encode_coding (coding, SDATA (str) + from + consumed,
6533                               buf.data + produced, to_byte - from - consumed,
6534                               buf.size - produced);
6535       consumed += coding->consumed;
6536       consumed_char += coding->consumed_char;
6537       produced += coding->produced;
6538       produced_char += coding->produced_char;
6539       if (result == CODING_FINISH_NORMAL
6540           || result == CODING_FINISH_INTERRUPT
6541           || (result == CODING_FINISH_INSUFFICIENT_SRC
6542               && coding->consumed == 0))
6543         break;
6544       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6545       extend_conversion_buffer (&buf);
6546     }
6547
6548   coding->consumed = consumed;
6549   coding->consumed_char = consumed_char;
6550   coding->produced = produced;
6551   coding->produced_char = produced_char;
6552
6553   newstr = make_uninit_string (produced + shrinked_bytes);
6554   if (from > 0)
6555     STRING_COPYIN (newstr, 0, SDATA (str), from);
6556   STRING_COPYIN (newstr, from, buf.data, produced);
6557   if (shrinked_bytes > from)
6558     STRING_COPYIN (newstr, from + produced,
6559                    SDATA (str) + to_byte,
6560                    shrinked_bytes - from);
6561
6562   free_conversion_buffer (&buf);
6563   coding_free_composition_data (coding);
6564
6565   return newstr;
6566
6567  no_need_of_encoding:
6568   coding->consumed = SBYTES (str);
6569   coding->consumed_char = SCHARS (str);
6570   if (STRING_MULTIBYTE (str))
6571     {
6572       if (nocopy)
6573         /* We are sure that STR doesn't contain a multibyte
6574            character.  */
6575         STRING_SET_UNIBYTE (str);
6576       else
6577         {
6578           str = Fstring_as_unibyte (str);
6579           nocopy = 1;
6580         }
6581     }
6582   coding->produced = SBYTES (str);
6583   coding->produced_char = SCHARS (str);
6584   return (nocopy ? str : Fcopy_sequence (str));
6585 }
6586
6587 \f
6588 #ifdef emacs
6589 /*** 8. Emacs Lisp library functions ***/
6590
6591 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6592        doc: /* Return t if OBJECT is nil or a coding-system.
6593 See the documentation of `make-coding-system' for information
6594 about coding-system objects.  */)
6595      (obj)
6596      Lisp_Object obj;
6597 {
6598   if (NILP (obj))
6599     return Qt;
6600   if (!SYMBOLP (obj))
6601     return Qnil;
6602   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6603     return Qt;
6604   /* Get coding-spec vector for OBJ.  */
6605   obj = Fget (obj, Qcoding_system);
6606   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6607           ? Qt : Qnil);
6608 }
6609
6610 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6611        Sread_non_nil_coding_system, 1, 1, 0,
6612        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6613      (prompt)
6614      Lisp_Object prompt;
6615 {
6616   Lisp_Object val;
6617   do
6618     {
6619       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6620                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6621     }
6622   while (SCHARS (val) == 0);
6623   return (Fintern (val, Qnil));
6624 }
6625
6626 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6627        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6628 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
6629 Ignores case when completing coding systems (all Emacs coding systems
6630 are lower-case).  */)
6631      (prompt, default_coding_system)
6632      Lisp_Object prompt, default_coding_system;
6633 {
6634   Lisp_Object val;
6635   int count = SPECPDL_INDEX ();
6636
6637   if (SYMBOLP (default_coding_system))
6638     default_coding_system = SYMBOL_NAME (default_coding_system);
6639   specbind (Qcompletion_ignore_case, Qt);
6640   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6641                           Qt, Qnil, Qcoding_system_history,
6642                           default_coding_system, Qnil);
6643   unbind_to (count, Qnil);
6644   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6645 }
6646
6647 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6648        1, 1, 0,
6649        doc: /* Check validity of CODING-SYSTEM.
6650 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6651 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6652 The value of this property should be a vector of length 5.  */)
6653      (coding_system)
6654      Lisp_Object coding_system;
6655 {
6656   Lisp_Object define_form;
6657
6658   define_form = Fget (coding_system, Qcoding_system_define_form);
6659   if (! NILP (define_form))
6660     {
6661       Fput (coding_system, Qcoding_system_define_form, Qnil);
6662       safe_eval (define_form);
6663     }
6664   if (!NILP (Fcoding_system_p (coding_system)))
6665     return coding_system;
6666   xsignal1 (Qcoding_system_error, coding_system);
6667 }
6668 \f
6669 Lisp_Object
6670 detect_coding_system (src, src_bytes, highest, multibytep)
6671      const unsigned char *src;
6672      int src_bytes, highest;
6673      int multibytep;
6674 {
6675   int coding_mask, eol_type;
6676   Lisp_Object val, tmp;
6677   int dummy;
6678
6679   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6680   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6681   if (eol_type == CODING_EOL_INCONSISTENT)
6682     eol_type = CODING_EOL_UNDECIDED;
6683
6684   if (!coding_mask)
6685     {
6686       val = Qundecided;
6687       if (eol_type != CODING_EOL_UNDECIDED)
6688         {
6689           Lisp_Object val2;
6690           val2 = Fget (Qundecided, Qeol_type);
6691           if (VECTORP (val2))
6692             val = XVECTOR (val2)->contents[eol_type];
6693         }
6694       return (highest ? val : Fcons (val, Qnil));
6695     }
6696
6697   /* At first, gather possible coding systems in VAL.  */
6698   val = Qnil;
6699   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6700     {
6701       Lisp_Object category_val, category_index;
6702
6703       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6704       category_val = Fsymbol_value (XCAR (tmp));
6705       if (!NILP (category_val)
6706           && NATNUMP (category_index)
6707           && (coding_mask & (1 << XFASTINT (category_index))))
6708         {
6709           val = Fcons (category_val, val);
6710           if (highest)
6711             break;
6712         }
6713     }
6714   if (!highest)
6715     val = Fnreverse (val);
6716
6717   /* Then, replace the elements with subsidiary coding systems.  */
6718   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6719     {
6720       if (eol_type != CODING_EOL_UNDECIDED
6721           && eol_type != CODING_EOL_INCONSISTENT)
6722         {
6723           Lisp_Object eol;
6724           eol = Fget (XCAR (tmp), Qeol_type);
6725           if (VECTORP (eol))
6726             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6727         }
6728     }
6729   return (highest ? XCAR (val) : val);
6730 }
6731
6732 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6733        2, 3, 0,
6734        doc: /* Detect how the byte sequence in the region is encoded.
6735 Return a list of possible coding systems used on decoding a byte
6736 sequence containing the bytes in the region between START and END when
6737 the coding system `undecided' is specified.  The list is ordered by
6738 priority decided in the current language environment.
6739
6740 If only ASCII characters are found (except for such ISO-2022 control
6741 characters ISO-2022 as ESC), it returns a list of single element
6742 `undecided' or its subsidiary coding system according to a detected
6743 end-of-line format.
6744
6745 If optional argument HIGHEST is non-nil, return the coding system of
6746 highest priority.  */)
6747      (start, end, highest)
6748      Lisp_Object start, end, highest;
6749 {
6750   int from, to;
6751   int from_byte, to_byte;
6752   int include_anchor_byte = 0;
6753
6754   CHECK_NUMBER_COERCE_MARKER (start);
6755   CHECK_NUMBER_COERCE_MARKER (end);
6756
6757   validate_region (&start, &end);
6758   from = XINT (start), to = XINT (end);
6759   from_byte = CHAR_TO_BYTE (from);
6760   to_byte = CHAR_TO_BYTE (to);
6761
6762   if (from < GPT && to >= GPT)
6763     move_gap_both (to, to_byte);
6764   /* If we an anchor byte `\0' follows the region, we include it in
6765      the detecting source.  Then code detectors can handle the tailing
6766      byte sequence more accurately.
6767
6768      Fix me: This is not a perfect solution.  It is better that we
6769      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6770   */
6771   if (to == Z || (to == GPT && GAP_SIZE > 0))
6772     include_anchor_byte = 1;
6773   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6774                                to_byte - from_byte + include_anchor_byte,
6775                                !NILP (highest),
6776                                !NILP (current_buffer
6777                                       ->enable_multibyte_characters));
6778 }
6779
6780 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6781        1, 2, 0,
6782        doc: /* Detect how the byte sequence in STRING is encoded.
6783 Return a list of possible coding systems used on decoding a byte
6784 sequence containing the bytes in STRING when the coding system
6785 `undecided' is specified.  The list is ordered by priority decided in
6786 the current language environment.
6787
6788 If only ASCII characters are found (except for such ISO-2022 control
6789 characters ISO-2022 as ESC), it returns a list of single element
6790 `undecided' or its subsidiary coding system according to a detected
6791 end-of-line format.
6792
6793 If optional argument HIGHEST is non-nil, return the coding system of
6794 highest priority.  */)
6795      (string, highest)
6796      Lisp_Object string, highest;
6797 {
6798   CHECK_STRING (string);
6799
6800   return detect_coding_system (SDATA (string),
6801                                /* "+ 1" is to include the anchor byte
6802                                   `\0'.  With this, code detectors can
6803                                   handle the tailing bytes more
6804                                   accurately.  */
6805                                SBYTES (string) + 1,
6806                                !NILP (highest),
6807                                STRING_MULTIBYTE (string));
6808 }
6809
6810 /*  Subroutine for Ffind_coding_systems_region_internal.
6811
6812     Return a list of coding systems that safely encode the multibyte
6813     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6814     possible coding systems.  If it is nil, it means that we have not
6815     yet found any coding systems.
6816
6817     WORK_TABLE a char-table of which element is set to t once the
6818     element is looked up.
6819
6820     If a non-ASCII single byte char is found, set
6821     *single_byte_char_found to 1.  */
6822
6823 static Lisp_Object
6824 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6825      unsigned char *p, *pend;
6826      Lisp_Object safe_codings, work_table;
6827      int *single_byte_char_found;
6828 {
6829   int c, len;
6830   Lisp_Object val, ch;
6831   Lisp_Object prev, tail;
6832
6833   if (NILP (safe_codings))
6834     goto done_safe_codings;
6835   while (p < pend)
6836     {
6837       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6838       p += len;
6839       if (ASCII_BYTE_P (c))
6840         /* We can ignore ASCII characters here.  */
6841         continue;
6842       if (SINGLE_BYTE_CHAR_P (c))
6843         *single_byte_char_found = 1;
6844       /* Check the safe coding systems for C.  */
6845       ch = make_number (c);
6846       val = Faref (work_table, ch);
6847       if (EQ (val, Qt))
6848         /* This element was already checked.  Ignore it.  */
6849         continue;
6850       /* Remember that we checked this element.  */
6851       Faset (work_table, ch, Qt);
6852
6853       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6854         {
6855           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6856           int encodable;
6857
6858           elt = XCAR (tail);
6859           if (CONSP (XCDR (elt)))
6860             {
6861               /* This entry has this format now:
6862                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6863                           ACCEPT-LATIN-EXTRA ) */
6864               val = XCDR (elt);
6865               encodable = ! NILP (Faref (XCAR (val), ch));
6866               if (! encodable)
6867                 {
6868                   val = XCDR (val);
6869                   translation_table = XCAR (val);
6870                   hash_table = XCAR (XCDR (val));
6871                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6872                 }
6873             }
6874           else
6875             {
6876               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6877               encodable = ! NILP (Faref (XCDR (elt), ch));
6878               if (! encodable)
6879                 {
6880                   /* Transform the format to:
6881                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6882                        ACCEPT-LATIN-EXTRA )  */
6883                   val = Fget (XCAR (elt), Qcoding_system);
6884                   translation_table
6885                     = Fplist_get (AREF (val, 3),
6886                                   Qtranslation_table_for_encode);
6887                   if (SYMBOLP (translation_table))
6888                     translation_table = Fget (translation_table,
6889                                               Qtranslation_table);
6890                   hash_table
6891                     = (CHAR_TABLE_P (translation_table)
6892                        ? XCHAR_TABLE (translation_table)->extras[1]
6893                        : Qnil);
6894                   accept_latin_extra
6895                     = ((EQ (AREF (val, 0), make_number (2))
6896                         && VECTORP (AREF (val, 4)))
6897                        ? AREF (AREF (val, 4), 16)
6898                        : Qnil);
6899                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6900                                         translation_table, hash_table,
6901                                         accept_latin_extra));
6902                 }
6903             }
6904
6905           if (! encodable
6906               && ((CHAR_TABLE_P (translation_table)
6907                    && ! NILP (Faref (translation_table, ch)))
6908                   || (HASH_TABLE_P (hash_table)
6909                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6910                   || (SINGLE_BYTE_CHAR_P (c)
6911                       && ! NILP (accept_latin_extra)
6912                       && VECTORP (Vlatin_extra_code_table)
6913                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6914             encodable = 1;
6915           if (encodable)
6916             prev = tail;
6917           else
6918             {
6919               /* Exclude this coding system from SAFE_CODINGS.  */
6920               if (EQ (tail, safe_codings))
6921                 {
6922                   safe_codings = XCDR (safe_codings);
6923                   if (NILP (safe_codings))
6924                     goto done_safe_codings;
6925                 }
6926               else
6927                 XSETCDR (prev, XCDR (tail));
6928             }
6929         }
6930     }
6931
6932  done_safe_codings:
6933   /* If the above loop was terminated before P reaches PEND, it means
6934      SAFE_CODINGS was set to nil.  If we have not yet found an
6935      non-ASCII single-byte char, check it now.  */
6936   if (! *single_byte_char_found)
6937     while (p < pend)
6938       {
6939         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6940         p += len;
6941         if (! ASCII_BYTE_P (c)
6942             && SINGLE_BYTE_CHAR_P (c))
6943           {
6944             *single_byte_char_found = 1;
6945             break;
6946           }
6947       }
6948   return safe_codings;
6949 }
6950
6951 DEFUN ("find-coding-systems-region-internal",
6952        Ffind_coding_systems_region_internal,
6953        Sfind_coding_systems_region_internal, 2, 2, 0,
6954        doc: /* Internal use only.  */)
6955      (start, end)
6956      Lisp_Object start, end;
6957 {
6958   Lisp_Object work_table, safe_codings;
6959   int non_ascii_p = 0;
6960   int single_byte_char_found = 0;
6961   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6962
6963   if (STRINGP (start))
6964     {
6965       if (!STRING_MULTIBYTE (start))
6966         return Qt;
6967       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6968       p2 = p2end = p1end;
6969       if (SCHARS (start) != SBYTES (start))
6970         non_ascii_p = 1;
6971     }
6972   else
6973     {
6974       int from, to, stop;
6975
6976       CHECK_NUMBER_COERCE_MARKER (start);
6977       CHECK_NUMBER_COERCE_MARKER (end);
6978       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6979         args_out_of_range (start, end);
6980       if (NILP (current_buffer->enable_multibyte_characters))
6981         return Qt;
6982       from = CHAR_TO_BYTE (XINT (start));
6983       to = CHAR_TO_BYTE (XINT (end));
6984       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6985       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6986       if (stop == to)
6987         p2 = p2end = p1end;
6988       else
6989         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6990       if (XINT (end) - XINT (start) != to - from)
6991         non_ascii_p = 1;
6992     }
6993
6994   if (!non_ascii_p)
6995     {
6996       /* We are sure that the text contains no multibyte character.
6997          Check if it contains eight-bit-graphic.  */
6998       p = p1;
6999       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
7000       if (p == p1end)
7001         {
7002           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
7003           if (p == p2end)
7004             return Qt;
7005         }
7006     }
7007
7008   /* The text contains non-ASCII characters.  */
7009
7010   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
7011   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
7012
7013   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
7014                                     &single_byte_char_found);
7015   if (p2 < p2end)
7016     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
7017                                       &single_byte_char_found);
7018   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
7019     safe_codings = Qt;
7020   else
7021     {
7022       /* Turn safe_codings to a list of coding systems... */
7023       Lisp_Object val;
7024
7025       if (single_byte_char_found)
7026         /* ... and append these for eight-bit chars.  */
7027         val = Fcons (Qraw_text,
7028                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
7029       else
7030         /* ... and append generic coding systems.  */
7031         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
7032
7033       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
7034         val = Fcons (XCAR (XCAR (safe_codings)), val);
7035       safe_codings = val;
7036     }
7037
7038   return safe_codings;
7039 }
7040
7041
7042 /* Search from position POS for such characters that are unencodable
7043    accoding to SAFE_CHARS, and return a list of their positions.  P
7044    points where in the memory the character at POS exists.  Limit the
7045    search at PEND or when Nth unencodable characters are found.
7046
7047    If SAFE_CHARS is a char table, an element for an unencodable
7048    character is nil.
7049
7050    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
7051
7052    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
7053    eight-bit-graphic characters are unencodable.  */
7054
7055 static Lisp_Object
7056 unencodable_char_position (safe_chars, pos, p, pend, n)
7057      Lisp_Object safe_chars;
7058      int pos;
7059      unsigned char *p, *pend;
7060      int n;
7061 {
7062   Lisp_Object pos_list;
7063
7064   pos_list = Qnil;
7065   while (p < pend)
7066     {
7067       int len;
7068       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7069
7070       if (c >= 128
7071           && (CHAR_TABLE_P (safe_chars)
7072               ? NILP (CHAR_TABLE_REF (safe_chars, c))
7073               : (NILP (safe_chars) || c < 256)))
7074         {
7075           pos_list = Fcons (make_number (pos), pos_list);
7076           if (--n <= 0)
7077             break;
7078         }
7079       pos++;
7080       p += len;
7081     }
7082   return Fnreverse (pos_list);
7083 }
7084
7085
7086 DEFUN ("unencodable-char-position", Funencodable_char_position,
7087        Sunencodable_char_position, 3, 5, 0,
7088        doc: /*
7089 Return position of first un-encodable character in a region.
7090 START and END specfiy the region and CODING-SYSTEM specifies the
7091 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7092
7093 If optional 4th argument COUNT is non-nil, it specifies at most how
7094 many un-encodable characters to search.  In this case, the value is a
7095 list of positions.
7096
7097 If optional 5th argument STRING is non-nil, it is a string to search
7098 for un-encodable characters.  In that case, START and END are indexes
7099 to the string.  */)
7100      (start, end, coding_system, count, string)
7101      Lisp_Object start, end, coding_system, count, string;
7102 {
7103   int n;
7104   Lisp_Object safe_chars;
7105   struct coding_system coding;
7106   Lisp_Object positions;
7107   int from, to;
7108   unsigned char *p, *pend;
7109
7110   if (NILP (string))
7111     {
7112       validate_region (&start, &end);
7113       from = XINT (start);
7114       to = XINT (end);
7115       if (NILP (current_buffer->enable_multibyte_characters))
7116         return Qnil;
7117       p = CHAR_POS_ADDR (from);
7118       if (to == GPT)
7119         pend = GPT_ADDR;
7120       else
7121         pend = CHAR_POS_ADDR (to);
7122     }
7123   else
7124     {
7125       CHECK_STRING (string);
7126       CHECK_NATNUM (start);
7127       CHECK_NATNUM (end);
7128       from = XINT (start);
7129       to = XINT (end);
7130       if (from > to
7131           || to > SCHARS (string))
7132         args_out_of_range_3 (string, start, end);
7133       if (! STRING_MULTIBYTE (string))
7134         return Qnil;
7135       p = SDATA (string) + string_char_to_byte (string, from);
7136       pend = SDATA (string) + string_char_to_byte (string, to);
7137     }
7138
7139   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7140
7141   if (NILP (count))
7142     n = 1;
7143   else
7144     {
7145       CHECK_NATNUM (count);
7146       n = XINT (count);
7147     }
7148
7149   if (coding.type == coding_type_no_conversion
7150       || coding.type == coding_type_raw_text)
7151     return Qnil;
7152
7153   if (coding.type == coding_type_undecided)
7154     safe_chars = Qnil;
7155   else
7156     safe_chars = coding_safe_chars (coding_system);
7157
7158   if (STRINGP (string)
7159       || from >= GPT || to <= GPT)
7160     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7161   else
7162     {
7163       Lisp_Object args[2];
7164
7165       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7166       n -= XINT (Flength (args[0]));
7167       if (n <= 0)
7168         positions = args[0];
7169       else
7170         {
7171           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7172                                                pend, n);
7173           positions = Fappend (2, args);
7174         }
7175     }
7176
7177   return  (NILP (count) ? Fcar (positions) : positions);
7178 }
7179
7180
7181 Lisp_Object
7182 code_convert_region1 (start, end, coding_system, encodep)
7183      Lisp_Object start, end, coding_system;
7184      int encodep;
7185 {
7186   struct coding_system coding;
7187   int from, to;
7188
7189   CHECK_NUMBER_COERCE_MARKER (start);
7190   CHECK_NUMBER_COERCE_MARKER (end);
7191   CHECK_SYMBOL (coding_system);
7192
7193   validate_region (&start, &end);
7194   from = XFASTINT (start);
7195   to = XFASTINT (end);
7196
7197   if (NILP (coding_system))
7198     return make_number (to - from);
7199
7200   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7201     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7202
7203   coding.mode |= CODING_MODE_LAST_BLOCK;
7204   coding.src_multibyte = coding.dst_multibyte
7205     = !NILP (current_buffer->enable_multibyte_characters);
7206   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7207                        &coding, encodep, 1);
7208   Vlast_coding_system_used = coding.symbol;
7209   return make_number (coding.produced_char);
7210 }
7211
7212 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7213        3, 3, "r\nzCoding system: ",
7214        doc: /* Decode the current region from the specified coding system.
7215 When called from a program, takes three arguments:
7216 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7217 This function sets `last-coding-system-used' to the precise coding system
7218 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7219 not fully specified.)
7220 It returns the length of the decoded text.  */)
7221      (start, end, coding_system)
7222      Lisp_Object start, end, coding_system;
7223 {
7224   return code_convert_region1 (start, end, coding_system, 0);
7225 }
7226
7227 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7228        3, 3, "r\nzCoding system: ",
7229        doc: /* Encode the current region into the specified coding system.
7230 When called from a program, takes three arguments:
7231 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7232 This function sets `last-coding-system-used' to the precise coding system
7233 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7234 not fully specified.)
7235 It returns the length of the encoded text.  */)
7236      (start, end, coding_system)
7237      Lisp_Object start, end, coding_system;
7238 {
7239   return code_convert_region1 (start, end, coding_system, 1);
7240 }
7241
7242 Lisp_Object
7243 code_convert_string1 (string, coding_system, nocopy, encodep)
7244      Lisp_Object string, coding_system, nocopy;
7245      int encodep;
7246 {
7247   struct coding_system coding;
7248
7249   CHECK_STRING (string);
7250   CHECK_SYMBOL (coding_system);
7251
7252   if (NILP (coding_system))
7253     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7254
7255   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7256     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7257
7258   coding.mode |= CODING_MODE_LAST_BLOCK;
7259   string = (encodep
7260             ? encode_coding_string (string, &coding, !NILP (nocopy))
7261             : decode_coding_string (string, &coding, !NILP (nocopy)));
7262   Vlast_coding_system_used = coding.symbol;
7263
7264   return string;
7265 }
7266
7267 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7268        2, 3, 0,
7269        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7270 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7271 if the decoding operation is trivial.
7272 This function sets `last-coding-system-used' to the precise coding system
7273 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7274 not fully specified.)  */)
7275      (string, coding_system, nocopy)
7276      Lisp_Object string, coding_system, nocopy;
7277 {
7278   return code_convert_string1 (string, coding_system, nocopy, 0);
7279 }
7280
7281 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7282        2, 3, 0,
7283        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7284 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7285 if the encoding operation is trivial.
7286 This function sets `last-coding-system-used' to the precise coding system
7287 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7288 not fully specified.)  */)
7289      (string, coding_system, nocopy)
7290      Lisp_Object string, coding_system, nocopy;
7291 {
7292   return code_convert_string1 (string, coding_system, nocopy, 1);
7293 }
7294
7295 /* Encode or decode STRING according to CODING_SYSTEM.
7296    Do not set Vlast_coding_system_used.
7297
7298    This function is called only from macros DECODE_FILE and
7299    ENCODE_FILE, thus we ignore character composition.  */
7300
7301 Lisp_Object
7302 code_convert_string_norecord (string, coding_system, encodep)
7303      Lisp_Object string, coding_system;
7304      int encodep;
7305 {
7306   struct coding_system coding;
7307
7308   CHECK_STRING (string);
7309   CHECK_SYMBOL (coding_system);
7310
7311   if (NILP (coding_system))
7312     return string;
7313
7314   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7315     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7316
7317   coding.composing = COMPOSITION_DISABLED;
7318   coding.mode |= CODING_MODE_LAST_BLOCK;
7319   return (encodep
7320           ? encode_coding_string (string, &coding, 1)
7321           : decode_coding_string (string, &coding, 1));
7322 }
7323 \f
7324 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7325        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7326 Return the corresponding character.  */)
7327      (code)
7328      Lisp_Object code;
7329 {
7330   unsigned char c1, c2, s1, s2;
7331   Lisp_Object val;
7332
7333   CHECK_NUMBER (code);
7334   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7335   if (s1 == 0)
7336     {
7337       if (s2 < 0x80)
7338         XSETFASTINT (val, s2);
7339       else if (s2 >= 0xA0 || s2 <= 0xDF)
7340         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7341       else
7342         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7343     }
7344   else
7345     {
7346       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7347           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7348         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7349       DECODE_SJIS (s1, s2, c1, c2);
7350       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7351     }
7352   return val;
7353 }
7354
7355 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7356        doc: /* Encode a Japanese character CH to shift_jis encoding.
7357 Return the corresponding code in SJIS.  */)
7358      (ch)
7359      Lisp_Object ch;
7360 {
7361   int charset, c1, c2, s1, s2;
7362   Lisp_Object val;
7363
7364   CHECK_NUMBER (ch);
7365   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7366   if (charset == CHARSET_ASCII)
7367     {
7368       val = ch;
7369     }
7370   else if (charset == charset_jisx0208
7371            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7372     {
7373       ENCODE_SJIS (c1, c2, s1, s2);
7374       XSETFASTINT (val, (s1 << 8) | s2);
7375     }
7376   else if (charset == charset_katakana_jisx0201
7377            && c1 > 0x20 && c2 < 0xE0)
7378     {
7379       XSETFASTINT (val, c1 | 0x80);
7380     }
7381   else
7382     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7383   return val;
7384 }
7385
7386 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7387        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7388 Return the corresponding character.  */)
7389      (code)
7390      Lisp_Object code;
7391 {
7392   int charset;
7393   unsigned char b1, b2, c1, c2;
7394   Lisp_Object val;
7395
7396   CHECK_NUMBER (code);
7397   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7398   if (b1 == 0)
7399     {
7400       if (b2 >= 0x80)
7401         error ("Invalid BIG5 code: %x", XFASTINT (code));
7402       val = code;
7403     }
7404   else
7405     {
7406       if ((b1 < 0xA1 || b1 > 0xFE)
7407           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7408         error ("Invalid BIG5 code: %x", XFASTINT (code));
7409       DECODE_BIG5 (b1, b2, charset, c1, c2);
7410       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7411     }
7412   return val;
7413 }
7414
7415 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7416        doc: /* Encode the Big5 character CH to BIG5 coding system.
7417 Return the corresponding character code in Big5.  */)
7418      (ch)
7419      Lisp_Object ch;
7420 {
7421   int charset, c1, c2, b1, b2;
7422   Lisp_Object val;
7423
7424   CHECK_NUMBER (ch);
7425   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7426   if (charset == CHARSET_ASCII)
7427     {
7428       val = ch;
7429     }
7430   else if ((charset == charset_big5_1
7431             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7432            || (charset == charset_big5_2
7433                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7434     {
7435       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7436       XSETFASTINT (val, (b1 << 8) | b2);
7437     }
7438   else
7439     error ("Can't encode to Big5: %d", XFASTINT (ch));
7440   return val;
7441 }
7442 \f
7443 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7444        Sset_terminal_coding_system_internal, 1, 2, 0,
7445        doc: /* Internal use only.  */)
7446      (coding_system, terminal)
7447      Lisp_Object coding_system;
7448      Lisp_Object terminal;
7449 {
7450   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
7451   CHECK_SYMBOL (coding_system);
7452   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
7453   /* We had better not send unsafe characters to terminal.  */
7454   terminal_coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7455   /* Character composition should be disabled.  */
7456   terminal_coding->composing = COMPOSITION_DISABLED;
7457   /* Error notification should be suppressed.  */
7458   terminal_coding->suppress_error = 1;
7459   terminal_coding->src_multibyte = 1;
7460   terminal_coding->dst_multibyte = 0;
7461   return Qnil;
7462 }
7463
7464 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7465        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7466        doc: /* Internal use only.  */)
7467      (coding_system)
7468      Lisp_Object coding_system;
7469 {
7470   CHECK_SYMBOL (coding_system);
7471   setup_coding_system (Fcheck_coding_system (coding_system),
7472                        &safe_terminal_coding);
7473   /* Character composition should be disabled.  */
7474   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7475   /* Error notification should be suppressed.  */
7476   safe_terminal_coding.suppress_error = 1;
7477   safe_terminal_coding.src_multibyte = 1;
7478   safe_terminal_coding.dst_multibyte = 0;
7479   return Qnil;
7480 }
7481
7482 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7483        Sterminal_coding_system, 0, 1, 0,
7484        doc: /* Return coding system specified for terminal output on the given terminal.
7485 TERMINAL may be a terminal id, a frame, or nil for the selected
7486 frame's terminal device.  */)
7487      (terminal)
7488      Lisp_Object terminal;
7489 {
7490   return TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1))->symbol;
7491 }
7492
7493 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7494        Sset_keyboard_coding_system_internal, 1, 2, 0,
7495        doc: /* Internal use only.  */)
7496      (coding_system, terminal)
7497      Lisp_Object coding_system;
7498      Lisp_Object terminal;
7499 {
7500   struct terminal *t = get_terminal (terminal, 1);
7501   CHECK_SYMBOL (coding_system);
7502
7503   setup_coding_system (Fcheck_coding_system (coding_system),
7504                        TERMINAL_KEYBOARD_CODING (t));
7505   /* Character composition should be disabled.  */
7506   TERMINAL_KEYBOARD_CODING (t)->composing = COMPOSITION_DISABLED;
7507   return Qnil;
7508 }
7509
7510 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7511        Skeyboard_coding_system, 0, 1, 0,
7512        doc: /* Return coding system for decoding keyboard input on TERMINAL.
7513 TERMINAL may be a terminal id, a frame, or nil for the selected
7514 frame's terminal device.  */)
7515      (terminal)
7516      Lisp_Object terminal;
7517 {
7518   return TERMINAL_KEYBOARD_CODING (get_terminal (terminal, 1))->symbol;
7519 }
7520
7521 \f
7522 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7523        Sfind_operation_coding_system,  1, MANY, 0,
7524        doc: /* Choose a coding system for an operation based on the target name.
7525 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7526 DECODING-SYSTEM is the coding system to use for decoding
7527 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7528 for encoding (in case OPERATION does encoding).
7529
7530 The first argument OPERATION specifies an I/O primitive:
7531   For file I/O, `insert-file-contents' or `write-region'.
7532   For process I/O, `call-process', `call-process-region', or `start-process'.
7533   For network I/O, `open-network-stream'.
7534
7535 The remaining arguments should be the same arguments that were passed
7536 to the primitive.  Depending on which primitive, one of those arguments
7537 is selected as the TARGET.  For example, if OPERATION does file I/O,
7538 whichever argument specifies the file name is TARGET.
7539
7540 TARGET has a meaning which depends on OPERATION:
7541   For file I/O, TARGET is a file name (except for the special case below).
7542   For process I/O, TARGET is a process name.
7543   For network I/O, TARGET is a service name or a port number
7544
7545 This function looks up what specified for TARGET in,
7546 `file-coding-system-alist', `process-coding-system-alist',
7547 or `network-coding-system-alist' depending on OPERATION.
7548 They may specify a coding system, a cons of coding systems,
7549 or a function symbol to call.
7550 In the last case, we call the function with one argument,
7551 which is a list of all the arguments given to this function.
7552 If the function can't decide a coding system, it can return
7553 `undecided' so that the normal code-detection is performed.
7554
7555 If OPERATION is `insert-file-contents', the argument corresponding to
7556 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
7557 file name to look up, and BUFFER is a buffer that contains the file's
7558 contents (not yet decoded).  If `file-coding-system-alist' specifies a
7559 function to call for FILENAME, that function should examine the
7560 contents of BUFFER instead of reading the file.
7561
7562 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
7563      (nargs, args)
7564      int nargs;
7565      Lisp_Object *args;
7566 {
7567   Lisp_Object operation, target_idx, target, val;
7568   register Lisp_Object chain;
7569
7570   if (nargs < 2)
7571     error ("Too few arguments");
7572   operation = args[0];
7573   if (!SYMBOLP (operation)
7574       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7575     error ("Invalid first argument");
7576   if (nargs < 1 + XINT (target_idx))
7577     error ("Too few arguments for operation: %s",
7578            SDATA (SYMBOL_NAME (operation)));
7579   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7580      argument to write-region) is string, it must be treated as a
7581      target file name.  */
7582   if (EQ (operation, Qwrite_region)
7583       && nargs > 5
7584       && STRINGP (args[5]))
7585     target_idx = make_number (4);
7586   target = args[XINT (target_idx) + 1];
7587   if (!(STRINGP (target)
7588         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7589             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7590         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7591     error ("Invalid argument %d", XINT (target_idx) + 1);
7592   if (CONSP (target))
7593     target = XCAR (target);
7594
7595   chain = ((EQ (operation, Qinsert_file_contents)
7596             || EQ (operation, Qwrite_region))
7597            ? Vfile_coding_system_alist
7598            : (EQ (operation, Qopen_network_stream)
7599               ? Vnetwork_coding_system_alist
7600               : Vprocess_coding_system_alist));
7601   if (NILP (chain))
7602     return Qnil;
7603
7604   for (; CONSP (chain); chain = XCDR (chain))
7605     {
7606       Lisp_Object elt;
7607       elt = XCAR (chain);
7608
7609       if (CONSP (elt)
7610           && ((STRINGP (target)
7611                && STRINGP (XCAR (elt))
7612                && fast_string_match (XCAR (elt), target) >= 0)
7613               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7614         {
7615           val = XCDR (elt);
7616           /* Here, if VAL is both a valid coding system and a valid
7617              function symbol, we return VAL as a coding system.  */
7618           if (CONSP (val))
7619             return val;
7620           if (! SYMBOLP (val))
7621             return Qnil;
7622           if (! NILP (Fcoding_system_p (val)))
7623             return Fcons (val, val);
7624           if (! NILP (Ffboundp (val)))
7625             {
7626               /* We use call1 rather than safe_call1
7627                  so as to get bug reports about functions called here
7628                  which don't handle the current interface.  */
7629               val = call1 (val, Flist (nargs, args));
7630               if (CONSP (val))
7631                 return val;
7632               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7633                 return Fcons (val, val);
7634             }
7635           return Qnil;
7636         }
7637     }
7638   return Qnil;
7639 }
7640
7641 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7642        Supdate_coding_systems_internal, 0, 0, 0,
7643        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7644 When values of any coding categories are changed, you must
7645 call this function.  */)
7646      ()
7647 {
7648   int idx;
7649
7650   /* Bring every cached entry in step with its category's binding.  */
7651   for (idx = CODING_CATEGORY_IDX_EMACS_MULE;
7652        idx < CODING_CATEGORY_IDX_MAX; idx++)
7653     {
7654       struct coding_system **slot = &coding_system_table[idx];
7655       Lisp_Object cs;
7656
7657       cs = find_symbol_value (XVECTOR (Vcoding_category_table)->contents[idx]);
7658       if (NILP (cs))
7659         {
7660           if (*slot)
7661             xfree (*slot);         /* Unbound: drop the stale entry.  */
7662           *slot = NULL;
7663           continue;
7664         }
7665       if (! *slot)                 /* Allocate lazily on first use.  */
7666         *slot = (struct coding_system *) xmalloc (sizeof **slot);
7667       setup_coding_system (cs, *slot);
7668     }
7669   return Qnil;
7670 }
7671
7672 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7673        Sset_coding_priority_internal, 0, 0, 0,
7674        doc: /* Update internal database for the current value of `coding-category-list'.
7675 This function is internal use only.  */)
7676      ()
7677 {
7678   int i = 0;
7679   Lisp_Object val, prop;
7680
7681   for (val = Vcoding_category_list;
7682        CONSP (val) && i < CODING_CATEGORY_IDX_MAX; val = XCDR (val))
7683     {
7684       if (! SYMBOLP (XCAR (val)))
7685         break;
7686       /* Fget yields nil when the symbol lacks the property, so accept
7687          only an integer in [0, CODING_CATEGORY_IDX_MAX) as shift count.  */
7688       prop = Fget (XCAR (val), Qcoding_category_index);
7689       if (! INTEGERP (prop) || XINT (prop) < 0
7690           || XINT (prop) >= CODING_CATEGORY_IDX_MAX)
7691         break;
7692       coding_priorities[i++] = (1 << XINT (prop));
7693     }
7694   /* If coding-category-list is valid and contains all coding
7695      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7696      the following code saves Emacs from crashing.  */
7697   while (i < CODING_CATEGORY_IDX_MAX)
7698     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7699   return Qnil;
7700 }
7701
7702 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7703        Sdefine_coding_system_internal, 1, 1, 0,
7704        doc: /* Register CODING-SYSTEM as a base coding system.
7705 This function is internal use only.  */)
7706      (coding_system)
7707      Lisp_Object coding_system;
7708 {
7709   Lisp_Object chars, entry, listed, alist;
7710
7711   if (NILP (Fcheck_coding_system (coding_system)))
7712     xsignal1 (Qcoding_system_error, coding_system);
7713
7714   chars = coding_safe_chars (coding_system);
7715   if (EQ (chars, Qt))
7716     {
7717       /* A safe-chars value of t: list the system once in the car.  */
7718       listed = XCAR (Vcoding_system_safe_chars);
7719       if (NILP (Fmemq (coding_system, listed)))
7720         XSETCAR (Vcoding_system_safe_chars, Fcons (coding_system, listed));
7721       return Qnil;
7722     }
7723   if (! CHAR_TABLE_P (chars))
7724     error ("No valid safe-chars property for %s",
7725            SDATA (SYMBOL_NAME (coding_system)));
7726   /* A char-table: the cdr is an alist of (CODING-SYSTEM . SAFE-CHARS).
7727      Overwrite the pair already present, or append a new one.  */
7728   alist = XCDR (Vcoding_system_safe_chars);
7729   entry = Fassq (coding_system, alist);
7730   if (! NILP (entry))
7731     XSETCDR (entry, chars);
7732   else
7733     XSETCDR (Vcoding_system_safe_chars,
7734              nconc2 (alist, Fcons (Fcons (coding_system, chars), Qnil)));
7735   return Qnil;
7736 }
7737
7738 #endif /* emacs */
7739
7740 \f
7741 /*** 9. Post-amble ***/
7742
7743 void
7744 init_coding_once ()
7745 {
7746   int byte;
7747
7748   /* Emacs' internal format: give bytes 0x00..0xFE a class by range
7749      (0xFF is not assigned here), then patch the special bytes.  */
7750   for (byte = 0; byte < 0xFF; byte++)
7751     emacs_code_class[byte] = ((byte <= 0x20 || byte == 0x7F)
7752                               ? EMACS_control_code
7753                               : (byte < 0x7F ? EMACS_ascii_code
7754                                  : EMACS_invalid_code));
7755   emacs_code_class[0x0A] = EMACS_linefeed_code;
7756   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7757   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7758   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7759   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7760   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7761
7762   /* ISO2022: give every byte a class by range, then patch the bytes
7763      that have a class of their own.  */
7764   for (byte = 0; byte <= 0xFF; byte++)
7765     iso_code_class[byte] = ((byte == 0x20 || byte == 0x7F) ? ISO_0x20_or_0x7F
7766                             : (byte == 0xA0 || byte == 0xFF) ? ISO_0xA0_or_0xFF
7767                             : byte < 0x20 ? ISO_control_0
7768                             : byte < 0x7F ? ISO_graphic_plane_0
7769                             : byte < 0xA0 ? ISO_control_1
7770                             : ISO_graphic_plane_1);
7771   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7772   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7773   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7774   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7775   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7776   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7777   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7778   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7779
7780   /* Both of these coding systems are set up from Qnil.  */
7781   setup_coding_system (Qnil, &safe_terminal_coding);
7782   setup_coding_system (Qnil, &default_buffer_file_coding);
7783
7784   bzero (coding_system_table, sizeof coding_system_table);
7785
7786   /* Flag only the first 128 entries of ascii_skip_code.  */
7787   bzero (ascii_skip_code, sizeof ascii_skip_code);
7788   for (byte = 0; byte < 128; byte++)
7789     ascii_skip_code[byte] = 1;
7790
7791   /* CRLF on MSDOS and WINDOWSNT builds, LF everywhere else.  */
7792 #if defined (MSDOS) || defined (WINDOWSNT)
7793   system_eol_type = CODING_EOL_CRLF;
7794 #else
7795   system_eol_type = CODING_EOL_LF;
7796 #endif
7797
7798   inhibit_pre_post_conversion = 0;
7799 }
7800
7801 #ifdef emacs
7802
7803 void
7804 syms_of_coding ()
7805 {
       /* Lisp-level setup for this file: intern the symbols the C code
          refers to and register them with staticpro, attach the
          `target-idx', error and char-table properties, register the
          primitives with defsubr, and define the Lisp variables together
          with their initial values.  */
7806   staticpro (&Vcode_conversion_workbuf_name);
7807   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7808
7809   Qtarget_idx = intern ("target-idx");
7810   staticpro (&Qtarget_idx);
7811
7812   Qcoding_system_history = intern ("coding-system-history");
7813   staticpro (&Qcoding_system_history);
7814   Fset (Qcoding_system_history, Qnil);
7815
       /* `target-idx' is the zero-based position, within an operation's
          argument list, of the argument used as the lookup target.  */
7816   /* Target FILENAME is the first argument.  */
7817   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7818   /* Target FILENAME is the third argument.  */
7819   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7820
7821   Qcall_process = intern ("call-process");
7822   staticpro (&Qcall_process);
7823   /* Target PROGRAM is the first argument.  */
7824   Fput (Qcall_process, Qtarget_idx, make_number (0));
7825
7826   Qcall_process_region = intern ("call-process-region");
7827   staticpro (&Qcall_process_region);
7828   /* Target PROGRAM is the third argument.  */
7829   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7830
7831   Qstart_process = intern ("start-process");
7832   staticpro (&Qstart_process);
7833   /* Target PROGRAM is the third argument.  */
7834   Fput (Qstart_process, Qtarget_idx, make_number (2));
7835
7836   Qopen_network_stream = intern ("open-network-stream");
7837   staticpro (&Qopen_network_stream);
7838   /* Target SERVICE is the fourth argument.  */
7839   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7840
7841   Qcoding_system = intern ("coding-system");
7842   staticpro (&Qcoding_system);
7843
7844   Qeol_type = intern ("eol-type");
7845   staticpro (&Qeol_type);
7846
7847   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7848   staticpro (&Qbuffer_file_coding_system);
7849
7850   Qpost_read_conversion = intern ("post-read-conversion");
7851   staticpro (&Qpost_read_conversion);
7852
7853   Qpre_write_conversion = intern ("pre-write-conversion");
7854   staticpro (&Qpre_write_conversion);
7855
7856   Qno_conversion = intern ("no-conversion");
7857   staticpro (&Qno_conversion);
7858
7859   Qundecided = intern ("undecided");
7860   staticpro (&Qundecided);
7861
7862   Qcoding_system_p = intern ("coding-system-p");
7863   staticpro (&Qcoding_system_p);
7864
7865   Qcoding_system_error = intern ("coding-system-error");
7866   staticpro (&Qcoding_system_error);
7867
       /* Make `coding-system-error' an error condition (a child of
          `error') and give it its message.  */
7868   Fput (Qcoding_system_error, Qerror_conditions,
7869         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7870   Fput (Qcoding_system_error, Qerror_message,
7871         build_string ("Invalid coding system"));
7872
7873   Qcoding_category = intern ("coding-category");
7874   staticpro (&Qcoding_category);
7875   Qcoding_category_index = intern ("coding-category-index");
7876   staticpro (&Qcoding_category_index);
7877
       /* Vcoding_category_table maps a category index to its symbol; each
          symbol also records its own index in `coding-category-index'.  */
7878   Vcoding_category_table
7879     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7880   staticpro (&Vcoding_category_table);
7881   {
7882     int i;
7883     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7884       {
7885         XVECTOR (Vcoding_category_table)->contents[i]
7886           = intern (coding_category_name[i]);
7887         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7888               Qcoding_category_index, make_number (i));
7889       }
7890   }
7891
       /* Car: list of coding systems; cdr: alist keyed by coding system.
          Both halves are filled in by Fdefine_coding_system_internal.  */
7892   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7893   staticpro (&Vcoding_system_safe_chars);
7894
7895   Qtranslation_table = intern ("translation-table");
7896   staticpro (&Qtranslation_table);
       /* NOTE(review): Qchar_table_extra_slots is used here although this
          function only assigns it further down -- presumably it has
          already been set up elsewhere by this point; confirm.  */
7897   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7898
7899   Qtranslation_table_id = intern ("translation-table-id");
7900   staticpro (&Qtranslation_table_id);
7901
7902   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7903   staticpro (&Qtranslation_table_for_decode);
7904
7905   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7906   staticpro (&Qtranslation_table_for_encode);
7907
7908   Qsafe_chars = intern ("safe-chars");
7909   staticpro (&Qsafe_chars);
7910
7911   Qchar_coding_system = intern ("char-coding-system");
7912   staticpro (&Qchar_coding_system);
7913
7914   /* Intern this now in case it isn't already done.
7915      Setting this variable twice is harmless.
7916      But don't staticpro it here--that is done in alloc.c.  */
7917   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7918   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7919   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7920
7921   Qvalid_codes = intern ("valid-codes");
7922   staticpro (&Qvalid_codes);
7923
7924   Qascii_incompatible = intern ("ascii-incompatible");
7925   staticpro (&Qascii_incompatible);
7926
7927   Qemacs_mule = intern ("emacs-mule");
7928   staticpro (&Qemacs_mule);
7929
7930   Qraw_text = intern ("raw-text");
7931   staticpro (&Qraw_text);
7932
7933   Qutf_8 = intern ("utf-8");
7934   staticpro (&Qutf_8);
7935
7936   Qcoding_system_define_form = intern ("coding-system-define-form");
7937   staticpro (&Qcoding_system_define_form);
7938
       /* Register the Lisp primitives.  */
7939   defsubr (&Scoding_system_p);
7940   defsubr (&Sread_coding_system);
7941   defsubr (&Sread_non_nil_coding_system);
7942   defsubr (&Scheck_coding_system);
7943   defsubr (&Sdetect_coding_region);
7944   defsubr (&Sdetect_coding_string);
7945   defsubr (&Sfind_coding_systems_region_internal);
7946   defsubr (&Sunencodable_char_position);
7947   defsubr (&Sdecode_coding_region);
7948   defsubr (&Sencode_coding_region);
7949   defsubr (&Sdecode_coding_string);
7950   defsubr (&Sencode_coding_string);
7951   defsubr (&Sdecode_sjis_char);
7952   defsubr (&Sencode_sjis_char);
7953   defsubr (&Sdecode_big5_char);
7954   defsubr (&Sencode_big5_char);
7955   defsubr (&Sset_terminal_coding_system_internal);
7956   defsubr (&Sset_safe_terminal_coding_system_internal);
7957   defsubr (&Sterminal_coding_system);
7958   defsubr (&Sset_keyboard_coding_system_internal);
7959   defsubr (&Skeyboard_coding_system);
7960   defsubr (&Sfind_operation_coding_system);
7961   defsubr (&Supdate_coding_systems_internal);
7962   defsubr (&Sset_coding_priority_internal);
7963   defsubr (&Sdefine_coding_system_internal);
7964
       /* Lisp variables, each followed by its initial value.  */
7965   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7966                doc: /* List of coding systems.
7967
7968 Do not alter the value of this variable manually.  This variable should be
7969 updated by the functions `make-coding-system' and
7970 `define-coding-system-alias'.  */);
7971   Vcoding_system_list = Qnil;
7972
7973   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7974                doc: /* Alist of coding system names.
7975 Each element is one element list of coding system name.
7976 This variable is given to `completing-read' as TABLE argument.
7977
7978 Do not alter the value of this variable manually.  This variable should be
7979 updated by the functions `make-coding-system' and
7980 `define-coding-system-alias'.  */);
7981   Vcoding_system_alist = Qnil;
7982
7983   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7984                doc: /* List of coding-categories (symbols) ordered by priority.
7985
7986 On detecting a coding system, Emacs tries code detection algorithms
7987 associated with each coding-category one by one in this order.  When
7988 one algorithm agrees with a byte sequence of source text, the coding
7989 system bound to the corresponding coding-category is selected.
7990
7991 Don't modify this variable directly, but use `set-coding-priority'.  */);
7992   {
7993     int i;
7994
         /* Consing from the highest index down yields a list in
            ascending category-index order.  */
7995     Vcoding_category_list = Qnil;
7996     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7997       Vcoding_category_list
7998         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7999                  Vcoding_category_list);
8000   }
8001
8002   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8003                doc: /* Specify the coding system for read operations.
8004 It is useful to bind this variable with `let', but do not set it globally.
8005 If the value is a coding system, it is used for decoding on read operation.
8006 If not, an appropriate element is used from one of the coding system alists:
8007 There are three such tables, `file-coding-system-alist',
8008 `process-coding-system-alist', and `network-coding-system-alist'.  */);
8009   Vcoding_system_for_read = Qnil;
8010
8011   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8012                doc: /* Specify the coding system for write operations.
8013 Programs bind this variable with `let', but you should not set it globally.
8014 If the value is a coding system, it is used for encoding of output,
8015 when writing it to a file and when sending it to a file or subprocess.
8016
8017 If this does not specify a coding system, an appropriate element
8018 is used from one of the coding system alists:
8019 There are three such tables, `file-coding-system-alist',
8020 `process-coding-system-alist', and `network-coding-system-alist'.
8021 For output to files, if the above procedure does not specify a coding system,
8022 the value of `buffer-file-coding-system' is used.  */);
8023   Vcoding_system_for_write = Qnil;
8024
8025   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8026                doc: /* Coding system used in the latest file or process I/O.
8027 Also set by `encode-coding-region', `decode-coding-region',
8028 `encode-coding-string' and `decode-coding-string'.  */);
8029   Vlast_coding_system_used = Qnil;
8030
8031   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8032                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
8033 See info node `Coding Systems' and info node `Text and Binary' concerning
8034 such conversion.  */);
8035   inhibit_eol_conversion = 0;
8036
8037   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8038                doc: /* Non-nil means process buffer inherits coding system of process output.
8039 Bind it to t if the process output is to be treated as if it were a file
8040 read from some filesystem.  */);
8041   inherit_process_coding_system = 0;
8042
8043   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8044                doc: /* Alist to decide a coding system to use for a file I/O operation.
8045 The format is ((PATTERN . VAL) ...),
8046 where PATTERN is a regular expression matching a file name,
8047 VAL is a coding system, a cons of coding systems, or a function symbol.
8048 If VAL is a coding system, it is used for both decoding and encoding
8049 the file contents.
8050 If VAL is a cons of coding systems, the car part is used for decoding,
8051 and the cdr part is used for encoding.
8052 If VAL is a function symbol, the function must return a coding system
8053 or a cons of coding systems which are used as above.  The function is
8054 called with an argument that is a list of the arguments with which
8055 `find-operation-coding-system' was called.  If the function can't decide
8056 a coding system, it can return `undecided' so that the normal
8057 code-detection is performed.
8058
8059 See also the function `find-operation-coding-system'
8060 and the variable `auto-coding-alist'.  */);
8061   Vfile_coding_system_alist = Qnil;
8062
8063   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8064     doc: /* Alist to decide a coding system to use for a process I/O operation.
8065 The format is ((PATTERN . VAL) ...),
8066 where PATTERN is a regular expression matching a program name,
8067 VAL is a coding system, a cons of coding systems, or a function symbol.
8068 If VAL is a coding system, it is used for both decoding what received
8069 from the program and encoding what sent to the program.
8070 If VAL is a cons of coding systems, the car part is used for decoding,
8071 and the cdr part is used for encoding.
8072 If VAL is a function symbol, the function must return a coding system
8073 or a cons of coding systems which are used as above.
8074
8075 See also the function `find-operation-coding-system'.  */);
8076   Vprocess_coding_system_alist = Qnil;
8077
8078   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8079     doc: /* Alist to decide a coding system to use for a network I/O operation.
8080 The format is ((PATTERN . VAL) ...),
8081 where PATTERN is a regular expression matching a network service name
8082 or is a port number to connect to,
8083 VAL is a coding system, a cons of coding systems, or a function symbol.
8084 If VAL is a coding system, it is used for both decoding what received
8085 from the network stream and encoding what sent to the network stream.
8086 If VAL is a cons of coding systems, the car part is used for decoding,
8087 and the cdr part is used for encoding.
8088 If VAL is a function symbol, the function must return a coding system
8089 or a cons of coding systems which are used as above.
8090
8091 See also the function `find-operation-coding-system'.  */);
8092   Vnetwork_coding_system_alist = Qnil;
8093
8094   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8095                doc: /* Coding system to use with system messages.
8096 Also used for decoding keyboard input on X Window system.  */);
8097   Vlocale_coding_system = Qnil;
8098
8099   /* The eol mnemonics are reset in startup.el system-dependently.  */
8100   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8101                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8102   eol_mnemonic_unix = build_string (":");
8103
8104   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8105                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8106   eol_mnemonic_dos = build_string ("\\");
8107
8108   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8109                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8110   eol_mnemonic_mac = build_string ("/");
8111
8112   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8113                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
8114   eol_mnemonic_undecided = build_string (":");
8115
8116   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8117                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
8118   Venable_character_translation = Qt;
8119
8120   DEFVAR_LISP ("standard-translation-table-for-decode",
8121                &Vstandard_translation_table_for_decode,
8122                doc: /* Table for translating characters while decoding.  */);
8123   Vstandard_translation_table_for_decode = Qnil;
8124
8125   DEFVAR_LISP ("standard-translation-table-for-encode",
8126                &Vstandard_translation_table_for_encode,
8127                doc: /* Table for translating characters while encoding.  */);
8128   Vstandard_translation_table_for_encode = Qnil;
8129
       /* NOTE(review): the Lisp name says "table" while the C variable and
          the doc string say "alist".  */
8130   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8131                doc: /* Alist of charsets vs revision numbers.
8132 While encoding, if a charset (car part of an element) is found,
8133 designate it with the escape sequence identifying revision (cdr part of the element).  */);
8134   Vcharset_revision_alist = Qnil;
8135
8136   DEFVAR_LISP ("default-process-coding-system",
8137                &Vdefault_process_coding_system,
8138                doc: /* Cons of coding systems used for process I/O by default.
8139 The car part is used for decoding a process output,
8140 the cdr part is used for encoding a text to be sent to a process.  */);
8141   Vdefault_process_coding_system = Qnil;
8142
8143   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8144                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8145 This is a vector of length 256.
8146 If Nth element is non-nil, the existence of code N in a file
8147 \(or output of subprocess) doesn't prevent it to be detected as
8148 a coding system of ISO 2022 variant which has a flag
8149 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8150 or reading output of a subprocess.
8151 Only 128th through 159th elements has a meaning.  */);
8152   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8153
8154   DEFVAR_LISP ("select-safe-coding-system-function",
8155                &Vselect_safe_coding_system_function,
8156                doc: /* Function to call to select safe coding system for encoding a text.
8157
8158 If set, this function is called to force a user to select a proper
8159 coding system which can encode the text in the case that a default
8160 coding system used in each operation can't encode the text.  The
8161 function should take care that the buffer is not modified while
8162 the coding system is being selected.
8163
8164 The default value is `select-safe-coding-system' (which see).  */);
       /* Starts out nil here; the default named in the doc string is
          presumably installed elsewhere -- confirm.  */
8165   Vselect_safe_coding_system_function = Qnil;
8166
8167   DEFVAR_BOOL ("coding-system-require-warning",
8168                &coding_system_require_warning,
8169                doc: /* Internal use only.
8170 If non-nil, on writing a file, `select-safe-coding-system-function' is
8171 called even if `coding-system-for-write' is non-nil.  The command
8172 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8173   coding_system_require_warning = 0;
8174
8175
8176   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8177                &inhibit_iso_escape_detection,
8178                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8179
8180 By default, on reading a file, Emacs tries to detect how the text is
8181 encoded.  This code detection is sensitive to escape sequences.  If
8182 the sequence is valid as ISO2022, the code is determined as one of
8183 the ISO2022 encodings, and the file is decoded by the corresponding
8184 coding system (e.g. `iso-2022-7bit').
8185
8186 However, there may be a case that you want to read escape sequences in
8187 a file as is.  In such a case, you can set this variable to non-nil.
8188 Then, as the code detection ignores any escape sequences, no file is
8189 detected as encoded in some ISO2022 encoding.  The result is that all
8190 escape sequences become visible in a buffer.
8191
8192 The default value is nil, and it is strongly recommended not to change
8193 it.  That is because many Emacs Lisp source files that contain
8194 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8195 in Emacs's distribution, and they won't be decoded correctly on
8196 reading if you suppress escape sequence detection.
8197
8198 The other way to read escape sequences in a file without decoding is
8199 to explicitly specify some coding system that doesn't use ISO2022's
8200 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8201   inhibit_iso_escape_detection = 0;
8202
8203   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8204                doc: /* Char table for translating self-inserting characters.
8205 This is applied to the result of input methods, not their input.  See also
8206 `keyboard-translate-table'.  */);
8207     Vtranslation_table_for_input = Qnil;
8208 }
8209
8210 /* Return the system message for ERROR_NUMBER, converted with
8211    `locale-coding-system' when that variable is non-nil.  */
8212 char *
8213 emacs_strerror (error_number)
8214      int error_number;
8215 {
8216   char *message;
8217   Lisp_Object decoded;
8218
8219   synchronize_system_messages_locale ();
8220   message = strerror (error_number);
8221   if (NILP (Vlocale_coding_system))
8222     return message;
8223   /* NOTE(review): the result below points into a Lisp string's data;
8224      confirm callers are done with it before a GC can reclaim it.  */
8225   decoded = code_convert_string_norecord (build_string (message),
8226                                           Vlocale_coding_system, 0);
8227   return (char *) SDATA (decoded);
8228 }
8229
8230 #endif /* emacs */
8231
8232 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8233    (do not change this comment) */