src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 3, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  24 Boston, MA 02110-1301, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-mule) handlers
  31   3. ISO2022 handlers
  32   4. Shift-JIS and BIG5 handlers
  33   5. CCL handlers
  34   6. End-of-line handlers
  35   7. C library functions
  36   8. Emacs Lisp library functions
  37   9. Post-amble
  38
  39 */
  40
  41 /*** 0. General comments ***/
  42
  43
  44 /*** GENERAL NOTE on CODING SYSTEMS ***
  45
  46   A coding system is an encoding mechanism for one or more character
  47   sets.  Here's a list of coding systems which Emacs can handle.  When
  48   we say "decode", it means converting some other coding system to
  49   Emacs' internal format (emacs-mule), and when we say "encode",
  50   it means converting the coding system emacs-mule to some other
  51   coding system.
  52
  53   0. Emacs' internal format (emacs-mule)
  54
  55   Emacs itself holds a multi-lingual character in buffers and strings
  56   in a special format.  Details are described in section 2.
  57
  58   1. ISO2022
  59
  60   The most famous coding system for multiple character sets.  X's
  61   Compound Text, various EUCs (Extended Unix Code), and coding
  62   systems used in Internet communication such as ISO-2022-JP are
  63   all variants of ISO2022.  Details are described in section 3.
  64
  65   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  66
  67   A coding system to encode character sets: ASCII, JISX0201, and
  68   JISX0208.  Widely used for PC's in Japan.  Details are described in
  69   section 4.
  70
  71   3. BIG5
  72
  73   A coding system to encode the character sets ASCII and Big5.  Widely
  74   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  75   described in section 4.  In this file, when we write "BIG5"
  76   (all uppercase), we mean the coding system, and when we write
  77   "Big5" (capitalized), we mean the character set.
  78
  79   4. Raw text
  80
  81   A coding system for text containing random 8-bit code.  Emacs does
  82   no code conversion on such text except for end-of-line format.
  83
  84   5. Other
  85
  86   If a user wants to read/write text encoded in a coding system not
  87   listed above, he can supply a decoder and an encoder for it as CCL
  88   (Code Conversion Language) programs.  Emacs executes the CCL program
  89   while reading/writing.
  90
  91   Emacs represents a coding system by a Lisp symbol that has a property
  92   `coding-system'.  But, before actually using the coding system, the
  93   information about it is set in a structure of type `struct
  94   coding_system' for rapid processing.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  99
 100   How end-of-line of text is encoded depends on the operating system.
 101   For instance, Unix's format is just one byte of `line-feed' code,
 102   whereas DOS's format is two-byte sequence of `carriage-return' and
 103   `line-feed' codes.  MacOS's format is usually one byte of
 104   `carriage-return'.
 105
 106   Since text character encoding and end-of-line encoding are
 107   independent, any coding system described above can have any
 108   end-of-line format.  So Emacs has information about end-of-line
 109   format in each coding-system.  See section 6 for more details.
 110
 111 */
 112
 113 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 114
 115   These functions check if a text between SRC and SRC_END is encoded
 116   in the coding system category XXX.  Each returns an integer value in
 117   which appropriate flag bits for the category XXX are set.  The flag
 118   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 119   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 120   of the range 0x80..0x9F are in multibyte form.  */
 121 #if 0
 122 int
 123 detect_coding_emacs_mule (src, src_end, multibytep)
 124      unsigned char *src, *src_end;
 125      int multibytep;
 126 {
 127   ...
 128 }
 129 #endif
 130
 131 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 132
 133   These functions decode SRC_BYTES length of unibyte text at SOURCE
 134   encoded in CODING to Emacs' internal format.  The resulting
 135   multibyte text goes to a place pointed to by DESTINATION, the length
 136   of which should not exceed DST_BYTES.
 137
 138   These functions set the information about original and decoded texts
 139   in the members `produced', `produced_char', `consumed', and
 140   `consumed_char' of the structure *CODING.  They also set the member
 141   `result' to one of CODING_FINISH_XXX indicating how the decoding
 142   finished.
 143
 144   DST_BYTES zero means that the source area and destination area are
 145   overlapped, which means that we can produce a decoded text until it
 146   reaches the head of the not-yet-decoded source text.
 147
 148   Below is a template for these functions.  */
 149 #if 0
 150 static void
 151 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 152      struct coding_system *coding;
 153      const unsigned char *source;
 154      unsigned char *destination;
 155      int src_bytes, dst_bytes;
 156 {
 157   ...
 158 }
 159 #endif
 160
 161 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 162
 163   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 164   internal multibyte format to CODING.  The resulting unibyte text
 165   goes to a place pointed to by DESTINATION, the length of which
 166   should not exceed DST_BYTES.
 167
 168   These functions set the information about original and encoded texts
 169   in the members `produced', `produced_char', `consumed', and
 170   `consumed_char' of the structure *CODING.  They also set the member
 171   `result' to one of CODING_FINISH_XXX indicating how the encoding
 172   finished.
 173
 174   DST_BYTES zero means that the source area and destination area are
 175   overlapped, which means that we can produce encoded text until it
  176   reaches the head of the not-yet-encoded source text.
 177
 178   Below is a template for these functions.  */
 179 #if 0
 180 static void
 181 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 182      struct coding_system *coding;
 183      unsigned char *source, *destination;
 184      int src_bytes, dst_bytes;
 185 {
 186   ...
 187 }
 188 #endif
 189
 190 /*** COMMONLY USED MACROS ***/
 191
  192 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
  193    get one and two bytes from the source text respectively, storing
  194    them in their arguments and advancing `src'.  If there are not
  195    enough bytes in the source, they consume nothing, set
  196    coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump to
  197    `label_end_of_loop'.  The caller should set variables `coding',
  198    `src' and `src_end' to appropriate pointers in advance, and must
         provide the label `label_end_of_loop'.  These macros are called
         from decoding routines `decode_coding_XXX', thus it is assumed
         that the source text is unibyte.  */
  199
  200 #define ONE_MORE_BYTE(c1)                                       \
  201   do {                                                          \
  202     if (src >= src_end)                                         \
  203       {                                                         \
  204         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  205         goto label_end_of_loop;                                 \
  206       }                                                         \
  207     c1 = *src++;                                                \
  208   } while (0)
 209
      /* Fetch two bytes at once: C1 receives the byte at `src' and C2 the
         one after it, and `src' is advanced by two.  If fewer than two
         bytes remain (src + 1 >= src_end) nothing is consumed;
         coding->result is set to CODING_FINISH_INSUFFICIENT_SRC and
         control jumps to the caller's `label_end_of_loop'.  */
  210 #define TWO_MORE_BYTES(c1, c2)                                  \
  211   do {                                                          \
  212     if (src + 1 >= src_end)                                     \
  213       {                                                         \
  214         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  215         goto label_end_of_loop;                                 \
  216       }                                                         \
  217     c1 = *src++;                                                \
  218     c2 = *src++;                                                \
  219   } while (0)
 220
 221
 222 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 223    form if MULTIBYTEP is nonzero.  In addition, if SRC is not less
 224    than SRC_END, return with RET.  */
 225
 226 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)      \
 227   do {                                                          \
 228     if (src >= src_end)                                         \
 229       {                                                         \
 230         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 231         return ret;                                             \
 232       }                                                         \
 233     c1 = *src++;                                                \
 234     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 235       c1 = *src++ - 0x20;                                       \
 236   } while (0)
 237
  238 /* Set C to the next character at the source text pointed by `src'
  239    and advance `src' past it.  If no bytes are left in the source,
  240    set coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump to
  241    `label_end_of_loop'.  The caller should set variables `coding',
  242    `src', `src_end', and `translation_table' to appropriate values
  243    in advance.  This macro is used in encoding routines
  244    `encode_coding_XXX', thus it assumes that the source text is in
  245    multibyte form except for 8-bit characters.  8-bit characters are
  246    in multibyte form if coding->src_multibyte is nonzero, else they
         are represented by a single byte.  If `translation_table' is
         non-nil, C is passed through translate_char before `src' is
         advanced.  The macro declares block-local variables `len' and
         `bytes', so the argument C must not be an expression that refers
         to caller variables with those names.  */
  247
  248 #define ONE_MORE_CHAR(c)                                        \
  249   do {                                                          \
  250     int len = src_end - src;                                    \
  251     int bytes;                                                  \
  252     if (len <= 0)                                               \
  253       {                                                         \
  254         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  255         goto label_end_of_loop;                                 \
  256       }                                                         \
  257     if (coding->src_multibyte                                   \
  258         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
  259       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
  260     else                                                        \
  261       c = *src, bytes = 1;                                      \
  262     if (!NILP (translation_table))                              \
  263       c = translate_char (translation_table, c, -1, 0, 0);      \
  264     src += bytes;                                               \
  265   } while (0)
 266
 267
  268 /* Produce a multibyte form of character C to `dst'.  If there's not
  269    enough space at `dst', set coding->result to
  270    CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.
  271    The limit is `dst_end' when `dst_bytes' is nonzero, else `src'
  272    (i.e. the not-yet-decoded source when the two areas overlap).
  273
  274    C is written to `dst' (and coding->produced_char is incremented)
  275    when we are not composing, or when the composition method is
  276    COMPOSITION_RELATIVE or COMPOSITION_WITH_RULE.  If we are now in
         the middle of a composition sequence whose method is not
         COMPOSITION_RELATIVE, C is also recorded as a component in
         coding->cmp_data->data, and coding->composition_rule_follows is
         set to nonzero unless the method is COMPOSITION_WITH_ALTCHARS.
         For the remaining methods (e.g. COMPOSITION_WITH_ALTCHARS, where
         C is an ALTCHAR) the character goes only to
         coding->cmp_data->data instead of `dst'.

         The caller should set variables `coding', `src', `dst',
         `dst_end' and `dst_bytes' in advance.  The macro declares a
         block-local variable `bytes'.

         This macro is used in decoding routines.  */
  277
  278 #define EMIT_CHAR(c)                                                    \
  279   do {                                                                  \
  280     if (! COMPOSING_P (coding)                                          \
  281         || coding->composing == COMPOSITION_RELATIVE                    \
  282         || coding->composing == COMPOSITION_WITH_RULE)                  \
  283       {                                                                 \
  284         int bytes = CHAR_BYTES (c);                                     \
  285         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
  286           {                                                             \
  287             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
  288             goto label_end_of_loop;                                     \
  289           }                                                             \
  290         dst += CHAR_STRING (c, dst);                                    \
  291         coding->produced_char++;                                        \
  292       }                                                                 \
  293                                                                         \
  294     if (COMPOSING_P (coding)                                            \
  295         && coding->composing != COMPOSITION_RELATIVE)                   \
  296       {                                                                 \
  297         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
  298         coding->composition_rule_follows                                \
  299           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
  300       }                                                                 \
  301   } while (0)
 302
 303
      /* Store the single byte C at `dst' and advance `dst' by one.  If
         `dst' has already reached the limit (`dst_end' when `dst_bytes'
         is nonzero, else `src'), store nothing, set coding->result to
         CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.  */
  304 #define EMIT_ONE_BYTE(c)                                        \
  305   do {                                                          \
  306     if (dst >= (dst_bytes ? dst_end : src))                     \
  307       {                                                         \
  308         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  309         goto label_end_of_loop;                                 \
  310       }                                                         \
  311     *dst++ = c;                                                 \
  312   } while (0)
 313
      /* Store the bytes C1 and C2, in that order, at `dst' and advance
         `dst' by two.  If fewer than two bytes of room remain before the
         limit (`dst_end' when `dst_bytes' is nonzero, else `src'), store
         nothing, set coding->result to CODING_FINISH_INSUFFICIENT_DST and
         jump to `label_end_of_loop'.  */
  314 #define EMIT_TWO_BYTES(c1, c2)                                  \
  315   do {                                                          \
  316     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
  317       {                                                         \
  318         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  319         goto label_end_of_loop;                                 \
  320       }                                                         \
  321     *dst++ = c1, *dst++ = c2;                                   \
  322   } while (0)
 323
      /* Copy the bytes from FROM up to (not including) TO to `dst',
         advancing both `dst' and FROM; FROM is modified by the macro, so
         it must be a modifiable pointer variable.  If the whole range
         does not fit before the limit (`dst_end' when `dst_bytes' is
         nonzero, else `src'), copy nothing, set coding->result to
         CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.
         NOTE(review): FROM and TO are not parenthesized in the
         expansion, so pass plain identifiers rather than expressions.  */
  324 #define EMIT_BYTES(from, to)                                    \
  325   do {                                                          \
  326     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
  327       {                                                         \
  328         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  329         goto label_end_of_loop;                                 \
  330       }                                                         \
  331     while (from < to)                                           \
  332       *dst++ = *from++;                                         \
  333   } while (0)
 334
 335 \f
 336 /*** 1. Preamble ***/
 337
 338 #ifdef emacs
 339 #include <config.h>
 340 #endif
 341
 342 #include <stdio.h>
 343
 344 #ifdef emacs
 345
 346 #include "lisp.h"
 347 #include "buffer.h"
 348 #include "charset.h"
 349 #include "composite.h"
 350 #include "ccl.h"
 351 #include "coding.h"
 352 #include "window.h"
 353 #include "intervals.h"
 354
 355 #else  /* not emacs */
 356
 357 #include "mulelib.h"
 358
 359 #endif /* not emacs */
 360
 361 Lisp_Object Qcoding_system, Qeol_type;
 362 Lisp_Object Qbuffer_file_coding_system;
 363 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 364 Lisp_Object Qno_conversion, Qundecided;
 365 Lisp_Object Qcoding_system_history;
 366 Lisp_Object Qsafe_chars;
 367 Lisp_Object Qvalid_codes;
 368 Lisp_Object Qascii_incompatible;
 369
 370 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 371 Lisp_Object Qcall_process, Qcall_process_region;
 372 Lisp_Object Qstart_process, Qopen_network_stream;
 373 Lisp_Object Qtarget_idx;
 374
 375 extern Lisp_Object Qcompletion_ignore_case;
 376
 377 /* If a symbol has this property, evaluate the value to define the
 378    symbol as a coding system.  */
 379 Lisp_Object Qcoding_system_define_form;
 380
 381 Lisp_Object Vselect_safe_coding_system_function;
 382
 383 int coding_system_require_warning;
 384
 385 /* Mnemonic string for each format of end-of-line.  */
 386 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 387 /* Mnemonic string to indicate format of end-of-line is not yet
 388    decided.  */
 389 Lisp_Object eol_mnemonic_undecided;
 390
 391 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 392    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
 393    This has an effect only for external encoding (i.e. for output to
 394    file and process), not for in-buffer or Lisp string encoding.  */
 395 int system_eol_type;
 396
 397 #ifdef emacs
 398
 399 /* Information about which coding system is safe for which chars.
 400    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 401
 402    GENERIC-LIST is a list of generic coding systems which can encode
 403    any characters.
 404
 405    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 406    corresponding char table that contains safe chars.  */
 407 Lisp_Object Vcoding_system_safe_chars;
 408
 409 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 410
 411 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 412
 413 /* Coding system emacs-mule and raw-text are for converting only
 414    end-of-line format.  */
 415 Lisp_Object Qemacs_mule, Qraw_text;
 416
 417 Lisp_Object Qutf_8;
 418
 419 /* Coding-systems are handed between Emacs Lisp programs and C internal
 420    routines by the following three variables.  */
 421 /* Coding-system for reading files and receiving data from process.  */
 422 Lisp_Object Vcoding_system_for_read;
 423 /* Coding-system for writing files and sending data to process.  */
 424 Lisp_Object Vcoding_system_for_write;
 425 /* Coding-system actually used in the latest I/O.  */
 426 Lisp_Object Vlast_coding_system_used;
 427
 428 /* A vector of length 256 which contains information about special
 429    Latin codes (especially for dealing with Microsoft codes).  */
 430 Lisp_Object Vlatin_extra_code_table;
 431
 432 /* Flag to inhibit code conversion of end-of-line format.  */
 433 int inhibit_eol_conversion;
 434
 435 /* Flag to inhibit ISO2022 escape sequence detection.  */
 436 int inhibit_iso_escape_detection;
 437
 438 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 439 int inherit_process_coding_system;
 440
 441 /* Coding system to be used to encode text for terminal display.  */
 442 struct coding_system terminal_coding;
 443
 444 /* Coding system to be used to encode text for terminal display when
 445    terminal coding system is nil.  */
 446 struct coding_system safe_terminal_coding;
 447
 448 /* Coding system of what is sent from terminal keyboard.  */
 449 struct coding_system keyboard_coding;
 450
 451 /* Default coding system to be used to write a file.  */
 452 struct coding_system default_buffer_file_coding;
 453
 454 Lisp_Object Vfile_coding_system_alist;
 455 Lisp_Object Vprocess_coding_system_alist;
 456 Lisp_Object Vnetwork_coding_system_alist;
 457
 458 Lisp_Object Vlocale_coding_system;
 459
 460 #endif /* emacs */
 461
 462 Lisp_Object Qcoding_category, Qcoding_category_index;
 463
 464 /* List of symbols `coding-category-xxx' ordered by priority.  */
 465 Lisp_Object Vcoding_category_list;
 466
 467 /* Table of coding categories (Lisp symbols).  */
 468 Lisp_Object Vcoding_category_table;
 469
  470 /* Table of names of symbol for each coding-category.  Fifteen names
         are listed here for an array of CODING_CATEGORY_IDX_MAX elements.
         NOTE(review): presumably the entries must stay in the same order
         as the CODING_CATEGORY_IDX_XXX constants -- confirm against
         coding.h before reordering or adding a category.  */
  471 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  472   "coding-category-emacs-mule",
  473   "coding-category-sjis",
  474   "coding-category-iso-7",
  475   "coding-category-iso-7-tight",
  476   "coding-category-iso-8-1",
  477   "coding-category-iso-8-2",
  478   "coding-category-iso-7-else",
  479   "coding-category-iso-8-else",
  480   "coding-category-ccl",
  481   "coding-category-big5",
  482   "coding-category-utf-8",
  483   "coding-category-utf-16-be",
  484   "coding-category-utf-16-le",
  485   "coding-category-raw-text",
  486   "coding-category-binary"
  487 };
 488
 489 /* Table of pointers to coding systems corresponding to each coding
 490    categories.  */
 491 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 492
 493 /* Table of coding category masks.  Nth element is a mask for a coding
 494    category of which priority is Nth.  */
 495 static
 496 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 497
 498 /* Flag to tell if we look up translation table on character code
 499    conversion.  */
 500 Lisp_Object Venable_character_translation;
 501 /* Standard translation table to look up on decoding (reading).  */
 502 Lisp_Object Vstandard_translation_table_for_decode;
 503 /* Standard translation table to look up on encoding (writing).  */
 504 Lisp_Object Vstandard_translation_table_for_encode;
 505
 506 Lisp_Object Qtranslation_table;
 507 Lisp_Object Qtranslation_table_id;
 508 Lisp_Object Qtranslation_table_for_decode;
 509 Lisp_Object Qtranslation_table_for_encode;
 510
 511 /* Alist of charsets vs revision number.  */
 512 Lisp_Object Vcharset_revision_alist;
 513
 514 /* Default coding systems used for process I/O.  */
 515 Lisp_Object Vdefault_process_coding_system;
 516
 517 /* Char table for translating Quail and self-inserting input.  */
 518 Lisp_Object Vtranslation_table_for_input;
 519
 520 /* Global flag to tell that we can't call post-read-conversion and
 521    pre-write-conversion functions.  Usually the value is zero, but it
 522    is set to 1 temporarily while such functions are running.  This is
 523    to avoid infinite recursive call.  */
 524 static int inhibit_pre_post_conversion;
 525
 526 Lisp_Object Qchar_coding_system;
 527
 528 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 529    its validity.  */
 530
 531 Lisp_Object
 532 coding_safe_chars (coding_system)
 533      Lisp_Object coding_system;
 534 {
 535   Lisp_Object coding_spec, plist, safe_chars;
 536
 537   coding_spec = Fget (coding_system, Qcoding_system);
 538   plist = XVECTOR (coding_spec)->contents[3];
 539   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 540   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 541 }
 542
      /* Nonzero if character C is safe according to SAFE_CHARS.
         SAFE_CHARS is either Qt, in which case every character is
         accepted, or a char table in which a non-nil entry for C marks it
         as safe.  SAFE_CHARS may be evaluated twice.  */
  543 #define CODING_SAFE_CHAR_P(safe_chars, c) \
  544   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 545
 546 \f
 547 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 548
 549 /* Emacs' internal format for representation of multiple character
 550    sets is a kind of multi-byte encoding, i.e. characters are
 551    represented by variable-length sequences of one-byte codes.
 552
 553    ASCII characters and control characters (e.g. `tab', `newline') are
 554    represented by one-byte sequences which are their ASCII codes, in
 555    the range 0x00 through 0x7F.
 556
 557    8-bit characters of the range 0x80..0x9F are represented by
 558    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 559    code + 0x20).
 560
 561    8-bit characters of the range 0xA0..0xFF are represented by
 562    one-byte sequences which are their 8-bit code.
 563
 564    The other characters are represented by a sequence of `base
 565    leading-code', optional `extended leading-code', and one or two
 566    `position-code's.  The length of the sequence is determined by the
 567    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 568    whereas extended leading-code and position-code take the range 0xA0
 569    through 0xFF.  See `charset.h' for more details about leading-code
 570    and position-code.
 571
 572    --- CODE RANGE of Emacs' internal format ---
 573    character set        range
 574    -------------        -----
 575    ascii                0x00..0x7F
 576    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  577    eight-bit-graphic    0xA0..0xFF
 578    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 579    ---------------------------------------------
 580
 581    As this is the internal character representation, the format is
 582    usually not used externally (i.e. in a file or in a data sent to a
 583    process).  But, it is possible to have a text externally in this
 584    format (i.e. by encoding by the coding system `emacs-mule').
 585
 586    In that case, a sequence of one-byte codes has a slightly different
 587    form.
 588
 589    Firstly, all characters in eight-bit-control are represented by
 590    one-byte sequences which are their 8-bit code.
 591
 592    Next, character composition data are represented by the byte
 593    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 594    where,
 595         METHOD is 0xF0 plus one of composition method (enum
 596         composition_method),
 597
 598         BYTES is 0xA0 plus the byte length of these composition data,
 599
 600         CHARS is 0xA0 plus the number of characters composed by these
 601         data,
 602
 603         COMPONENTs are characters of multibyte form or composition
 604         rules encoded by two-byte of ASCII codes.
 605
 606    In addition, for backward compatibility, the following formats are
 607    also recognized as composition data on decoding.
 608
 609    0x80 MSEQ ...
 610    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 611
 612    Here,
 613         MSEQ is a multibyte form but in these special format:
 614           ASCII: 0xA0 ASCII_CODE+0x80,
 615           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 616         RULE is a one byte code of the range 0xA0..0xF0 that
 617         represents a composition rule.
 618   */
 619
 620 enum emacs_code_class_type emacs_code_class[256];
 621
  622 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  623    Check if a text is encoded in Emacs' internal format.  If it is,
  624    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.
         The text is rejected (0 is returned) when it contains ESC, SI or
         SO, when a code in the range 0x81..0x9F does not start a sequence
         accepted by UNIBYTE_STR_AS_MULTIBYTE_P, or when the source ends
         right after a 0xA0 byte inside an old-style composition.
         Reaching SRC_END at the top of the loop accepts the text.  */
  625
  626 static int
  627 detect_coding_emacs_mule (src, src_end, multibytep)
  628       unsigned char *src, *src_end;
  629       int multibytep;
  630 {
  631   unsigned char c;
        /* Nonzero while scanning the components that follow an old-style
           composition leading code 0x80.  */
  632   int composing = 0;
  633   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE, which stores into
           coding->result; the stored value is not read here.  */
  634   struct coding_system dummy_coding;
  635   struct coding_system *coding = &dummy_coding;
  636
  637   while (1)
  638     {
            /* Running out of source here means nothing contradicted the
               emacs-mule format, so the macro returns the category mask.  */
  639       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
  640                                      CODING_CATEGORY_MASK_EMACS_MULE);
  641       if (composing)
  642         {
  643           if (c < 0xA0)
                  /* A byte below 0xA0 ends the composition; it is still
                     examined by the checks below.  */
  644             composing = 0;
  645           else if (c == 0xA0)
  646             {
                    /* 0xA0 introduces an ASCII component stored as
                       ASCII_CODE+0x80: fetch it (giving up with 0 if the
                       source ends) and clear the high bit.  */
  647               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
  648               c &= 0x7F;
  649             }
  650           else
                  /* Other components start with LEADING_CODE+0x20; undo
                     the offset.  */
  651             c -= 0x20;
  652         }
  653
  654       if (c < 0x20)
  655         {
                /* These control codes are taken as evidence against
                   emacs-mule.  */
  656           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  657             return 0;
  658         }
  659       else if (c >= 0x80 && c < 0xA0)
  660         {
  661           if (c == 0x80)
  662             /* Old leading code for a composite character.  */
  663             composing = 1;
  664           else
  665             {
                    /* `src' was already advanced, so src - 1 is the last
                       byte consumed.  NOTE(review): when C was adjusted
                       above (composition offset, or the two-byte 8-bit
                       control form with MULTIBYTEP), that byte is not
                       equal to C -- confirm this is intended.  */
  666               unsigned char *src_base = src - 1;
  667               int bytes;
  668
  669               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
  670                                                bytes))
  671                 return 0;
                    /* Resume scanning BYTES bytes after SRC_BASE.  */
  672               src = src_base + bytes;
  673             }
  674         }
  675     }
  676 }
 677
 678
  679 /* Record the starting position START and METHOD of one composition.
         A four-int record is opened at index cmp_data->used of
         coding->cmp_data->data, and that index is remembered in
         coding->cmp_data_start:
           data[0] is set to -1 for now (CODING_ADD_COMPOSITION_END
                   overwrites it with the record length),
           data[1] is START plus cmp_data->char_offset,
           data[2] is not written here (CODING_ADD_COMPOSITION_END fills
                   it in),
           data[3] is METHOD converted to int.
         No check for free room in the data array is made here.  The macro
         declares block-local variables `cmp_data' and `data'.  */
  680
  681 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  682   do {                                                          \
  683     struct composition_data *cmp_data = coding->cmp_data;       \
  684     int *data = cmp_data->data + cmp_data->used;                \
  685     coding->cmp_data_start = cmp_data->used;                    \
  686     data[0] = -1;                                               \
  687     data[1] = cmp_data->char_offset + start;                    \
  688     data[3] = (int) method;                                     \
  689     cmp_data->used += 4;                                        \
  690   } while (0)
 691
  692 /* Record the ending position END of the current composition.  In the
         record that begins at index coding->cmp_data_start, data[0]
         receives the record length in ints (cmp_data->used minus that
         index) and data[2] receives END plus cmp_data->char_offset.  The
         macro declares block-local variables `cmp_data' and `data'.  */
  693
  694 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
  695   do {                                                          \
  696     struct composition_data *cmp_data = coding->cmp_data;       \
  697     int *data = cmp_data->data + coding->cmp_data_start;        \
  698     data[0] = cmp_data->used - coding->cmp_data_start;          \
  699     data[2] = cmp_data->char_offset + end;                      \
  700   } while (0)
 701
  702 /* Record one COMPONENT (alternate character or composition rule).
         COMPONENT is appended at coding->cmp_data->data[used] and `used'
         is incremented.  When the current record (counted from
         coding->cmp_data_start) has thereby grown to
         COMPOSITION_DATA_MAX_BUNCH_LENGTH ints, the composition is closed
         at coding->produced_char with CODING_ADD_COMPOSITION_END and
         coding->composing is reset to COMPOSITION_NO.  */
  703
  704 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  705   do {                                                                  \
  706     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
  707     if (coding->cmp_data->used - coding->cmp_data_start                 \
  708         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
  709       {                                                                 \
  710         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
  711         coding->composing = COMPOSITION_NO;                             \
  712       }                                                                 \
  713   } while (0)
 714
 715
  716 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
  717    is not less than SRC_END, return -1 without incrementing SRC.  The
         expansion refers to the caller's variables `src' and `src_end'
         and, unlike ONE_MORE_BYTE, neither jumps nor touches
         coding->result.  */
  718
  719 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 720
 721
 722 /* Decode a character represented as a component of composition
 723    sequence of Emacs 20 style at SRC.  Set C to that character, store
 724    its multibyte form sequence at P, and set P to the end of that
 725    sequence.  If no valid character is found, set C to -1.
        C is also -1 when the source is already exhausted.

        Two byte forms are accepted.  The byte 0xA0 followed by a byte
        not less than 0xA0 yields that second byte minus 0x80, stored
        at P as a single byte.  Otherwise the first byte minus 0x20
        must satisfy BASE_LEADING_CODE_P; it is stored at P followed by
        the trailing bytes read from SRC, and the result is accepted
        only if it passes UNIBYTE_STR_AS_MULTIBYTE_P or, when
        CODING->flags is nonzero, is LEADING_CODE_8_BIT_CONTROL followed
        by a non-head byte.  A byte for which CHAR_HEAD_P holds is
        rejected.  Bytes may already have been stored at P when C ends
        up as -1.

        This macro uses the variables `src', `src_end' and `coding' of
        the enclosing scope.  */
 726
 727 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 728   do {                                                          \
 729     int bytes;                                                  \
 730                                                                 \
 731     c = SAFE_ONE_MORE_BYTE ();                                  \
 732     if (c < 0)                                                  \
 733       break;                                                    \
 734     if (CHAR_HEAD_P (c))                                        \
 735       c = -1;                                                   \
 736     else if (c == 0xA0)                                         \
 737       {                                                         \
 738         c = SAFE_ONE_MORE_BYTE ();                              \
 739         if (c < 0xA0)                                           \
 740           c = -1;                                               \
 741         else                                                    \
 742           {                                                     \
 743             c -= 0x80;                                          \
 744             *p++ = c;                                           \
 745           }                                                     \
 746       }                                                         \
 747     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 748       {                                                         \
 749         unsigned char *p0 = p;                                  \
 750                                                                 \
 751         c -= 0x20;                                              \
 752         *p++ = c;                                               \
 753         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 754         while (--bytes)                                         \
 755           {                                                     \
 756             c = SAFE_ONE_MORE_BYTE ();                          \
 757             if (c < 0)                                          \
 758               break;                                            \
 759             *p++ = c;                                           \
 760           }                                                     \
 761         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 762             || (coding->flags /* We are recovering a file.  */  \
 763                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 764                 && ! CHAR_HEAD_P (p0[1])))                      \
 765           c = STRING_CHAR (p0, bytes);                          \
 766         else                                                    \
 767           c = -1;                                               \
 768       }                                                         \
 769     else                                                        \
 770       c = -1;                                                   \
 771   } while (0)
 772
 773
 774 /* Decode a composition rule represented as a component of composition
 775    sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
 776    valid rule is found, set C to -1.
        The rule byte minus 0xA0 must be in the range 0..80; its
        quotient and remainder by 9 are stored in GREF and NREF and
        packed with COMPOSITION_ENCODE_RULE.  An exhausted source (-1
        from SAFE_ONE_MORE_BYTE) also yields -1.

        This macro uses the variables `src', `src_end', `gref' and
        `nref' of the enclosing scope.  */
 777
 778 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 779   do {                                                  \
 780     c = SAFE_ONE_MORE_BYTE ();                          \
 781     c -= 0xA0;                                          \
 782     if (c < 0 || c >= 81)                               \
 783       c = -1;                                           \
 784     else                                                \
 785       {                                                 \
 786         gref = c / 9, nref = c % 9;                     \
 787         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 788       }                                                 \
 789   } while (0)
 790
 791
 792 /* Decode composition sequence encoded by `emacs-mule' at the source
 793    pointed by SRC.  SRC_END is the end of source.  Store information
 794    of the composition in CODING->cmp_data.
 795
 796    For backward compatibility, decode also a composition sequence of
 797    Emacs 20 style.  In that case, the composition sequence contains
 798    characters that should be extracted into a buffer or string.  Store
 799    those characters at *DESTINATION in multibyte form.
 800
 801    If we encounter an invalid byte sequence, return 0.
 802    If we encounter an insufficient source or destination, or
 803    insufficient space in CODING->cmp_data, return -1.
 804    Otherwise, return consumed bytes in the source.
        In the insufficient-space case CODING->result is also set to
        CODING_FINISH_INSUFFICIENT_CMP.

        DST_END is the end of the destination area.  When DST_BYTES is
        zero the output limit is the current source position instead of
        DST_END (presumably for in-place conversion -- confirm against
        the callers).  *DESTINATION is advanced and
        CODING->produced_char increased only when characters of an
        Emacs 20 style sequence are actually emitted.
 805
 806 */
 807 static INLINE int
 808 decode_composition_emacs_mule (coding, src, src_end,
 809                                destination, dst_end, dst_bytes)
 810      struct coding_system *coding;
 811      const unsigned char *src, *src_end;
 812      unsigned char **destination, *dst_end;
 813      int dst_bytes;
 814 {
 815   unsigned char *dst = *destination;
 816   int method, data_len, nchars;
 817   const unsigned char *src_base = src++;
 818   /* Store components of composition.  */
 819   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 820   int ncomponent;
 821   /* Store multibyte form of characters to be composed.  This is for
 822      Emacs 20 style composition sequence.  */
 823   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 824   unsigned char *bufp = buf;
 825   int c, i, gref, nref;
 826
       /* Refuse to start unless a full bunch still fits in cmp_data.  */
 827   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 828       >= COMPOSITION_DATA_SIZE)
 829     {
 830       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 831       return -1;
 832     }
 833
 834   ONE_MORE_BYTE (c);
 835   if (c - 0xF0 >= COMPOSITION_RELATIVE
 836            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 837     {
           /* Current format: a method byte (0xF0 + method), a length
              byte and a character-count byte (both offset by 0xA0),
              then the components.  */
 838       int with_rule;
 839
 840       method = c - 0xF0;
 841       with_rule = (method == COMPOSITION_WITH_RULE
 842                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 843       ONE_MORE_BYTE (c);
 844       data_len = c - 0xA0;
 845       if (data_len < 4
 846           || src_base + data_len > src_end)
 847         return 0;
 848       ONE_MORE_BYTE (c);
 849       nchars = c - 0xA0;
           /* A composition must cover at least one character.  Test the
              decoded count, not the raw byte C: the raw byte is below 1
              only when it is 0, so testing it would accept zero and
              negative counts.  */
 850       if (nchars < 1)
 851         return 0;
 852       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 853         {
 854           /* If it is longer than this, it can't be valid.  */
 855           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 856             return 0;
 857
               /* With a rule-based method, odd-numbered components are
                  rules stored as two bytes, each offset by 32.  */
 858           if (ncomponent % 2 && with_rule)
 859             {
 860               ONE_MORE_BYTE (gref);
 861               gref -= 32;
 862               ONE_MORE_BYTE (nref);
 863               nref -= 32;
 864               c = COMPOSITION_ENCODE_RULE (gref, nref);
 865             }
 866           else
 867             {
 868               int bytes;
 869               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 870                   || (coding->flags /* We are recovering a file.  */
 871                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 872                       && ! CHAR_HEAD_P (src[1])))
 873                 c = STRING_CHAR (src, bytes);
 874               else
 875                 c = *src, bytes = 1;
 876               src += bytes;
 877             }
 878           component[ncomponent] = c;
 879         }
 880     }
 881   else if (c >= 0x80)
 882     {
 883       /* This may be an old Emacs 20 style format.  See the comment at
 884          the section 2 of this file.  */
 885       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
           /* The sequence may continue in the next block; ask for more
              source unless this is the last block.  */
 886       if (src == src_end
 887           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 888         goto label_end_of_loop;
 889
           /* Limit decoding to the sequence just delimited and restart
              right after the leading 0x80.  */
 890       src_end = src;
 891       src = src_base + 1;
 892       if (c < 0xC0)
 893         {
 894           method = COMPOSITION_RELATIVE;
 895           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 896             {
 897               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 898               if (c < 0)
 899                 break;
 900               component[ncomponent++] = c;
 901             }
 902           if (ncomponent < 2)
 903             return 0;
 904           nchars = ncomponent;
 905         }
 906       else if (c == 0xFF)
 907         {
 908           method = COMPOSITION_WITH_RULE;
 909           src++;
 910           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 911           if (c < 0)
 912             return 0;
 913           component[0] = c;
 914           for (ncomponent = 1;
 915                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 916             {
 917               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 918               if (c < 0)
 919                 break;
 920               component[ncomponent++] = c;
 921               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 922               if (c < 0)
 923                 break;
 924               component[ncomponent++] = c;
 925             }
 926           if (ncomponent < 3)
 927             return 0;
 928           nchars = (ncomponent + 1) / 2;
 929         }
 930       else
 931         return 0;
 932     }
 933   else
 934     return 0;
 935
       /* Record the composition only if the characters extracted into
          BUF (if any) fit below the output limit.  */
 936   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 937     {
 938       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 939       for (i = 0; i < ncomponent; i++)
 940         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 941       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 942       if (buf < bufp)
 943         {
 944           unsigned char *p = buf;
 945           EMIT_BYTES (p, bufp);
 946           *destination += bufp - buf;
 947           coding->produced_char += nchars;
 948         }
 949       return (src - src_base);
 950     }
 951  label_end_of_loop:
 952   return -1;
 953 }
 954
 955 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode the SRC_BYTES bytes at SOURCE, which are in emacs-mule
        format, into DESTINATION.  CR and LF are handled according to
        CODING->eol_type.  A byte 0x80 is handed to
        decode_composition_emacs_mule when CODING->cmp_data is
        non-null.  A byte sequence accepted as multibyte is copied as
        is.  For anything else, a leading byte that is not followed by
        the expected trailing bytes is emitted through CHAR_STRING as
        a character of its own, and a sequence that has the expected
        number of trailing bytes is copied unchanged.

        When DST_BYTES is zero the output limit is the current source
        position instead of DESTINATION + DST_BYTES (presumably for
        in-place conversion -- confirm against the callers).

        On return, CODING->consumed and CODING->consumed_char are both
        the byte offset of the first unprocessed source position,
        CODING->produced is the number of bytes written, and
        CODING->produced_char is the number of characters produced.
        CODING->result is set to CODING_FINISH_INCONSISTENT_EOL or
        CODING_FINISH_INSUFFICIENT_DST when the loop stops for those
        reasons.  */
 956
 957 static void
 958 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 959      struct coding_system *coding;
 960      const unsigned char *source;
 961      unsigned char *destination;
 962      int src_bytes, dst_bytes;
 963 {
 964   const unsigned char *src = source;
 965   const unsigned char *src_end = source + src_bytes;
 966   unsigned char *dst = destination;
 967   unsigned char *dst_end = destination + dst_bytes;
 968   /* SRC_BASE remembers the start position in source in each loop.
 969      The loop will be exited when there's not enough source code, or
 970      when there's not enough destination area to produce a
 971      character.  */
 972   const unsigned char *src_base;
 973
 974   coding->produced_char = 0;
 975   while ((src_base = src) < src_end)
 976     {
 977       unsigned char tmp[MAX_MULTIBYTE_LENGTH];
 978       const unsigned char *p;
 979       int bytes;
 980
 981       if (*src == '\r')
 982         {
 983           int c = *src++;
 984
 985           if (coding->eol_type == CODING_EOL_CR)
 986             c = '\n';
 987           else if (coding->eol_type == CODING_EOL_CRLF)
 988             {
                   /* Look at the byte after the CR.  ONE_MORE_BYTE is
                      defined elsewhere; presumably it leaves the loop
                      when the source is exhausted.  */
 989               ONE_MORE_BYTE (c);
 990               if (c != '\n')
 991                 {
                       /* Not CR LF: un-read the byte and keep the CR.  */
 992                   src--;
 993                   c = '\r';
 994                 }
 995             }
               /* NOTE(review): this path stores without the destination
                  check made at the bottom of the loop.  */
 996           *dst++ = c;
 997           coding->produced_char++;
 998           continue;
 999         }
1000       else if (*src == '\n')
1001         {
               /* A bare LF contradicts a CR or CRLF end-of-line type;
                  stop if the caller asked not to tolerate that.  */
1002           if ((coding->eol_type == CODING_EOL_CR
1003                || coding->eol_type == CODING_EOL_CRLF)
1004               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1005             {
1006               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1007               goto label_end_of_loop;
1008             }
1009           *dst++ = *src++;
1010           coding->produced_char++;
1011           continue;
1012         }
1013       else if (*src == 0x80 && coding->cmp_data)
1014         {
1015           /* Start of composition data.  */
1016           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1017                                                          &dst, dst_end,
1018                                                          dst_bytes);
               /* Negative: not enough source, destination or cmp_data
                  space.  Positive: that many source bytes were decoded
                  as a composition.  Zero: not a valid composition, so
                  emit the 0x80 byte itself as a character.  */
1019           if (consumed < 0)
1020             goto label_end_of_loop;
1021           else if (consumed > 0)
1022             {
1023               src += consumed;
1024               continue;
1025             }
1026           bytes = CHAR_STRING (*src, tmp);
1027           p = tmp;
1028           src++;
1029         }
1030       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1031                || (coding->flags /* We are recovering a file.  */
1032                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1033                    && ! CHAR_HEAD_P (src[1])))
1034         {
               /* Copy the sequence as is.  */
1035           p = src;
1036           src += bytes;
1037         }
1038       else
1039         {
1040           int i, c;
1041
               /* Read the trailing bytes the leading byte calls for,
                  stopping early at the next character head.  */
1042           bytes = BYTES_BY_CHAR_HEAD (*src);
1043           src++;
1044           for (i = 1; i < bytes; i++)
1045             {
1046               ONE_MORE_BYTE (c);
1047               if (CHAR_HEAD_P (c))
1048                 break;
1049             }
1050           if (i < bytes)
1051             {
                   /* Too few trailing bytes: emit only the leading byte,
                      converted by CHAR_STRING, and resume right after
                      it.  */
1052               bytes = CHAR_STRING (*src_base, tmp);
1053               p = tmp;
1054               src = src_base + 1;
1055             }
1056           else
1057             {
                   /* All trailing bytes are present: copy them
                      unchanged.  */
1058               p = src_base;
1059             }
1060         }
           /* Stop before writing if BYTES more bytes would reach the
              output limit.  */
1061       if (dst + bytes >= (dst_bytes ? dst_end : src))
1062         {
1063           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1064           break;
1065         }
1066       while (bytes--) *dst++ = *p++;
1067       coding->produced_char++;
1068     }
1069  label_end_of_loop:
       /* SRC_BASE, not SRC, is reported, so that a character abandoned
          part way through is not counted as consumed.  */
1070   coding->consumed = coding->consumed_char = src_base - source;
1071   coding->produced = dst - destination;
1072 }
1073
1074
1075 /* Encode composition data stored at DATA into a special byte sequence
1076    starting by 0x80.  Update CODING->cmp_data_start and maybe
1077    CODING->cmp_data for the next call.

        The sequence is built in a local 1024-byte buffer: 0x80, then
        0xF0 + DATA[3] (the method), then 0xA0 + the total length of
        the sequence in bytes, then 0xA0 + (DATA[2] - DATA[1]) (the
        number of composed characters), then the components taken from
        DATA[4] up to index DATA[0] - 1.  For the two rule-based
        methods the components alternate between a character and a
        rule; a rule is written as two bytes, 0x20 + GREF and 0x20 +
        NREF.  Characters are written with CHAR_STRING.

        If the sequence plus 4 bytes would pass the output limit, set
        CODING->result to CODING_FINISH_INSUFFICIENT_DST and jump to
        `label_end_of_loop' without writing anything.  Otherwise copy
        the sequence to DST, advance CODING->cmp_data_start by DATA[0],
        and move to CODING->cmp_data->next if the current data is used
        up and a next one exists.

        This macro uses the variables `dst', `dst_end', `dst_bytes' and
        `src' and the label `label_end_of_loop' of the enclosing
        function.  NOTE(review): writes into the local buffer are not
        bounds-checked here; presumably the bunch length limit keeps
        them within 1024 bytes -- confirm.  */
1078
1079 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1080   do {                                                                  \
1081     unsigned char buf[1024], *p0 = buf, *p;                             \
1082     int len = data[0];                                                  \
1083     int i;                                                              \
1084                                                                         \
1085     buf[0] = 0x80;                                                      \
1086     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1087     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1088     p = buf + 4;                                                        \
1089     if (data[3] == COMPOSITION_WITH_RULE                                \
1090         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1091       {                                                                 \
1092         p += CHAR_STRING (data[4], p);                                  \
1093         for (i = 5; i < len; i += 2)                                    \
1094           {                                                             \
1095             int gref, nref;                                             \
1096              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1097             *p++ = 0x20 + gref;                                         \
1098             *p++ = 0x20 + nref;                                         \
1099             p += CHAR_STRING (data[i + 1], p);                          \
1100           }                                                             \
1101       }                                                                 \
1102     else                                                                \
1103       {                                                                 \
1104         for (i = 4; i < len; i++)                                       \
1105           p += CHAR_STRING (data[i], p);                                \
1106       }                                                                 \
1107     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1108                                                                         \
1109     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1110       {                                                                 \
1111         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1112         goto label_end_of_loop;                                         \
1113       }                                                                 \
1114     while (p0 < p)                                                      \
1115       *dst++ = *p0++;                                                   \
1116     coding->cmp_data_start += data[0];                                  \
1117     if (coding->cmp_data_start == coding->cmp_data->used                \
1118         && coding->cmp_data->next)                                      \
1119       {                                                                 \
1120         coding->cmp_data = coding->cmp_data->next;                      \
1121         coding->cmp_data_start = 0;                                     \
1122       }                                                                 \
1123   } while (0)
1124
1125
     /* Forward declaration: encode_coding_emacs_mule below calls
        encode_eol before its definition appears.  */
1126 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1127                             unsigned char *, int, int));
1128
     /* Encode the SRC_BYTES bytes at SOURCE into DESTINATION in
        emacs-mule format.

        If CODING has no composition data (CODING->cmp_data is null or
        its `used' count is zero), the whole job is delegated to
        encode_eol.  Otherwise the text is processed one character at a
        time.  When the current character position (the data's
        char_offset plus CODING->consumed_char) equals the start
        position recorded in the next composition entry, that entry is
        first written out with ENCODE_COMPOSITION_EMACS_MULE.  A
        newline is converted to CR LF or CR according to
        CODING->eol_type.  A single-byte non-ASCII character is written
        in the form produced by CHAR_STRING when CODING->flags is
        nonzero.  Every other character is copied unchanged.

        The `while (1)' loop has no explicit exit; it ends through
        `goto label_end_of_loop', which ENCODE_COMPOSITION_EMACS_MULE
        does on insufficient destination and which ONE_MORE_CHAR and the
        EMIT_* macros (defined elsewhere) presumably do as well.

        On return, CODING->consumed is the byte offset of the first
        unprocessed source position, and CODING->produced and
        CODING->produced_char are both the number of bytes written.  */
1129 static void
1130 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1131      struct coding_system *coding;
1132      const unsigned char *source;
1133      unsigned char *destination;
1134      int src_bytes, dst_bytes;
1135 {
1136   const unsigned char *src = source;
1137   const unsigned char *src_end = source + src_bytes;
1138   unsigned char *dst = destination;
1139   unsigned char *dst_end = destination + dst_bytes;
1140   const unsigned char *src_base;
1141   int c;
1142   int char_offset;
1143   int *data;
1144
       /* Not referenced by name in this function; presumably used by
          the ONE_MORE_CHAR macro -- confirm.  */
1145   Lisp_Object translation_table;
1146
1147   translation_table = Qnil;
1148
1149   /* Optimization for the case that there's no composition.  */
1150   if (!coding->cmp_data || coding->cmp_data->used == 0)
1151     {
1152       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1153       return;
1154     }
1155
1156   char_offset = coding->cmp_data->char_offset;
1157   data = coding->cmp_data->data + coding->cmp_data_start;
1158   while (1)
1159     {
1160       src_base = src;
1161
1162       /* If SRC starts a composition, encode the information about the
1163          composition in advance.  */
1164       if (coding->cmp_data_start < coding->cmp_data->used
1165           && char_offset + coding->consumed_char == data[1])
1166         {
1167           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
               /* The macro may have advanced cmp_data_start or switched
                  to the next cmp_data; reload the cached values.  */
1168           char_offset = coding->cmp_data->char_offset;
1169           data = coding->cmp_data->data + coding->cmp_data_start;
1170         }
1171
1172       ONE_MORE_CHAR (c);
1173       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1174                         || coding->eol_type == CODING_EOL_CR))
1175         {
1176           if (coding->eol_type == CODING_EOL_CRLF)
1177             EMIT_TWO_BYTES ('\r', c);
1178           else
1179             EMIT_ONE_BYTE ('\r');
1180         }
1181       else if (SINGLE_BYTE_CHAR_P (c))
1182         {
1183           if (coding->flags && ! ASCII_BYTE_P (c))
1184             {
1185               /* As we are auto saving, retain the multibyte form for
1186                  8-bit chars.  */
1187               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1188               int bytes = CHAR_STRING (c, buf);
1189
1190               if (bytes == 1)
1191                 EMIT_ONE_BYTE (buf[0]);
1192               else
1193                 EMIT_TWO_BYTES (buf[0], buf[1]);
1194             }
1195           else
1196             EMIT_ONE_BYTE (c);
1197         }
1198       else
             /* Multibyte character: copy its bytes from the source.  */
1199         EMIT_BYTES (src_base, src);
1200       coding->consumed_char++;
1201     }
1202  label_end_of_loop:
1203   coding->consumed = src_base - source;
1204   coding->produced = coding->produced_char = dst - destination;
1205   return;
1206 }
1207
1208 \f
1209 /*** 3. ISO2022 handlers ***/
1210
1211 /* The following note describes the coding system ISO2022 briefly.
1212    Since the intention of this note is to help understand the
1213    functions in this file, some parts are NOT ACCURATE or are OVERLY
1214    SIMPLIFIED.  For thorough understanding, please refer to the
1215    original document of ISO2022.  This is equivalent to the standard
1216    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1217
1218    ISO2022 provides many mechanisms to encode several character sets
1219    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1220    is encoded using bytes less than 128.  This may make the encoded
1221    text a little bit longer, but the text passes more easily through
1222    several types of gateway, some of which strip off the MSB (Most
1223    Significant Bit).
1224
1225    There are two kinds of character sets: control character sets and
1226    graphic character sets.  The former contain control characters such
1227    as `newline' and `escape' to provide control functions (control
1228    functions are also provided by escape sequences).  The latter
1229    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1230    two control character sets and many graphic character sets.
1231
1232    Graphic character sets are classified into one of the following
1233    four classes, according to the number of bytes (DIMENSION) and
1234    number of characters in one dimension (CHARS) of the set:
1235    - DIMENSION1_CHARS94
1236    - DIMENSION1_CHARS96
1237    - DIMENSION2_CHARS94
1238    - DIMENSION2_CHARS96
1239
1240    In addition, each character set is assigned an identification tag,
1241    unique for each set, called the "final character" (denoted as <F>
1242    hereafter).  The <F> of each character set is decided by ECMA(*)
1243    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1244    (0x30..0x3F are for private use only).
1245
1246    Note (*): ECMA = European Computer Manufacturers Association
1247
1248    Here are examples of graphic character sets [NAME(<F>)]:
1249         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1250         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1251         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1252         o DIMENSION2_CHARS96 -- none for the moment
1253
1254    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1255         C0 [0x00..0x1F] -- control character plane 0
1256         GL [0x20..0x7F] -- graphic character plane 0
1257         C1 [0x80..0x9F] -- control character plane 1
1258         GR [0xA0..0xFF] -- graphic character plane 1
1259
1260    A control character set is directly designated and invoked to C0 or
1261    C1 by an escape sequence.  The most common case is that:
1262    - ISO646's  control character set is designated/invoked to C0, and
1263    - ISO6429's control character set is designated/invoked to C1,
1264    and usually these designations/invocations are omitted in encoded
1265    text.  In a 7-bit environment, only C0 can be used, and a control
1266    character for C1 is encoded by an appropriate escape sequence to
1267    fit into the environment.  All control characters for C1 are
1268    defined to have corresponding escape sequences.
1269
1270    A graphic character set is at first designated to one of four
1271    graphic registers (G0 through G3), then these graphic registers are
1272    invoked to GL or GR.  These designations and invocations can be
1273    done independently.  The most common case is that G0 is invoked to
1274    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1275    these invocations and designations are omitted in encoded text.
1276    In a 7-bit environment, only GL can be used.
1277
1278    When a graphic character set of CHARS94 is invoked to GL, codes
1279    0x20 and 0x7F of the GL area work as control characters SPACE and
1280    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1281    be used.
1282
1283    There are two ways of invocation: locking-shift and single-shift.
1284    With locking-shift, the invocation lasts until the next different
1285    invocation, whereas with single-shift, the invocation affects the
1286    following character only and doesn't affect the locking-shift
1287    state.  Invocations are done by the following control characters or
1288    escape sequences:
1289
1290    ----------------------------------------------------------------------
1291    abbrev  function                  cntrl escape seq   description
1292    ----------------------------------------------------------------------
1293    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1294    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1295    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1296    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1297    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1298    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1299    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1300    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1301    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1302    ----------------------------------------------------------------------
1303    (*) These are not used by any known coding system.
1304
1305    Control characters for these functions are defined by macros
1306    ISO_CODE_XXX in `coding.h'.
1307
1308    Designations are done by the following escape sequences:
1309    ----------------------------------------------------------------------
1310    escape sequence      description
1311    ----------------------------------------------------------------------
1312    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1313    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1314    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1315    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1316    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1317    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1318    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1319    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1320    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1321    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1322    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1323    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1324    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1325    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1326    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1327    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1328    ----------------------------------------------------------------------
1329
1330    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1331    of dimension 1, chars 94, and final character <F>, etc...
1332
1333    Note (*): Although these designations are not allowed in ISO2022,
1334    Emacs accepts them on decoding, and produces them on encoding
1335    CHARS96 character sets in a coding system which is characterized as
1336    7-bit environment, non-locking-shift, and non-single-shift.
1337
1338    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1339    '(' can be omitted.  We refer to this as "short-form" hereafter.
1340
1341    Now you may notice that there are a lot of ways of encoding the
1342    same multilingual text in ISO2022.  Actually, there exist many
1343    coding systems such as Compound Text (used in X11's inter-client
1344    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1345    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1346    localized platforms), and all of these are variants of ISO2022.
1347
1348    In addition to the above, Emacs handles two more kinds of escape
1349    sequences: ISO6429's direction specification and Emacs' private
1350    sequence for specifying character composition.
1351
1352    ISO6429's direction specification takes the following form:
1353         o CSI ']'      -- end of the current direction
1354         o CSI '0' ']'  -- end of the current direction
1355         o CSI '1' ']'  -- start of left-to-right text
1356         o CSI '2' ']'  -- start of right-to-left text
1357    The control character CSI (0x9B: control sequence introducer) is
1358    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1359
1360    Character composition specification takes the following form:
1361         o ESC '0' -- start relative composition
1362         o ESC '1' -- end composition
1363         o ESC '2' -- start rule-base composition (*)
1364         o ESC '3' -- start relative composition with alternate chars  (**)
1365         o ESC '4' -- start rule-base composition with alternate chars  (**)
1366   Since these are not standard escape sequences of any ISO standard,
1367   the use of them with these meanings is restricted to Emacs only.
1368
1369   (*) This form is used only in Emacs 20.5 and older versions,
1370   but the newer versions can safely decode it.
1371   (**) This form is used only in Emacs 21.1 and newer versions,
1372   and the older versions can't decode it.
1373
1374   Here's a list of example usages of these composition escape
1375   sequences (categorized by `enum composition_method').
1376
1377   COMPOSITION_RELATIVE:
1378         ESC 0 CHAR [ CHAR ] ESC 1
1379   COMPOSITION_WITH_RULE:
1380         ESC 2 CHAR [ RULE CHAR ] ESC 1
1381   COMPOSITION_WITH_ALTCHARS:
1382         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1383   COMPOSITION_WITH_RULE_ALTCHARS:
1384         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1385
     /* Table of 256 `enum iso_code_class_type' values.  Presumably
        indexed by byte value to give the ISO2022 class of each code;
        the initialization is not visible here -- confirm.  */
1386 enum iso_code_class_type iso_code_class[256];
1387
     /* Nonzero if all of the following hold: coding_system_table[IDX]
        is non-null; CHARSET is CHARSET_ASCII, or C satisfies
        CODING_SAFE_CHAR_P for the safe chars of that coding system;
        and the designation requested for CHARSET in that coding system
        is not CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.

        As a side effect, when CHARSET is not CHARSET_ASCII the result
        of coding_safe_chars is stored in the variable `safe_chars' of
        the enclosing scope.  IDX is expanded several times.  */
1388 #define CHARSET_OK(idx, charset, c)                                     \
1389   (coding_system_table[idx]                                             \
1390    && (charset == CHARSET_ASCII                                         \
1391        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1392            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1393    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1394                                               charset)                  \
1395        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1396
     /* Nonzero if CODING_SPEC_ISO_INITIAL_DESIGNATION of
        coding_system_table[IDX] for index 1 (presumably graphic
        register G1 -- confirm) is not negative.  */
1397 #define SHIFT_OUT_OK(idx) \
1398   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1399
     /* Nonzero unless the `composing' member of coding_system_table[IDX]
        is COMPOSITION_DISABLED.  The table entry is dereferenced
        without a null check.  */
1400 #define COMPOSITION_OK(idx)     \
1401   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1402
1403 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1404    Check if a text is encoded in ISO2022.  If it is, return an
1405    integer in which the appropriate flag bits among:
1406         CODING_CATEGORY_MASK_ISO_7
1407         CODING_CATEGORY_MASK_ISO_7_TIGHT
1408         CODING_CATEGORY_MASK_ISO_8_1
1409         CODING_CATEGORY_MASK_ISO_8_2
1410         CODING_CATEGORY_MASK_ISO_7_ELSE
1411         CODING_CATEGORY_MASK_ISO_8_ELSE
1412    are set.  If a code which should never appear in ISO2022 is found,
1413    return 0.
1414
1415    If *latin_extra_code_state is zero and Latin extra codes are found,
1416    set *latin_extra_code_state to 1 and return 0.  If it is nonzero,
1417    accept Latin extra codes.  */
1418
1419 static int
1420 detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
1421      unsigned char *src, *src_end;
1422      int multibytep;
1423      int *latin_extra_code_state;
1424 {
1425   int mask = CODING_CATEGORY_MASK_ISO;
1426   int mask_found = 0;
1427   int reg[4], shift_out = 0, single_shifting = 0;
1428   int c, c1, charset;
1429   /* Dummy for ONE_MORE_BYTE.  */
1430   struct coding_system dummy_coding;
1431   struct coding_system *coding = &dummy_coding;
1432   Lisp_Object safe_chars;
1433
1434   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1435   while (mask)
1436     {
1437       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1438     retry:
1439       switch (c)
1440         {
1441         case ISO_CODE_ESC:
1442           if (inhibit_iso_escape_detection)
1443             break;
1444           single_shifting = 0;
1445           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1446           if (c >= '(' && c <= '/')
1447             {
1448               /* Designation sequence for a charset of dimension 1.  */
1449               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
1450               if (c1 < ' ' || c1 >= 0x80
1451                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1452                 /* Invalid designation sequence.  Just ignore.  */
1453                 break;
1454               reg[(c - '(') % 4] = charset;
1455             }
1456           else if (c == '$')
1457             {
1458               /* Designation sequence for a charset of dimension 2.  */
1459               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1460               if (c >= '@' && c <= 'B')
1461                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1462                 reg[0] = charset = iso_charset_table[1][0][c];
1463               else if (c >= '(' && c <= '/')
1464                 {
1465                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1466                                                  mask & mask_found);
1467                   if (c1 < ' ' || c1 >= 0x80
1468                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1469                     /* Invalid designation sequence.  Just ignore.  */
1470                     break;
1471                   reg[(c - '(') % 4] = charset;
1472                 }
1473               else
1474                 /* Invalid designation sequence.  Just ignore.  */
1475                 break;
1476             }
1477           else if (c == 'N' || c == 'O')
1478             {
1479               /* ESC <Fe> for SS2 or SS3.  */
1480               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1481               break;
1482             }
1483           else if (c >= '0' && c <= '4')
1484             {
1485               /* ESC <Fp> for start/end composition.  */
1486               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1487                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1488               else
1489                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1490               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1491                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1492               else
1493                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1494               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1495                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1496               else
1497                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1498               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1499                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1500               else
1501                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1502               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1503                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1504               else
1505                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1506               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1507                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1508               else
1509                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1510               break;
1511             }
1512           else
1513             /* Invalid escape sequence.  Just ignore.  */
1514             break;
1515
1516           /* We found a valid designation sequence for CHARSET.  */
1517           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1518           c = MAKE_CHAR (charset, 0, 0);
1519           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1520             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1521           else
1522             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1523           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1524             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1525           else
1526             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1527           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1528             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1529           else
1530             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1531           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1532             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1533           else
1534             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1535           break;
1536
1537         case ISO_CODE_SO:
1538           if (inhibit_iso_escape_detection)
1539             break;
1540           single_shifting = 0;
1541           if (shift_out == 0
1542               && (reg[1] >= 0
1543                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1544                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1545             {
1546               /* Locking shift out.  */
1547               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1548               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1549             }
1550           break;
1551
1552         case ISO_CODE_SI:
1553           if (inhibit_iso_escape_detection)
1554             break;
1555           single_shifting = 0;
1556           if (shift_out == 1)
1557             {
1558               /* Locking shift in.  */
1559               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1560               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1561             }
1562           break;
1563
1564         case ISO_CODE_CSI:
1565           single_shifting = 0;
1566         case ISO_CODE_SS2:
1567         case ISO_CODE_SS3:
1568           {
1569             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1570
1571             if (inhibit_iso_escape_detection)
1572               break;
1573             if (c != ISO_CODE_CSI)
1574               {
1575                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1576                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1577                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1578                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1579                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1580                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1581                 single_shifting = 1;
1582               }
1583             if (VECTORP (Vlatin_extra_code_table)
1584                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1585               {
1586                 if (! *latin_extra_code_state)
1587                   {
1588                     *latin_extra_code_state = 1;
1589                     return 0;
1590                   }
1591                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1592                     & CODING_FLAG_ISO_LATIN_EXTRA)
1593                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1594                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1595                     & CODING_FLAG_ISO_LATIN_EXTRA)
1596                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1597               }
1598             mask &= newmask;
1599             mask_found |= newmask;
1600           }
1601           break;
1602
1603         default:
1604           if (c < 0x80)
1605             {
1606               single_shifting = 0;
1607               break;
1608             }
1609           else if (c < 0xA0)
1610             {
1611               single_shifting = 0;
1612               if (VECTORP (Vlatin_extra_code_table)
1613                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1614                 {
1615                   int newmask = 0;
1616
1617                   if (! *latin_extra_code_state)
1618                     {
1619                       *latin_extra_code_state = 1;
1620                       return 0;
1621                     }
1622                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1623                       & CODING_FLAG_ISO_LATIN_EXTRA)
1624                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1625                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1626                       & CODING_FLAG_ISO_LATIN_EXTRA)
1627                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1628                   mask &= newmask;
1629                   mask_found |= newmask;
1630                 }
1631               else
1632                 return 0;
1633             }
1634           else
1635             {
1636               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1637                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1638               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1639               /* Check the length of succeeding codes of the range
1640                  0xA0..0FF.  If the byte length is odd, we exclude
1641                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1642                  when we are not single shifting.  */
1643               if (!single_shifting
1644                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1645                 {
1646                   int i = 1;
1647
1648                   c = -1;
1649                   while (src < src_end)
1650                     {
1651                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1652                                                      mask & mask_found);
1653                       if (c < 0xA0)
1654                         break;
1655                       i++;
1656                     }
1657
1658                   if (i & 1 && src < src_end)
1659                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1660                   else
1661                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1662                   if (c >= 0)
1663                     /* This means that we have read one extra byte.  */
1664                     goto retry;
1665                 }
1666             }
1667           break;
1668         }
1669     }
1670   return (mask & mask_found);
1671 }
1672
1673 /* Decode a character of which charset is CHARSET, the 1st position
1674    code is C1, the 2nd position code is C2, and return the decoded
1675    character code.  If the variable `translation_table' is non-nil,
1676    return the translated code instead, as computed by translate_char.
        `translation_table' is not a macro argument: it is whatever
        variable of that name is in scope where the macro is expanded.  */
1677
1678 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1679   (NILP (translation_table)                     \
1680    ? MAKE_CHAR (charset, c1, c2)                \
1681    : translate_char (translation_table, -1, charset, c1, c2))
1682
1683 /* Set designation state into CODING: designate to register REG the
        charset that ISO_CHARSET_TABLE gives for DIMENSION, CHARS and
        FINAL_CHAR.

        A FINAL_CHAR below '0' or not below 128 jumps to
        label_invalid_code at once.  The designation is accepted when
        the charset is non-negative and either CODING requests it for
        REG or its representative character passes CODING_SAFE_CHAR_P.
        On acceptance the register is updated, using the reverse
        charset when CODING_MODE_DIRECTION is set and a reverse charset
        exists.  There is one exception: ASCII designated to register 0
        while last_invalid_designation_register is 0 still jumps to
        label_invalid_code (see the comment in the body).  On rejection
        REG is recorded in last_invalid_designation_register and
        control jumps to label_invalid_code.

        The macro relies on `coding', `safe_chars' and the label
        label_invalid_code being in scope where it is expanded.
        NOTE(review): MAKE_CHAR (charset, 0, 0) is evaluated before the
        charset >= 0 test.  */
1684 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1685   do {                                                                     \
1686     int charset, c;                                                        \
1687                                                                            \
1688     if (final_char < '0' || final_char >= 128)                             \
1689       goto label_invalid_code;                                             \
1690     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1691                                  make_number (chars),                      \
1692                                  make_number (final_char));                \
1693     c = MAKE_CHAR (charset, 0, 0);                                         \
1694     if (charset >= 0                                                       \
1695         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1696             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1697       {                                                                    \
1698         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1699             && reg == 0                                                    \
1700             && charset == CHARSET_ASCII)                                   \
1701           {                                                                \
1702             /* We should insert this designation sequence as is so         \
1703                that it is surely written back to a file.  */               \
1704             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1705             goto label_invalid_code;                                       \
1706           }                                                                \
1707         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1708         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1709             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1710           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1711         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1712       }                                                                    \
1713     else                                                                   \
1714       {                                                                    \
1715         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1716         goto label_invalid_code;                                           \
1717       }                                                                    \
1718   } while (0)
1719
1720 /* Append a freshly allocated `struct composition_data' to the chain
1721    of blocks hanging off CODING and make it the current block.
        CHAR_OFFSET is stored in the new block, whose `used' count
        starts at 0.  CODING's cmp_data_start is reset to 0 and its
        composing state to COMPOSITION_NO.  */
1722
1723 void
1724 coding_allocate_composition_data (coding, char_offset)
1725      struct coding_system *coding;
1726      int char_offset;
1727 {
1728   struct composition_data *new_block;
1729
1730   new_block = (struct composition_data *) xmalloc (sizeof *new_block);
1731   new_block->used = 0;
1732   new_block->char_offset = char_offset;
1733   new_block->next = NULL;
1734   new_block->prev = coding->cmp_data;   /* old tail, possibly null */
1735   if (new_block->prev != NULL)
1736     new_block->prev->next = new_block;
1737   coding->cmp_data = new_block;
1738   coding->composing = COMPOSITION_NO;
1739   coding->cmp_data_start = 0;
1740 }
1741
1742 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1743    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1744    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1745    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1746    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1
1747
        C1 is the byte that followed ESC.  There are three cases:
        - Composition is disabled for CODING: the bytes ESC and
          C1 & 0x7f are stored at DST and produced_char grows by 2.
        - No composition is in progress: a new one is started with the
          method selected by C1.  If coding->cmp_data is missing or
          does not have room for one more bunch, coding->result is set
          to CODING_FINISH_INSUFFICIENT_CMP and control jumps to
          label_end_of_loop instead.
        - A composition is already in progress: if its method is one of
          the two ALTCHARS methods it is switched to
          COMPOSITION_RELATIVE; otherwise nothing happens.

        The macro relies on `coding', `dst' and the label
        label_end_of_loop being in scope where it is expanded.  */
1748
1749 #define DECODE_COMPOSITION_START(c1)                                       \
1750   do {                                                                     \
1751     if (coding->composing == COMPOSITION_DISABLED)                         \
1752       {                                                                    \
1753         *dst++ = ISO_CODE_ESC;                                             \
1754         *dst++ = c1 & 0x7f;                                                \
1755         coding->produced_char += 2;                                        \
1756       }                                                                    \
1757     else if (!COMPOSING_P (coding))                                        \
1758       {                                                                    \
1759         /* This is surely the start of a composition.  We must be sure     \
1760            that coding->cmp_data has enough space to store the             \
1761            information about the composition.  If not, terminate the       \
1762            current decoding loop, allocate one more memory block for       \
1763            coding->cmp_data in the caller, then start the decoding         \
1764            loop again.  We can't allocate memory here directly because     \
1765            it may cause buffer/string relocation.  */                      \
1766         if (!coding->cmp_data                                              \
1767             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1768                 >= COMPOSITION_DATA_SIZE))                                 \
1769           {                                                                \
1770             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1771             goto label_end_of_loop;                                        \
1772           }                                                                \
1773         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1774                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1775                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1776                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1777         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1778                                       coding->composing);                  \
1779         coding->composition_rule_follows = 0;                              \
1780       }                                                                    \
1781     else                                                                   \
1782       {                                                                    \
1783         /* We are already handling a composition.  If the method is        \
1784            the following two, the codes following the current escape       \
1785            sequence are actual characters stored in a buffer.  */          \
1786         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1787             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1788           {                                                                \
1789             coding->composing = COMPOSITION_RELATIVE;                      \
1790             coding->composition_rule_follows = 0;                          \
1791           }                                                                \
1792       }                                                                    \
1793   } while (0)
1794
1795 /* Handle composition end sequence ESC 1.  C1 is the byte that
        followed ESC.  If no composition is in progress, the bytes ESC
        and C1 are stored at DST and produced_char grows by 2.
        Otherwise the end of the composition is recorded at the current
        produced_char and coding->composing is reset to COMPOSITION_NO.
        The macro relies on `coding' and `dst' being in scope where it
        is expanded.  */
1796
1797 #define DECODE_COMPOSITION_END(c1)                                      \
1798   do {                                                                  \
1799     if (! COMPOSING_P (coding))                                         \
1800       {                                                                 \
1801         *dst++ = ISO_CODE_ESC;                                          \
1802         *dst++ = c1;                                                    \
1803         coding->produced_char += 2;                                     \
1804       }                                                                 \
1805     else                                                                \
1806       {                                                                 \
1807         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1808         coding->composing = COMPOSITION_NO;                             \
1809       }                                                                 \
1810   } while (0)
1811
1812 /* Decode a composition rule from the byte C1 (and maybe one more byte
1813    from SRC) and store one encoded composition rule in
1814    coding->cmp_data.

        C1 is reduced by 32 in place, so the argument must be a
        modifiable variable.  A reduced value below 81 is the old
        one-byte format: C1 / 9 and C1 % 9 give the two components
        (gref, nref), each with 4 mapped to 10.  A value from 81 to 92
        is the new two-byte format: the next byte is read into `c2'
        with ONE_MORE_BYTE and the rule is built from C1 - 81 and
        c2 - 32.  For any other value the rule stored is 0.  In every
        case coding->composition_rule_follows is cleared afterwards.

        The macro relies on `coding' and `c2' being in scope where it
        is expanded.  */
1815
1816 #define DECODE_COMPOSITION_RULE(c1)                                     \
1817   do {                                                                  \
1818     int rule = 0;                                                       \
1819     (c1) -= 32;                                                         \
1820     if (c1 < 81)                /* old format (before ver.21) */        \
1821       {                                                                 \
1822         int gref = (c1) / 9;                                            \
1823         int nref = (c1) % 9;                                            \
1824         if (gref == 4) gref = 10;                                       \
1825         if (nref == 4) nref = 10;                                       \
1826         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1827       }                                                                 \
1828     else if (c1 < 93)           /* new format (after ver.21) */         \
1829       {                                                                 \
1830         ONE_MORE_BYTE (c2);                                             \
1831         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1832       }                                                                 \
1833     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1834     coding->composition_rule_follows = 0;                               \
1835   } while (0)
1836
1837
1838 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1839
1840 static void
1841 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1842      struct coding_system *coding;
1843      const unsigned char *source;
1844      unsigned char *destination;
1845      int src_bytes, dst_bytes;
1846 {
1847   const unsigned char *src = source;
1848   const unsigned char *src_end = source + src_bytes;
1849   unsigned char *dst = destination;
1850   unsigned char *dst_end = destination + dst_bytes;
1851   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1852   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1853   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1854   /* SRC_BASE remembers the start position in source in each loop.
1855      The loop will be exited when there's not enough source code
1856      (within macro ONE_MORE_BYTE), or when there's not enough
1857      destination area to produce a character (within macro
1858      EMIT_CHAR).  */
1859   const unsigned char *src_base;
1860   int c, charset;
1861   Lisp_Object translation_table;
1862   Lisp_Object safe_chars;
1863
1864   safe_chars = coding_safe_chars (coding->symbol);
1865
1866   if (NILP (Venable_character_translation))
1867     translation_table = Qnil;
1868   else
1869     {
1870       translation_table = coding->translation_table_for_decode;
1871       if (NILP (translation_table))
1872         translation_table = Vstandard_translation_table_for_decode;
1873     }
1874
1875   coding->result = CODING_FINISH_NORMAL;
1876
1877   while (1)
1878     {
1879       int c1, c2 = 0;
1880
1881       src_base = src;
1882       ONE_MORE_BYTE (c1);
1883
1884       /* We produce no character or one character.  */
1885       switch (iso_code_class [c1])
1886         {
1887         case ISO_0x20_or_0x7F:
1888           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1889             {
1890               DECODE_COMPOSITION_RULE (c1);
1891               continue;
1892             }
1893           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1894             {
1895               /* This is SPACE or DEL.  */
1896               charset = CHARSET_ASCII;
1897               break;
1898             }
1899           /* This is a graphic character, we fall down ...  */
1900
1901         case ISO_graphic_plane_0:
1902           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1903             {
1904               DECODE_COMPOSITION_RULE (c1);
1905               continue;
1906             }
1907           charset = charset0;
1908           break;
1909
1910         case ISO_0xA0_or_0xFF:
1911           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1912               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1913             goto label_invalid_code;
1914           /* This is a graphic character, we fall down ... */
1915
1916         case ISO_graphic_plane_1:
1917           if (charset1 < 0)
1918             goto label_invalid_code;
1919           charset = charset1;
1920           break;
1921
1922         case ISO_control_0:
1923           if (COMPOSING_P (coding))
1924             DECODE_COMPOSITION_END ('1');
1925
1926           /* All ISO2022 control characters in this class have the
1927              same representation in Emacs internal format.  */
1928           if (c1 == '\n'
1929               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1930               && (coding->eol_type == CODING_EOL_CR
1931                   || coding->eol_type == CODING_EOL_CRLF))
1932             {
1933               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1934               goto label_end_of_loop;
1935             }
1936           charset = CHARSET_ASCII;
1937           break;
1938
1939         case ISO_control_1:
1940           if (COMPOSING_P (coding))
1941             DECODE_COMPOSITION_END ('1');
1942           goto label_invalid_code;
1943
1944         case ISO_carriage_return:
1945           if (COMPOSING_P (coding))
1946             DECODE_COMPOSITION_END ('1');
1947
1948           if (coding->eol_type == CODING_EOL_CR)
1949             c1 = '\n';
1950           else if (coding->eol_type == CODING_EOL_CRLF)
1951             {
1952               ONE_MORE_BYTE (c1);
1953               if (c1 != ISO_CODE_LF)
1954                 {
1955                   src--;
1956                   c1 = '\r';
1957                 }
1958             }
1959           charset = CHARSET_ASCII;
1960           break;
1961
1962         case ISO_shift_out:
1963           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1964               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1965             goto label_invalid_code;
1966           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1967           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1968           continue;
1969
1970         case ISO_shift_in:
1971           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1972             goto label_invalid_code;
1973           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1974           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1975           continue;
1976
1977         case ISO_single_shift_2_7:
1978         case ISO_single_shift_2:
1979           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1980             goto label_invalid_code;
1981           /* SS2 is handled as an escape sequence of ESC 'N' */
1982           c1 = 'N';
1983           goto label_escape_sequence;
1984
1985         case ISO_single_shift_3:
1986           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1987             goto label_invalid_code;
1988           /* SS3 is handled as an escape sequence of ESC 'O' */
1989           c1 = 'O';
1990           goto label_escape_sequence;
1991
1992         case ISO_control_sequence_introducer:
1993           /* CSI is handled as an escape sequence of ESC '[' ...  */
1994           c1 = '[';
1995           goto label_escape_sequence;
1996
1997         case ISO_escape:
1998           ONE_MORE_BYTE (c1);
1999         label_escape_sequence:
2000           /* Escape sequences handled by Emacs are invocation,
2001              designation, direction specification, and character
2002              composition specification.  */
2003           switch (c1)
2004             {
2005             case '&':           /* revision of following character set */
2006               ONE_MORE_BYTE (c1);
2007               if (!(c1 >= '@' && c1 <= '~'))
2008                 goto label_invalid_code;
2009               ONE_MORE_BYTE (c1);
2010               if (c1 != ISO_CODE_ESC)
2011                 goto label_invalid_code;
2012               ONE_MORE_BYTE (c1);
2013               goto label_escape_sequence;
2014
2015             case '$':           /* designation of 2-byte character set */
2016               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2017                 goto label_invalid_code;
2018               ONE_MORE_BYTE (c1);
2019               if (c1 >= '@' && c1 <= 'B')
2020                 {       /* designation of JISX0208.1978, GB2312.1980,
2021                            or JISX0208.1980 */
2022                   DECODE_DESIGNATION (0, 2, 94, c1);
2023                 }
2024               else if (c1 >= 0x28 && c1 <= 0x2B)
2025                 {       /* designation of DIMENSION2_CHARS94 character set */
2026                   ONE_MORE_BYTE (c2);
2027                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2028                 }
2029               else if (c1 >= 0x2C && c1 <= 0x2F)
2030                 {       /* designation of DIMENSION2_CHARS96 character set */
2031                   ONE_MORE_BYTE (c2);
2032                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2033                 }
2034               else
2035                 goto label_invalid_code;
2036               /* We must update these variables now.  */
2037               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2038               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2039               continue;
2040
2041             case 'n':           /* invocation of locking-shift-2 */
2042               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2043                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2044                 goto label_invalid_code;
2045               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2046               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2047               continue;
2048
2049             case 'o':           /* invocation of locking-shift-3 */
2050               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2051                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2052                 goto label_invalid_code;
2053               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2054               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2055               continue;
2056
2057             case 'N':           /* invocation of single-shift-2 */
2058               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2059                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2060                 goto label_invalid_code;
2061               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2062               ONE_MORE_BYTE (c1);
2063               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2064                 goto label_invalid_code;
2065               break;
2066
2067             case 'O':           /* invocation of single-shift-3 */
2068               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2069                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2070                 goto label_invalid_code;
2071               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2072               ONE_MORE_BYTE (c1);
2073               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2074                 goto label_invalid_code;
2075               break;
2076
2077             case '0': case '2': case '3': case '4': /* start composition */
2078               DECODE_COMPOSITION_START (c1);
2079               continue;
2080
2081             case '1':           /* end composition */
2082               DECODE_COMPOSITION_END (c1);
2083               continue;
2084
2085             case '[':           /* specification of direction */
2086               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2087                 goto label_invalid_code;
2088               /* For the moment, nested direction is not supported.
2089                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2090                  left-to-right, and nonzero means right-to-left.  */
2091               ONE_MORE_BYTE (c1);
2092               switch (c1)
2093                 {
2094                 case ']':       /* end of the current direction */
2095                   coding->mode &= ~CODING_MODE_DIRECTION;
2096
2097                 case '0':       /* end of the current direction */
2098                 case '1':       /* start of left-to-right direction */
2099                   ONE_MORE_BYTE (c1);
2100                   if (c1 == ']')
2101                     coding->mode &= ~CODING_MODE_DIRECTION;
2102                   else
2103                     goto label_invalid_code;
2104                   break;
2105
2106                 case '2':       /* start of right-to-left direction */
2107                   ONE_MORE_BYTE (c1);
2108                   if (c1 == ']')
2109                     coding->mode |= CODING_MODE_DIRECTION;
2110                   else
2111                     goto label_invalid_code;
2112                   break;
2113
2114                 default:
2115                   goto label_invalid_code;
2116                 }
2117               continue;
2118
2119             case '%':
2120               if (COMPOSING_P (coding))
2121                 DECODE_COMPOSITION_END ('1');
2122               ONE_MORE_BYTE (c1);
2123               if (c1 == '/')
2124                 {
2125                   /* CTEXT extended segment:
2126                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2127                      We keep these bytes as is for the moment.
2128                      They may be decoded by post-read-conversion.  */
2129                   int dim, M, L;
2130                   int size, required;
2131                   int produced_chars;
2132
2133                   ONE_MORE_BYTE (dim);
2134                   ONE_MORE_BYTE (M);
2135                   ONE_MORE_BYTE (L);
2136                   size = ((M - 128) * 128) + (L - 128);
2137                   required = 8 + size * 2;
2138                   if (dst + required > (dst_bytes ? dst_end : src))
2139                     goto label_end_of_loop;
2140                   *dst++ = ISO_CODE_ESC;
2141                   *dst++ = '%';
2142                   *dst++ = '/';
2143                   *dst++ = dim;
2144                   produced_chars = 4;
2145                   dst += CHAR_STRING (M, dst), produced_chars++;
2146                   dst += CHAR_STRING (L, dst), produced_chars++;
2147                   while (size-- > 0)
2148                     {
2149                       ONE_MORE_BYTE (c1);
2150                       dst += CHAR_STRING (c1, dst), produced_chars++;
2151                     }
2152                   coding->produced_char += produced_chars;
2153                 }
2154               else if (c1 == 'G')
2155                 {
2156                   unsigned char *d = dst;
2157                   int produced_chars;
2158
2159                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2160                      ESC % G --UTF-8-BYTES-- ESC % @
2161                      We keep these bytes as is for the moment.
2162                      They may be decoded by post-read-conversion.  */
2163                   if (d + 6 > (dst_bytes ? dst_end : src))
2164                     goto label_end_of_loop;
2165                   *d++ = ISO_CODE_ESC;
2166                   *d++ = '%';
2167                   *d++ = 'G';
2168                   produced_chars = 3;
2169                   while (d + 1 < (dst_bytes ? dst_end : src))
2170                     {
2171                       ONE_MORE_BYTE (c1);
2172                       if (c1 == ISO_CODE_ESC
2173                           && src + 1 < src_end
2174                           && src[0] == '%'
2175                           && src[1] == '@')
2176                         {
2177                           src += 2;
2178                           break;
2179                         }
2180                       d += CHAR_STRING (c1, d), produced_chars++;
2181                     }
2182                   if (d + 3 > (dst_bytes ? dst_end : src))
2183                     goto label_end_of_loop;
2184                   *d++ = ISO_CODE_ESC;
2185                   *d++ = '%';
2186                   *d++ = '@';
2187                   dst = d;
2188                   coding->produced_char += produced_chars + 3;
2189                 }
2190               else
2191                 goto label_invalid_code;
2192               continue;
2193
2194             default:
2195               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2196                 goto label_invalid_code;
2197               if (c1 >= 0x28 && c1 <= 0x2B)
2198                 {       /* designation of DIMENSION1_CHARS94 character set */
2199                   ONE_MORE_BYTE (c2);
2200                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2201                 }
2202               else if (c1 >= 0x2C && c1 <= 0x2F)
2203                 {       /* designation of DIMENSION1_CHARS96 character set */
2204                   ONE_MORE_BYTE (c2);
2205                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2206                 }
2207               else
2208                 goto label_invalid_code;
2209               /* We must update these variables now.  */
2210               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2211               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2212               continue;
2213             }
2214         }
2215
2216       /* Now we know CHARSET and 1st position code C1 of a character.
2217          Produce a multibyte sequence for that character while getting
2218          2nd position code C2 if necessary.  */
2219       if (CHARSET_DIMENSION (charset) == 2)
2220         {
2221           ONE_MORE_BYTE (c2);
2222           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2223             /* C2 is not in a valid range.  */
2224             goto label_invalid_code;
2225         }
2226       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2227       EMIT_CHAR (c);
2228       continue;
2229
2230     label_invalid_code:
2231       coding->errors++;
2232       if (COMPOSING_P (coding))
2233         DECODE_COMPOSITION_END ('1');
2234       src = src_base;
2235       c = *src++;
2236       if (! NILP (translation_table))
2237         c = translate_char (translation_table, c, 0, 0, 0);
2238       EMIT_CHAR (c);
2239     }
2240
2241  label_end_of_loop:
2242   coding->consumed = coding->consumed_char = src_base - source;
2243   coding->produced = dst - destination;
2244   return;
2245 }
2246
2247
2248 /* ISO2022 encoding stuff.  */
2249
/*
   Saying just "ISO2022" is not enough to describe an encoding; more
   details have to be specified.  In Emacs, each ISO2022 coding system
   variant has the following specifications:
        1. Initial designation to G0 through G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use single-shift?
   And the following two are only for Japanese:
        8. Use JISX0201-1976-Roman in place of ASCII?
        9. Use JISX0208-1978 in place of JISX0208-1983?
   These specifications are encoded in `coding->flags' as flag bits
   defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
   details.
*/
2268
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past it, and record the new designation
   in CODING.  The macro reads and updates the caller's variable `dst'.

   If the revision number CODING holds for CHARSET is below 255, the
   sequence is preceded by `ESC & <'@' + revision>'.  For a
   2-dimensional 94-character set designated to register 0 whose
   <final-char> is '@', 'A', or 'B', the intermediate character is
   omitted (short form) when CODING_FLAG_ISO_SHORT_FORM is set.

   Every macro argument is parenthesized where an operator is applied
   to it, so arguments may be arbitrary expressions.  CHARSET, REG and
   CODING are still evaluated more than once and must therefore be
   free of side effects.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    /* Intermediate characters, indexed by graphic register.  */        \
    const char *intermediate_char_94 = "()*+";                          \
    const char *intermediate_char_96 = ",-./";                          \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);   \
                                                                        \
    if (revision < 255)                                                 \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + revision;                                        \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) == 1)                               \
      {                                                                 \
        if (CHARSET_CHARS (charset) == 94)                              \
          *dst++ = (unsigned char) (intermediate_char_94[(reg)]);       \
        else                                                            \
          *dst++ = (unsigned char) (intermediate_char_96[(reg)]);       \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = '$';                                                   \
        if (CHARSET_CHARS (charset) == 94)                              \
          {                                                             \
            /* The short form drops the intermediate character.  */     \
            if (! ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)        \
                || (reg) != 0                                           \
                || final_char < '@' || final_char > 'B')                \
              *dst++ = (unsigned char) (intermediate_char_94[(reg)]);   \
          }                                                             \
        else                                                            \
          *dst++ = (unsigned char) (intermediate_char_96[(reg)]);       \
      }                                                                 \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = (charset);              \
  } while (0)
2311
/* Emit the ISO2022 single-shift-2 / single-shift-3 function at DST:
   the escape sequence ESC N / ESC O when CODING_FLAG_ISO_SEVEN_BITS is
   set in coding->flags, otherwise the single control code ISO_CODE_SS2
   / ISO_CODE_SS3.  Both macros then flag CODING as single-shifting.
   They use the caller's `coding' and `dst'.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2333
/* Locking-shift functions.  Each macro emits its code at DST (SI, SO,
   ESC n, or ESC o) and records in CODING which graphic register (0,
   1, 2, or 3 respectively) is now invoked to graphic plane 0.  They
   use the caller's `coding' and `dst'.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
2361
/* Produce the code for a DIMENSION1 character whose character set is
   CHARSET and whose position code is C1.  Uses the caller's `coding'
   and `dst'.

   The body is a retry loop.  If CODING is in single-shifting state, or
   CHARSET is currently invoked to graphic plane 0 or 1, one byte is
   emitted (with the 8th bit cleared or set as the case requires) and
   the loop is left.  Otherwise encode_invocation_designation is called
   to emit the necessary designation/invocation codes and the loop
   runs again.

   CHARSET and C1 are parenthesized wherever an operator is applied to
   them, so they may be arbitrary expressions; they are evaluated more
   than once and must be free of side effects.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F;                                         \
        else                                                            \
          *dst++ = (c1) | 0x80;                                         \
        /* A single shift affects only one character.  */               \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
2394
/* Produce the codes for a DIMENSION2 character whose character set is
   CHARSET and whose position codes are C1 and C2.  Uses the caller's
   `coding' and `dst'.

   The body is a retry loop.  If CODING is in single-shifting state, or
   CHARSET is currently invoked to graphic plane 0 or 1, two bytes are
   emitted (with the 8th bit cleared or set as the case requires) and
   the loop is left.  Otherwise encode_invocation_designation is called
   to emit the necessary designation/invocation codes and the loop
   runs again.

   CHARSET, C1 and C2 are parenthesized wherever an operator is applied
   to them, so they may be arbitrary expressions; they are evaluated
   more than once and must be free of side effects.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = (c1) & 0x7F;                                       \
            *dst++ = (c2) & 0x7F;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = (c1) | 0x80;                                       \
            *dst++ = (c2) | 0x80;                                       \
          }                                                             \
        /* A single shift affects only one character.  */               \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        *dst++ = (c2) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        *dst++ = (c2) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
2427
/* Encode character C at DST.  C is split with SPLIT_CHAR into a
   charset and the position codes C1 and C2.  A character of a defined
   charset goes through the DIMENSION1 or DIMENSION2 encoder, after
   substituting charset_latin_jisx0201 for ASCII when
   CODING_FLAG_ISO_USE_ROMAN is set, or charset_jisx0208_1978 for
   charset_jisx0208 when CODING_FLAG_ISO_USE_OLDJIS is set.  For an
   undefined charset, C1 is copied to DST as is, followed by C2 when
   C2 is not negative.  Uses the caller's `coding' and `dst'.  */
#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
            && charset == CHARSET_ASCII)                                \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
            && charset == charset_jisx0208)                             \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
  } while (0)
2457
2458
/* Character C is not encoded itself.  CODING_REPLACEMENT_CHARACTER is
   encoded in its place: twice when the width of C's charset is
   greater than 1, once otherwise.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int n_repl = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1;          \
                                                                        \
    while (n_repl-- > 0)                                                \
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);              \
  } while (0)
2467
2468
2469 /* Produce designation and invocation codes at a place pointed by DST
2470    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2471    Return new DST.  */
2472
2473 unsigned char *
2474 encode_invocation_designation (charset, coding, dst)
2475      int charset;
2476      struct coding_system *coding;
2477      unsigned char *dst;
2478 {
2479   int reg;                      /* graphic register number */
2480
2481   /* At first, check designations.  */
2482   for (reg = 0; reg < 4; reg++)
2483     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2484       break;
2485
2486   if (reg >= 4)
2487     {
2488       /* CHARSET is not yet designated to any graphic registers.  */
2489       /* At first check the requested designation.  */
2490       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2491       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2492         /* Since CHARSET requests no special designation, designate it
2493            to graphic register 0.  */
2494         reg = 0;
2495
2496       ENCODE_DESIGNATION (charset, reg, coding);
2497     }
2498
2499   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2500       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2501     {
2502       /* Since the graphic register REG is not invoked to any graphic
2503          planes, invoke it to graphic plane 0.  */
2504       switch (reg)
2505         {
2506         case 0:                 /* graphic register 0 */
2507           ENCODE_SHIFT_IN;
2508           break;
2509
2510         case 1:                 /* graphic register 1 */
2511           ENCODE_SHIFT_OUT;
2512           break;
2513
2514         case 2:                 /* graphic register 2 */
2515           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2516             ENCODE_SINGLE_SHIFT_2;
2517           else
2518             ENCODE_LOCKING_SHIFT_2;
2519           break;
2520
2521         case 3:                 /* graphic register 3 */
2522           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2523             ENCODE_SINGLE_SHIFT_3;
2524           else
2525             ENCODE_LOCKING_SHIFT_3;
2526           break;
2527         }
2528     }
2529
2530   return dst;
2531 }
2532
/* Emit the 2-byte code for the encoded composition rule RULE.  RULE
   is decoded with COMPOSITION_DECODE_RULE into two values, which are
   written as the bytes 32 + 81 + <first> and 32 + <second>.  Uses the
   caller's `dst'.  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_gref, rule_nref;                                   \
                                                                \
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref);       \
    dst[0] = 32 + 81 + rule_gref;                               \
    dst[1] = 32 + rule_nref;                                    \
    dst += 2;                                                   \
  } while (0)
2542
/* Emit the sequence that starts a composition: ESC 0 for
   COMPOSITION_RELATIVE, ESC 3 for COMPOSITION_WITH_ALTCHARS, and
   ESC 4 for any other method.  DATA points to an array of integers
   describing the composition (see the comment in coding.h for its
   format); DATA[3] is stored as the current composition method.  For
   the ESC 3 and ESC 4 cases the component index is set to
   cmp_data_start + 4 and `composition_rule_follows' is cleared.  Uses
   the caller's `dst'.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2562
/* Emit ESC 1, which ends the current composition, and step CODING
   past this composition's data: cmp_data_start advances by DATA[0]
   and the composition method becomes COMPOSITION_NO.  When that
   exhausts the current composition data block and another block
   follows, switch to the next block and restart at offset 0.  Uses
   the caller's `dst'.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    coding->cmp_data_start += data[0];                          \
    coding->composing = COMPOSITION_NO;                         \
    *dst++ = ISO_CODE_ESC;                                      \
    *dst++ = '1';                                               \
    if (coding->cmp_data_start == coding->cmp_data->used)       \
      {                                                         \
        if (coding->cmp_data->next)                             \
          {                                                     \
            coding->cmp_data_start = 0;                         \
            coding->cmp_data = coding->cmp_data->next;          \
          }                                                     \
      }                                                         \
  } while (0)
2578
/* Emit ESC 0 and switch CODING to COMPOSITION_RELATIVE.  Here the
   sequence does not start a new composition: it marks that the
   components (alternate chars and composition rules) of the current
   composition have just been produced and that the actual text
   follows in SRC.  Uses the caller's `dst'.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    coding->composing = COMPOSITION_RELATIVE;           \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
  } while (0)
2590
/* The following three macros produce the codes that indicate the
   direction of text.  They use the caller's `coding' and `dst'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the control sequence
   introducer: `ESC [' when CODING_FLAG_ISO_SEVEN_BITS is set in
   coding->flags, otherwise the single byte ISO_CODE_CSI.  The flag is
   tested with `&' because coding->flags is a set of flag bits;
   comparing the whole word with `==' would select the 8-bit form
   whenever any other flag is set as well.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Emit the introducer followed by `2 ]' (start of right-to-left
   direction) or by `0 ]' (end of the current direction).  Both are
   statement macros: the introducer macro above is an object-like
   `do ... while (0)' statement, so it cannot be used as an operand of
   the comma operator nor be given an argument list.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2606
/* Bring the graphic planes and registers back to their initial state.
   Emit a shift-in unless graphic register 0 is the one invoked to
   graphic plane 0; then, for every graphic register that has an
   initial designation (a non-negative value) different from its
   current one, emit the designation of the initial charset.  Uses the
   caller's `coding' and `dst'.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg;                                                                \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    for (reg = 0; reg < 4; reg++)                                           \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)          \
          continue;                                                         \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                       \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))           \
          continue;                                                         \
        ENCODE_DESIGNATION                                                  \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }                                                                     \
  } while (0)
2621
2622 /* Produce designation sequences of charsets in the line started from
2623    SRC to a place pointed by DST, and return updated DST.
2624
2625    If the current block ends before any end-of-line, we may fail to
2626    find all the necessary designations.  */
2627
2628 static unsigned char *
2629 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2630      struct coding_system *coding;
2631      Lisp_Object translation_table;
2632      const unsigned char *src, *src_end;
2633      unsigned char *dst;
2634 {
2635   int charset, c, found = 0, reg;
2636   /* Table of charsets to be designated to each graphic register.  */
2637   int r[4];
2638
2639   for (reg = 0; reg < 4; reg++)
2640     r[reg] = -1;
2641
2642   while (found < 4)
2643     {
2644       ONE_MORE_CHAR (c);
2645       if (c == '\n')
2646         break;
2647
2648       charset = CHAR_CHARSET (c);
2649       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2650       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2651         {
2652           found++;
2653           r[reg] = charset;
2654         }
2655     }
2656
2657  label_end_of_loop:
2658   if (found)
2659     {
2660       for (reg = 0; reg < 4; reg++)
2661         if (r[reg] >= 0
2662             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2663           ENCODE_DESIGNATION (r[reg], reg, coding);
2664     }
2665
2666   return dst;
2667 }
2668
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the SRC_BYTES bytes of text at SOURCE as ISO2022 and store
   the result at DESTINATION, driven by the flags and state in CODING.
   DST_BYTES is the size of the destination area; when it is zero the
   space check at the head of the loop is made against the current
   source position instead (NOTE(review): presumably source and
   destination then share one buffer -- confirm against the callers).

   On return CODING->consumed is the number of source bytes handled
   and CODING->produced (== CODING->produced_char) the number of bytes
   stored.  CODING->consumed_char and CODING->errors are reset here and
   updated while encoding.  CODING->result is set to
   CODING_FINISH_INSUFFICIENT_DST when the loop stops for lack of
   destination space.  */

static void
encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  const unsigned char *src = source;
  const unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* Since the maximum bytes produced by each loop is 20, we subtract 19
     from DST_END to assure overflow checking is necessary only at the
     head of loop.  */
  unsigned char *adjusted_dst_end = dst_end - 19;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source text to
     analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
     there's not enough destination area to produce encoded codes
     (within macro EMIT_BYTES).  */
  const unsigned char *src_base;
  int c;
  Lisp_Object translation_table;
  Lisp_Object safe_chars;

  /* A coding system flagged as "safe" never emits unencodable
     characters as is.  */
  if (coding->flags & CODING_FLAG_ISO_SAFE)
    coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;

  safe_chars = coding_safe_chars (coding->symbol);

  /* Pick the translation table: none when translation is disabled,
     otherwise the coding system's own table, falling back to the
     standard one.  */
  if (NILP (Venable_character_translation))
    translation_table = Qnil;
  else
    {
      translation_table = coding->translation_table_for_encode;
      if (NILP (translation_table))
        translation_table = Vstandard_translation_table_for_encode;
    }

  coding->consumed_char = 0;
  coding->errors = 0;
  while (1)
    {
      src_base = src;

      /* The only destination-space check of the iteration; see the
         comment on ADJUSTED_DST_END above.  */
      if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
        {
          coding->result = CODING_FINISH_INSUFFICIENT_DST;
          break;
        }

      if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
          && CODING_SPEC_ISO_BOL (coding))
        {
          /* We have to produce designation sequences if any now.  */
          dst = encode_designation_at_bol (coding, translation_table,
                                           src, src_end, dst);
          CODING_SPEC_ISO_BOL (coding) = 0;
        }

      /* Check composition start and end.  */
      if (coding->composing != COMPOSITION_DISABLED
          && coding->cmp_data_start < coding->cmp_data->used)
        {
          struct composition_data *cmp_data = coding->cmp_data;
          int *data = cmp_data->data + coding->cmp_data_start;
          int this_pos = cmp_data->char_offset + coding->consumed_char;

          if (coding->composing == COMPOSITION_RELATIVE)
            {
              if (this_pos == data[2])
                {
                  ENCODE_COMPOSITION_END (coding, data);
                  /* Reload CMP_DATA and DATA from CODING after the
                     macro above has run.  */
                  cmp_data = coding->cmp_data;
                  data = cmp_data->data + coding->cmp_data_start;
                }
            }
          else if (COMPOSING_P (coding))
            {
              /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
              if (coding->cmp_data_index == coding->cmp_data_start + data[0])
                /* We have consumed components of the composition.
                   What follows in SRC is the composition's base
                   text.  */
                ENCODE_COMPOSITION_FAKE_START (coding);
              else
                {
                  /* This C shadows the function-level C; it holds the
                     next composition component, which is alternately
                     a character and (when rules are present) a
                     rule.  */
                  int c = cmp_data->data[coding->cmp_data_index++];
                  if (coding->composition_rule_follows)
                    {
                      ENCODE_COMPOSITION_RULE (c);
                      coding->composition_rule_follows = 0;
                    }
                  else
                    {
                      if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
                          && ! CODING_SAFE_CHAR_P (safe_chars, c))
                        ENCODE_UNSAFE_CHARACTER (c);
                      else
                        ENCODE_ISO_CHARACTER (c);
                      if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
                        coding->composition_rule_follows = 1;
                    }
                  /* No source character was read in this iteration.  */
                  continue;
                }
            }
          if (!COMPOSING_P (coding))
            {
              if (this_pos == data[1])
                {
                  ENCODE_COMPOSITION_START (coding, data);
                  continue;
                }
            }
        }

      ONE_MORE_CHAR (c);

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          if (c == '\r')
            {
              if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
                {
                  if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                    ENCODE_RESET_PLANE_AND_REGISTER;
                  *dst++ = c;
                  /* NOTE(review): this `continue' skips the
                     consumed_char increment at the bottom of the
                     loop -- confirm that this is intended.  */
                  continue;
                }
              /* fall down to treat '\r' as '\n' ...  */
              c = '\n';
            }
          if (c == '\n')
            {
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              /* Restore the initial designations at the start of the
                 next line.  */
              if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
                bcopy (coding->spec.iso2022.initial_designation,
                       coding->spec.iso2022.current_designation,
                       sizeof coding->spec.iso2022.initial_designation);
              /* Emit the end-of-line sequence selected by
                 CODING->eol_type.  */
              if (coding->eol_type == CODING_EOL_LF
                  || coding->eol_type == CODING_EOL_UNDECIDED)
                *dst++ = ISO_CODE_LF;
              else if (coding->eol_type == CODING_EOL_CRLF)
                *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
              else
                *dst++ = ISO_CODE_CR;
              CODING_SPEC_ISO_BOL (coding) = 1;
            }
          else
            {
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              *dst++ = c;
            }
        }
      else if (ASCII_BYTE_P (c))
        ENCODE_ISO_CHARACTER (c);
      else if (SINGLE_BYTE_CHAR_P (c))
        {
          /* A single-byte non-ASCII character is copied as is and
             counted as an error.  */
          *dst++ = c;
          coding->errors++;
        }
      else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
               && ! CODING_SAFE_CHAR_P (safe_chars, c))
        ENCODE_UNSAFE_CHARACTER (c);
      else
        ENCODE_ISO_CHARACTER (c);

      coding->consumed_char++;
    }

 label_end_of_loop:
  /* SRC_BASE, not SRC, marks the end of the consumed text, so an
     iteration that was abandoned part-way is not counted.  */
  coding->consumed = src_base - source;
  coding->produced = coding->produced_char = dst - destination;
}
2848
2849 \f
2850 /*** 4. SJIS and BIG5 handlers ***/
2851
2852 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2853    quite widely.  So, for the moment, Emacs supports them in the bare
2854    C code.  But, in the future, they may be supported only by CCL.  */
2855
2856 /* SJIS is a coding system encoding three character sets: ASCII, right
2857    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2858    as is.  A character of charset katakana-jisx0201 is encoded by
2859    "position-code + 0x80".  A character of charset japanese-jisx0208
2860    is encoded in 2-byte but two position-codes are divided and shifted
2861    so that it fits in the range below.
2862
2863    --- CODE RANGE of SJIS ---
2864    (character set)      (range)
2865    ASCII                0x00 .. 0x7F
2866    KATAKANA-JISX0201    0xA1 .. 0xDF
2867    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2868             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2869    -------------------------------
2870
2871 */
2872
2873 /* BIG5 is a coding system encoding two character sets: ASCII and
2874    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2875    character set and is encoded in two bytes.
2876
2877    --- CODE RANGE of BIG5 ---
2878    (character set)      (range)
2879    ASCII                0x00 .. 0x7F
2880    Big5 (1st byte)      0xA1 .. 0xFE
2881         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2882    --------------------------
2883
2884    Since the number of characters in Big5 is larger than maximum
2885    characters in Emacs' charset (96x96), it can't be handled as one
2886    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2887    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2888    contains frequently used characters and the latter contains less
2889    frequently used characters.  */
2890
2891 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2892    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2893    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2894    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2895
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte takes the values 0x40..0x7E and 0xA1..0xFE.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 byte pair B1, B2.  Store `charset_big5_1' or
   `charset_big5_2' in CHARSET and the 1st and 2nd position-codes of
   Emacs' internal format in C1 and C2.  B1 and B2 are read before C1
   and C2 are written, so the same variables may be passed for both
   (as the decoder does).  The arguments are evaluated more than once;
   do not pass expressions with side effects.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character within the whole Big5 table.  */   \
    unsigned int big5_temp_                                             \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        big5_temp_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                    \
      }                                                                 \
    (c1) = big5_temp_ / (0xFF - 0xA1) + 0x21;                           \
    (c2) = big5_temp_ % (0xFF - 0xA1) + 0x21;                           \
  } while (0)

/* Inverse of DECODE_BIG5: encode the position-codes C1, C2 of CHARSET
   (`charset_big5_1' or `charset_big5_2') into the BIG5 byte pair B1,
   B2.  C1 and C2 are read before B1 and B2 are written, so the same
   variables may be passed for both.  The arguments are evaluated more
   than once; do not pass expressions with side effects.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_temp_                                             \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_temp_ += BIG5_SAME_ROW * (0xC9 - 0xA1);                      \
    (b1) = big5_temp_ / BIG5_SAME_ROW + 0xA1;                           \
    (b2) = big5_temp_ % BIG5_SAME_ROW;                                  \
    /* Skip the gap 0x7F..0xA0 between the two 2nd-byte ranges.  */     \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2923
2924 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2925    Check if a text is encoded in SJIS.  If it is, return
2926    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2927
2928 static int
2929 detect_coding_sjis (src, src_end, multibytep)
2930      unsigned char *src, *src_end;
2931      int multibytep;
2932 {
2933   int c;
2934   /* Dummy for ONE_MORE_BYTE.  */
2935   struct coding_system dummy_coding;
2936   struct coding_system *coding = &dummy_coding;
2937
2938   while (1)
2939     {
2940       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2941       if (c < 0x80)
2942         continue;
2943       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2944         return 0;
2945       if (c <= 0x9F || c >= 0xE0)
2946         {
2947           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2948           if (c < 0x40 || c == 0x7F || c > 0xFC)
2949             return 0;
2950         }
2951     }
2952 }
2953
2954 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2955    Check if a text is encoded in BIG5.  If it is, return
2956    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2957
2958 static int
2959 detect_coding_big5 (src, src_end, multibytep)
2960      unsigned char *src, *src_end;
2961      int multibytep;
2962 {
2963   int c;
2964   /* Dummy for ONE_MORE_BYTE.  */
2965   struct coding_system dummy_coding;
2966   struct coding_system *coding = &dummy_coding;
2967
2968   while (1)
2969     {
2970       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2971       if (c < 0x80)
2972         continue;
2973       if (c < 0xA1 || c > 0xFE)
2974         return 0;
2975       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2976       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2977         return 0;
2978     }
2979 }
2980
2981 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2982    Check if a text is encoded in UTF-8.  If it is, return
2983    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2984
/* Classify the octet C by its high bits.  */
/* 0xxxxxxx: a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: a trailing octet of a multi-octet sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* 110xxxxx: leading octet of a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* 1110xxxx: leading octet of a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* 11110xxx: leading octet of a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* 111110xx: leading octet of a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
/* 1111110x: leading octet of a six-octet sequence.  */
#define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2992
2993 static int
2994 detect_coding_utf_8 (src, src_end, multibytep)
2995      unsigned char *src, *src_end;
2996      int multibytep;
2997 {
2998   unsigned char c;
2999   int seq_maybe_bytes;
3000   /* Dummy for ONE_MORE_BYTE.  */
3001   struct coding_system dummy_coding;
3002   struct coding_system *coding = &dummy_coding;
3003
3004   while (1)
3005     {
3006       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
3007       if (UTF_8_1_OCTET_P (c))
3008         continue;
3009       else if (UTF_8_2_OCTET_LEADING_P (c))
3010         seq_maybe_bytes = 1;
3011       else if (UTF_8_3_OCTET_LEADING_P (c))
3012         seq_maybe_bytes = 2;
3013       else if (UTF_8_4_OCTET_LEADING_P (c))
3014         seq_maybe_bytes = 3;
3015       else if (UTF_8_5_OCTET_LEADING_P (c))
3016         seq_maybe_bytes = 4;
3017       else if (UTF_8_6_OCTET_LEADING_P (c))
3018         seq_maybe_bytes = 5;
3019       else
3020         return 0;
3021
3022       do
3023         {
3024           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3025           if (!UTF_8_EXTRA_OCTET_P (c))
3026             return 0;
3027           seq_maybe_bytes--;
3028         }
3029       while (seq_maybe_bytes > 0);
3030     }
3031 }
3032
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking at its first two
   bytes.  If they are the byte order mark FF FE, return
   CODING_CATEGORY_MASK_UTF_16_LE; if they are FE FF, return
   CODING_CATEGORY_MASK_UTF_16_BE; else return 0.  */
3038
/* Non-zero if the 16-bit code unit VAL is one of the values that
   never represent a character (0xFFFE and 0xFFFF).  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Non-zero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   The top six bits select the surrogate half, so mask with 0xFC00;
   masking with the expected value itself would also accept e.g.
   0xDC00 and 0xF800.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Non-zero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3048
3049 static int
3050 detect_coding_utf_16 (src, src_end, multibytep)
3051      unsigned char *src, *src_end;
3052      int multibytep;
3053 {
3054   unsigned char c1, c2;
3055   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3056   struct coding_system dummy_coding;
3057   struct coding_system *coding = &dummy_coding;
3058
3059   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3060   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3061
3062   if ((c1 == 0xFF) && (c2 == 0xFE))
3063     return CODING_CATEGORY_MASK_UTF_16_LE;
3064   else if ((c1 == 0xFE) && (c2 == 0xFF))
3065     return CODING_CATEGORY_MASK_UTF_16_BE;
3066   return 0;
3067 }
3068
3069 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3070    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3071
3072 static void
3073 decode_coding_sjis_big5 (coding, source, destination,
3074                          src_bytes, dst_bytes, sjis_p)
3075      struct coding_system *coding;
3076      const unsigned char *source;
3077      unsigned char  *destination;
3078      int src_bytes, dst_bytes;
3079      int sjis_p;
3080 {
3081   const unsigned char *src = source;
3082   const unsigned char *src_end = source + src_bytes;
3083   unsigned char *dst = destination;
3084   unsigned char *dst_end = destination + dst_bytes;
3085   /* SRC_BASE remembers the start position in source in each loop.
3086      The loop will be exited when there's not enough source code
3087      (within macro ONE_MORE_BYTE), or when there's not enough
3088      destination area to produce a character (within macro
3089      EMIT_CHAR).  */
3090   const unsigned char *src_base;
3091   Lisp_Object translation_table;
3092
3093   if (NILP (Venable_character_translation))
3094     translation_table = Qnil;
3095   else
3096     {
3097       translation_table = coding->translation_table_for_decode;
3098       if (NILP (translation_table))
3099         translation_table = Vstandard_translation_table_for_decode;
3100     }
3101
3102   coding->produced_char = 0;
3103   while (1)
3104     {
3105       int c, charset, c1, c2 = 0;
3106
3107       src_base = src;
3108       ONE_MORE_BYTE (c1);
3109
3110       if (c1 < 0x80)
3111         {
3112           charset = CHARSET_ASCII;
3113           if (c1 < 0x20)
3114             {
3115               if (c1 == '\r')
3116                 {
3117                   if (coding->eol_type == CODING_EOL_CRLF)
3118                     {
3119                       ONE_MORE_BYTE (c2);
3120                       if (c2 == '\n')
3121                         c1 = c2;
3122                       else
3123                         /* To process C2 again, SRC is subtracted by 1.  */
3124                         src--;
3125                     }
3126                   else if (coding->eol_type == CODING_EOL_CR)
3127                     c1 = '\n';
3128                 }
3129               else if (c1 == '\n'
3130                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3131                        && (coding->eol_type == CODING_EOL_CR
3132                            || coding->eol_type == CODING_EOL_CRLF))
3133                 {
3134                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3135                   goto label_end_of_loop;
3136                 }
3137             }
3138         }
3139       else
3140         {
3141           if (sjis_p)
3142             {
3143               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3144                 goto label_invalid_code;
3145               if (c1 <= 0x9F || c1 >= 0xE0)
3146                 {
3147                   /* SJIS -> JISX0208 */
3148                   ONE_MORE_BYTE (c2);
3149                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3150                     goto label_invalid_code;
3151                   DECODE_SJIS (c1, c2, c1, c2);
3152                   charset = charset_jisx0208;
3153                 }
3154               else
3155                 /* SJIS -> JISX0201-Kana */
3156                 charset = charset_katakana_jisx0201;
3157             }
3158           else
3159             {
3160               /* BIG5 -> Big5 */
3161               if (c1 < 0xA0 || c1 > 0xFE)
3162                 goto label_invalid_code;
3163               ONE_MORE_BYTE (c2);
3164               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3165                 goto label_invalid_code;
3166               DECODE_BIG5 (c1, c2, charset, c1, c2);
3167             }
3168         }
3169
3170       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3171       EMIT_CHAR (c);
3172       continue;
3173
3174     label_invalid_code:
3175       coding->errors++;
3176       src = src_base;
3177       c = *src++;
3178       EMIT_CHAR (c);
3179     }
3180
3181  label_end_of_loop:
3182   coding->consumed = coding->consumed_char = src_base - source;
3183   coding->produced = dst - destination;
3184   return;
3185 }
3186
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
   This function can encode charsets `ascii', `katakana-jisx0201',
   `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
   are sure that all these charsets are registered as official charset
   (i.e. do not have extended leading-codes).  Characters of other
   charsets are produced without any encoding, or replaced by
   CODING_REPLACEMENT_CHARACTER when CODING->mode has
   CODING_MODE_INHIBIT_UNENCODABLE_CHAR set.  If SJIS_P is 1, encode
   SJIS text, else encode BIG5 text.

   On return CODING->consumed is the number of source bytes handled
   and CODING->produced (== CODING->produced_char) the number of bytes
   stored.  CODING->consumed_char is incremented per encoded character
   but is not reset here.  */

static void
encode_coding_sjis_big5 (coding, source, destination,
                         src_bytes, dst_bytes, sjis_p)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes;
     int sjis_p;
{
  unsigned char *src = source;
  unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source text to
     analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
     there's not enough destination area to produce encoded codes
     (within macro EMIT_BYTES).  */
  unsigned char *src_base;
  Lisp_Object translation_table;

  /* Pick the translation table: none when translation is disabled,
     otherwise the coding system's own table, falling back to the
     standard one.  */
  if (NILP (Venable_character_translation))
    translation_table = Qnil;
  else
    {
      translation_table = coding->translation_table_for_encode;
      if (NILP (translation_table))
        translation_table = Vstandard_translation_table_for_encode;
    }

  while (1)
    {
      int c, charset, c1, c2;

      src_base = src;
      ONE_MORE_CHAR (c);

      /* Now encode the character C.  */
      if (SINGLE_BYTE_CHAR_P (c))
        {
          switch (c)
            {
            case '\r':
              if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
                {
                  EMIT_ONE_BYTE (c);
                  break;
                }
              /* With selective display, '\r' also means end-of-line.  */
              c = '\n';
              /* Fall through.  */
            case '\n':
              if (coding->eol_type == CODING_EOL_CRLF)
                {
                  EMIT_TWO_BYTES ('\r', c);
                  break;
                }
              else if (coding->eol_type == CODING_EOL_CR)
                c = '\r';
              /* Fall through to emit the (possibly replaced)
                 end-of-line byte.  */
            default:
              EMIT_ONE_BYTE (c);
            }
        }
      else
        {
          SPLIT_CHAR (c, charset, c1, c2);
          if (sjis_p)
            {
              if (charset == charset_jisx0208
                  || charset == charset_jisx0208_1978)
                {
                  ENCODE_SJIS (c1, c2, c1, c2);
                  EMIT_TWO_BYTES (c1, c2);
                }
              else if (charset == charset_katakana_jisx0201)
                EMIT_ONE_BYTE (c1 | 0x80);
              else if (charset == charset_latin_jisx0201)
                EMIT_ONE_BYTE (c1);
              else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
                {
                  /* One replacement character, or two for a charset
                     whose width is greater than 1.  */
                  EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
                  if (CHARSET_WIDTH (charset) > 1)
                    EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
                }
              else
                /* There's no way other than producing the internal
                   codes as is.  */
                EMIT_BYTES (src_base, src);
            }
          else
            {
              if (charset == charset_big5_1 || charset == charset_big5_2)
                {
                  ENCODE_BIG5 (charset, c1, c2, c1, c2);
                  EMIT_TWO_BYTES (c1, c2);
                }
              else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
                {
                  /* One replacement character, or two for a charset
                     whose width is greater than 1.  */
                  EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
                  if (CHARSET_WIDTH (charset) > 1)
                    EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
                }
              else
                /* There's no way other than producing the internal
                   codes as is.  */
                EMIT_BYTES (src_base, src);
            }
        }
      coding->consumed_char++;
    }

 label_end_of_loop:
  /* SRC_BASE, not SRC, marks the end of the consumed text, so a
     character that could not be completely handled is not counted.  */
  coding->consumed = src_base - source;
  coding->produced = coding->produced_char = dst - destination;
}
3307
3308 \f
3309 /*** 5. CCL handlers ***/
3310
3311 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3312    Check if a text is encoded in a coding system of which
3313    encoder/decoder are written in CCL program.  If it is, return
3314    CODING_CATEGORY_MASK_CCL, else return 0.  */
3315
3316 static int
3317 detect_coding_ccl (src, src_end, multibytep)
3318      unsigned char *src, *src_end;
3319      int multibytep;
3320 {
3321   unsigned char *valid;
3322   int c;
3323   /* Dummy for ONE_MORE_BYTE.  */
3324   struct coding_system dummy_coding;
3325   struct coding_system *coding = &dummy_coding;
3326
3327   /* No coding system is assigned to coding-category-ccl.  */
3328   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3329     return 0;
3330
3331   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3332   while (1)
3333     {
3334       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3335       if (! valid[c])
3336         return 0;
3337     }
3338 }
3339
3340 \f
3341 /*** 6. End-of-line handlers ***/
3342
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Copy the SRC_BYTES bytes at SOURCE to DESTINATION (DST_BYTES is the
   size of the destination area), converting the end-of-line format
   selected by CODING->eol_type to '\n':

     CODING_EOL_CRLF -- "\r\n" becomes '\n'; a '\r' not followed by
                        '\n' is kept.
     CODING_EOL_CR   -- '\r' becomes '\n'.
     otherwise       -- bytes are copied unchanged.

   In the first two cases, a '\n' that is not part of the expected
   end-of-line sequence stops the conversion with CODING->result set
   to CODING_FINISH_INCONSISTENT_EOL when CODING->mode has
   CODING_MODE_INHIBIT_INCONSISTENT_EOL set.

   On return CODING->consumed (== CODING->consumed_char) is the number
   of source bytes handled and CODING->produced the number of bytes
   stored.  */

static void
decode_eol (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  const unsigned char *src = source;
  unsigned char *dst = destination;
  const unsigned char *src_end = src + src_bytes;
  unsigned char *dst_end = dst + dst_bytes;
  /* Not used directly below.  NOTE(review): presumably referred to
     by one of the macros used here -- confirm.  */
  Lisp_Object translation_table;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code
     (within macro ONE_MORE_BYTE), or when there's not enough
     destination area to produce a character (within macro
     EMIT_CHAR).  */
  const unsigned char *src_base;
  int c;

  translation_table = Qnil;
  switch (coding->eol_type)
    {
    case CODING_EOL_CRLF:
      while (1)
        {
          src_base = src;
          ONE_MORE_BYTE (c);
          if (c == '\r')
            {
              ONE_MORE_BYTE (c);
              if (c != '\n')
                {
                  /* A lone '\r': push the following byte back and
                     emit the '\r' itself.  */
                  src--;
                  c = '\r';
                }
            }
          else if (c == '\n'
                   && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
            {
              coding->result = CODING_FINISH_INCONSISTENT_EOL;
              goto label_end_of_loop;
            }
          EMIT_CHAR (c);
        }
      /* The loop above is left only by a jump to label_end_of_loop.  */
      break;

    case CODING_EOL_CR:
      while (1)
        {
          src_base = src;
          ONE_MORE_BYTE (c);
          if (c == '\n')
            {
              if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
                {
                  coding->result = CODING_FINISH_INCONSISTENT_EOL;
                  goto label_end_of_loop;
                }
            }
          else if (c == '\r')
            c = '\n';
          EMIT_CHAR (c);
        }
      /* The loop above is left only by a jump to label_end_of_loop.  */
      break;

    default:                    /* no need for EOL handling */
      while (1)
        {
          src_base = src;
          ONE_MORE_BYTE (c);
          EMIT_CHAR (c);
        }
    }

 label_end_of_loop:
  /* SRC_BASE, not SRC, marks the end of the consumed text, so an
     end-of-line sequence that was only partly read is not counted.  */
  coding->consumed = coding->consumed_char = src_base - source;
  coding->produced = dst - destination;
  return;
}
3425
3426 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3427    format of end-of-line according to `coding->eol_type'.  It also
3428    convert multibyte form 8-bit characters to unibyte if
3429    CODING->src_multibyte is nonzero.  If `coding->mode &
3430    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3431    also means end-of-line.  */
3432
3433 static void
3434 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3435      struct coding_system *coding;
3436      const unsigned char *source;
3437      unsigned char *destination;
3438      int src_bytes, dst_bytes;
3439 {
3440   const unsigned char *src = source;
3441   unsigned char *dst = destination;
3442   const unsigned char *src_end = src + src_bytes;
3443   unsigned char *dst_end = dst + dst_bytes;
3444   Lisp_Object translation_table;
3445   /* SRC_BASE remembers the start position in source in each loop.
3446      The loop will be exited when there's not enough source text to
3447      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3448      there's not enough destination area to produce encoded codes
3449      (within macro EMIT_BYTES).  */
3450   const unsigned char *src_base;
3451   unsigned char *tmp;
3452   int c;
3453   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3454
3455   translation_table = Qnil;
3456   if (coding->src_multibyte
3457       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3458     {
3459       src_end--;
3460       src_bytes--;
3461       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3462     }
3463
3464   if (coding->eol_type == CODING_EOL_CRLF)
3465     {
3466       while (src < src_end)
3467         {
3468           src_base = src;
3469           c = *src++;
3470           if (c >= 0x20)
3471             EMIT_ONE_BYTE (c);
3472           else if (c == '\n' || (c == '\r' && selective_display))
3473             EMIT_TWO_BYTES ('\r', '\n');
3474           else
3475             EMIT_ONE_BYTE (c);
3476         }
3477       src_base = src;
3478     label_end_of_loop:
3479       ;
3480     }
3481   else
3482     {
3483       if (!dst_bytes || src_bytes <= dst_bytes)
3484         {
3485           safe_bcopy (src, dst, src_bytes);
3486           src_base = src_end;
3487           dst += src_bytes;
3488         }
3489       else
3490         {
3491           if (coding->src_multibyte
3492               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3493             dst_bytes--;
3494           safe_bcopy (src, dst, dst_bytes);
3495           src_base = src + dst_bytes;
3496           dst = destination + dst_bytes;
3497           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3498         }
3499       if (coding->eol_type == CODING_EOL_CR)
3500         {
3501           for (tmp = destination; tmp < dst; tmp++)
3502             if (*tmp == '\n') *tmp = '\r';
3503         }
3504       else if (selective_display)
3505         {
3506           for (tmp = destination; tmp < dst; tmp++)
3507             if (*tmp == '\r') *tmp = '\n';
3508         }
3509     }
3510   if (coding->src_multibyte)
3511     dst = destination + str_as_unibyte (destination, dst - destination);
3512
3513   coding->consumed = src_base - source;
3514   coding->produced = dst - destination;
3515   coding->produced_char = coding->produced;
3516 }
3517
3518 \f
3519 /*** 7. C library functions ***/
3520
3521 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3522    has a property `coding-system'.  The value of this property is a
3523    vector of length 5 (called the coding-vector).  Among elements of
3524    this vector, the first (element[0]) and the fifth (element[4])
3525    carry important information for decoding/encoding.  Before
3526    decoding/encoding, this information should be set in fields of a
3527    structure of type `coding_system'.
3528
3529    The value of the property `coding-system' can be a symbol of another
3530    subsidiary coding-system.  In that case, Emacs gets coding-vector
3531    from that symbol.
3532
3533    `element[0]' contains information to be set in `coding->type'.  The
3534    value and its meaning is as follows:
3535
3536    0 -- coding_type_emacs_mule
3537    1 -- coding_type_sjis
3538    2 -- coding_type_iso2022
3539    3 -- coding_type_big5
3540    4 -- coding_type_ccl encoder/decoder written in CCL
3541    nil -- coding_type_no_conversion
3542    t -- coding_type_undecided (automatic conversion on decoding,
3543                                no-conversion on encoding)
3544
3545    `element[4]' contains information to be set in `coding->flags' and
3546    `coding->spec'.  The meaning varies by `coding->type'.
3547
3548    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3549    of length 32 (of which the first 13 sub-elements are used now).
3550    Meanings of these sub-elements are:
3551
3552    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3553         If the value is an integer of valid charset, the charset is
3554         assumed to be designated to graphic register N initially.
3555
3556         If the value is minus, it is a minus value of charset which
3557         reserves graphic register N, which means that the charset is
3558         not designated initially but should be designated to graphic
3559         register N just before encoding a character in that charset.
3560
3561         If the value is nil, graphic register N is never used on
3562         encoding.
3563
3564    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3565         Each value takes t or nil.  See the section ISO2022 of
3566         `coding.h' for more information.
3567
3568    If `coding->type' is `coding_type_big5', element[4] is t to denote
3569    BIG5-ETen or nil to denote BIG5-HKU.  If it is `coding_type_ccl',
3570    element[4] is a cons (DECODER . ENCODER) of CCL programs.
3571    If `coding->type' takes the other value, element[4] is ignored.
3572
3573    Emacs Lisp's coding systems also carry information about format of
3574    end-of-line in a value of property `eol-type'.  If the value is
3575    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3576    means CODING_EOL_CR.  If it is not integer, it should be a vector
3577    of subsidiary coding systems of which property `eol-type' has one
3578    of the above values.
3579
3580 */
3581
3582 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3583    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3584    is setup so that no conversion is necessary and return -1, else
3585    return 0.  */
3586
3587 int
3588 setup_coding_system (coding_system, coding)
3589      Lisp_Object coding_system;
3590      struct coding_system *coding;
3591 {
3592   Lisp_Object coding_spec, coding_type, eol_type, plist;
3593   Lisp_Object val;
3594
3595   /* At first, zero clear all members.  */
3596   bzero (coding, sizeof (struct coding_system));
3597
3598   /* Initialize some fields required for all kinds of coding systems.  */
3599   coding->symbol = coding_system;
3600   coding->heading_ascii = -1;
3601   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3602   coding->composing = COMPOSITION_DISABLED;
3603   coding->cmp_data = NULL;
3604
3605   if (NILP (coding_system))
3606     goto label_invalid_coding_system;
3607
3608   coding_spec = Fget (coding_system, Qcoding_system);
3609
3610   if (!VECTORP (coding_spec)
3611       || XVECTOR (coding_spec)->size != 5
3612       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3613     goto label_invalid_coding_system;
3614
3615   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3616   if (VECTORP (eol_type))
3617     {
3618       coding->eol_type = CODING_EOL_UNDECIDED;
3619       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3620       if (system_eol_type != CODING_EOL_LF)
3621         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3622     }
3623   else if (XFASTINT (eol_type) == 1)
3624     {
3625       coding->eol_type = CODING_EOL_CRLF;
3626       coding->common_flags
3627         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3628     }
3629   else if (XFASTINT (eol_type) == 2)
3630     {
3631       coding->eol_type = CODING_EOL_CR;
3632       coding->common_flags
3633         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3634     }
3635   else
3636     {
3637       coding->common_flags = 0;
3638       coding->eol_type = CODING_EOL_LF;
3639     }
3640
3641   coding_type = XVECTOR (coding_spec)->contents[0];
3642   /* Try short cut.  */
3643   if (SYMBOLP (coding_type))
3644     {
3645       if (EQ (coding_type, Qt))
3646         {
3647           coding->type = coding_type_undecided;
3648           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3649         }
3650       else
3651         coding->type = coding_type_no_conversion;
3652       /* Initialize this member.  Any thing other than
3653          CODING_CATEGORY_IDX_UTF_16_BE and
3654          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3655          special treatment in detect_eol.  */
3656       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3657
3658       return 0;
3659     }
3660
3661   /* Get values of coding system properties:
3662      `post-read-conversion', `pre-write-conversion',
3663      `translation-table-for-decode', `translation-table-for-encode'.  */
3664   plist = XVECTOR (coding_spec)->contents[3];
3665   /* Pre & post conversion functions should be disabled if
3666      inhibit_eol_conversion is nonzero.  This is the case that a code
3667      conversion function is called while those functions are running.  */
3668   if (! inhibit_pre_post_conversion)
3669     {
3670       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3671       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3672     }
3673   val = Fplist_get (plist, Qtranslation_table_for_decode);
3674   if (SYMBOLP (val))
3675     val = Fget (val, Qtranslation_table_for_decode);
3676   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3677   val = Fplist_get (plist, Qtranslation_table_for_encode);
3678   if (SYMBOLP (val))
3679     val = Fget (val, Qtranslation_table_for_encode);
3680   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3681   val = Fplist_get (plist, Qcoding_category);
3682   if (!NILP (val))
3683     {
3684       val = Fget (val, Qcoding_category_index);
3685       if (INTEGERP (val))
3686         coding->category_idx = XINT (val);
3687       else
3688         goto label_invalid_coding_system;
3689     }
3690   else
3691     goto label_invalid_coding_system;
3692
3693   /* If the coding system has non-nil `composition' property, enable
3694      composition handling.  */
3695   val = Fplist_get (plist, Qcomposition);
3696   if (!NILP (val))
3697     coding->composing = COMPOSITION_NO;
3698
3699   /* If the coding system is ascii-incompatible, record it in
3700      common_flags.   */
3701   val = Fplist_get (plist, Qascii_incompatible);
3702   if (! NILP (val))
3703     coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3704
3705   switch (XFASTINT (coding_type))
3706     {
3707     case 0:
3708       coding->type = coding_type_emacs_mule;
3709       coding->common_flags
3710         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3711       if (!NILP (coding->post_read_conversion))
3712         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3713       if (!NILP (coding->pre_write_conversion))
3714         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3715       break;
3716
3717     case 1:
3718       coding->type = coding_type_sjis;
3719       coding->common_flags
3720         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3721       break;
3722
3723     case 2:
3724       coding->type = coding_type_iso2022;
3725       coding->common_flags
3726         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3727       {
3728         Lisp_Object val, temp;
3729         Lisp_Object *flags;
3730         int i, charset, reg_bits = 0;
3731
3732         val = XVECTOR (coding_spec)->contents[4];
3733
3734         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3735           goto label_invalid_coding_system;
3736
3737         flags = XVECTOR (val)->contents;
3738         coding->flags
3739           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3740              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3741              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3742              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3743              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3744              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3745              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3746              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3747              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3748              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3749              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3750              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3751              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3752              );
3753
3754         /* Invoke graphic register 0 to plane 0.  */
3755         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3756         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3757         CODING_SPEC_ISO_INVOCATION (coding, 1)
3758           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3759         /* Not single shifting at first.  */
3760         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3761         /* Beginning of buffer should also be regarded as bol. */
3762         CODING_SPEC_ISO_BOL (coding) = 1;
3763
3764         for (charset = 0; charset <= MAX_CHARSET; charset++)
3765           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3766         val = Vcharset_revision_alist;
3767         while (CONSP (val))
3768           {
3769             charset = get_charset_id (Fcar_safe (XCAR (val)));
3770             if (charset >= 0
3771                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3772                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3773               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3774             val = XCDR (val);
3775           }
3776
3777         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3778            FLAGS[REG] can be one of below:
3779                 integer CHARSET: CHARSET occupies register I,
3780                 t: designate nothing to REG initially, but can be used
3781                   by any charsets,
3782                 list of integer, nil, or t: designate the first
3783                   element (if integer) to REG initially, the remaining
3784                   elements (if integer) is designated to REG on request,
3785                   if an element is t, REG can be used by any charsets,
3786                 nil: REG is never used.  */
3787         for (charset = 0; charset <= MAX_CHARSET; charset++)
3788           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3789             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3790         for (i = 0; i < 4; i++)
3791           {
3792             if ((INTEGERP (flags[i])
3793                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3794                 || (charset = get_charset_id (flags[i])) >= 0)
3795               {
3796                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3797                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3798               }
3799             else if (EQ (flags[i], Qt))
3800               {
3801                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3802                 reg_bits |= 1 << i;
3803                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3804               }
3805             else if (CONSP (flags[i]))
3806               {
3807                 Lisp_Object tail;
3808                 tail = flags[i];
3809
3810                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3811                 if ((INTEGERP (XCAR (tail))
3812                      && (charset = XINT (XCAR (tail)),
3813                          CHARSET_VALID_P (charset)))
3814                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3815                   {
3816                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3817                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3818                   }
3819                 else
3820                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3821                 tail = XCDR (tail);
3822                 while (CONSP (tail))
3823                   {
3824                     if ((INTEGERP (XCAR (tail))
3825                          && (charset = XINT (XCAR (tail)),
3826                              CHARSET_VALID_P (charset)))
3827                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3828                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3829                         = i;
3830                     else if (EQ (XCAR (tail), Qt))
3831                       reg_bits |= 1 << i;
3832                     tail = XCDR (tail);
3833                   }
3834               }
3835             else
3836               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3837
3838             CODING_SPEC_ISO_DESIGNATION (coding, i)
3839               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3840           }
3841
3842         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3843           {
3844             /* REG 1 can be used only by locking shift in 7-bit env.  */
3845             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3846               reg_bits &= ~2;
3847             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3848               /* Without any shifting, only REG 0 and 1 can be used.  */
3849               reg_bits &= 3;
3850           }
3851
3852         if (reg_bits)
3853           for (charset = 0; charset <= MAX_CHARSET; charset++)
3854             {
3855               if (CHARSET_DEFINED_P (charset)
3856                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3857                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3858                 {
3859                   /* There exist some default graphic registers to be
3860                      used by CHARSET.  */
3861
3862                   /* We had better avoid designating a charset of
3863                      CHARS96 to REG 0 as far as possible.  */
3864                   if (CHARSET_CHARS (charset) == 96)
3865                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3866                       = (reg_bits & 2
3867                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3868                   else
3869                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3870                       = (reg_bits & 1
3871                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3872                 }
3873             }
3874       }
3875       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3876       coding->spec.iso2022.last_invalid_designation_register = -1;
3877       break;
3878
3879     case 3:
3880       coding->type = coding_type_big5;
3881       coding->common_flags
3882         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3883       coding->flags
3884         = (NILP (XVECTOR (coding_spec)->contents[4])
3885            ? CODING_FLAG_BIG5_HKU
3886            : CODING_FLAG_BIG5_ETEN);
3887       break;
3888
3889     case 4:
3890       coding->type = coding_type_ccl;
3891       coding->common_flags
3892         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3893       {
3894         val = XVECTOR (coding_spec)->contents[4];
3895         if (! CONSP (val)
3896             || setup_ccl_program (&(coding->spec.ccl.decoder),
3897                                   XCAR (val)) < 0
3898             || setup_ccl_program (&(coding->spec.ccl.encoder),
3899                                   XCDR (val)) < 0)
3900           goto label_invalid_coding_system;
3901
3902         bzero (coding->spec.ccl.valid_codes, 256);
3903         val = Fplist_get (plist, Qvalid_codes);
3904         if (CONSP (val))
3905           {
3906             Lisp_Object this;
3907
3908             for (; CONSP (val); val = XCDR (val))
3909               {
3910                 this = XCAR (val);
3911                 if (INTEGERP (this)
3912                     && XINT (this) >= 0 && XINT (this) < 256)
3913                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3914                 else if (CONSP (this)
3915                          && INTEGERP (XCAR (this))
3916                          && INTEGERP (XCDR (this)))
3917                   {
3918                     int start = XINT (XCAR (this));
3919                     int end = XINT (XCDR (this));
3920
3921                     if (start >= 0 && start <= end && end < 256)
3922                       while (start <= end)
3923                         coding->spec.ccl.valid_codes[start++] = 1;
3924                   }
3925               }
3926           }
3927       }
3928       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3929       coding->spec.ccl.cr_carryover = 0;
3930       coding->spec.ccl.eight_bit_carryover[0] = 0;
3931       break;
3932
3933     case 5:
3934       coding->type = coding_type_raw_text;
3935       break;
3936
3937     default:
3938       goto label_invalid_coding_system;
3939     }
3940   return 0;
3941
3942  label_invalid_coding_system:
3943   coding->type = coding_type_no_conversion;
3944   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3945   coding->common_flags = 0;
3946   coding->eol_type = CODING_EOL_UNDECIDED;
3947   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3948   return NILP (coding_system) ? 0 : -1;
3949 }
3950
3951 /* Free memory blocks allocated for storing composition information.  */
3952
3953 void
3954 coding_free_composition_data (coding)
3955      struct coding_system *coding;
3956 {
3957   struct composition_data *cmp_data = coding->cmp_data, *next;
3958
3959   if (!cmp_data)
3960     return;
3961   /* Memory blocks are chained.  At first, rewind to the first, then,
3962      free blocks one by one.  */
3963   while (cmp_data->prev)
3964     cmp_data = cmp_data->prev;
3965   while (cmp_data)
3966     {
3967       next = cmp_data->next;
3968       xfree (cmp_data);
3969       cmp_data = next;
3970     }
3971   coding->cmp_data = NULL;
3972 }
3973
3974 /* Set `char_offset' member of all memory blocks pointed by
3975    coding->cmp_data to POS.  */
3976
3977 void
3978 coding_adjust_composition_offset (coding, pos)
3979      struct coding_system *coding;
3980      int pos;
3981 {
3982   struct composition_data *cmp_data;
3983
3984   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3985     cmp_data->char_offset = pos;
3986 }
3987
3988 /* Setup raw-text or one of its subsidiaries in the structure
3989    coding_system CODING according to the already setup value eol_type
3990    in CODING.  CODING should be setup for some coding system in
3991    advance.  */
3992
3993 void
3994 setup_raw_text_coding_system (coding)
3995      struct coding_system *coding;
3996 {
3997   if (coding->type != coding_type_raw_text)
3998     {
3999       coding->symbol = Qraw_text;
4000       coding->type = coding_type_raw_text;
4001       if (coding->eol_type != CODING_EOL_UNDECIDED)
4002         {
4003           Lisp_Object subsidiaries;
4004           subsidiaries = Fget (Qraw_text, Qeol_type);
4005
4006           if (VECTORP (subsidiaries)
4007               && XVECTOR (subsidiaries)->size == 3)
4008             coding->symbol
4009               = XVECTOR (subsidiaries)->contents[coding->eol_type];
4010         }
4011       setup_coding_system (coding->symbol, coding);
4012     }
4013   return;
4014 }
4015
4016 /* Emacs has a mechanism to automatically detect a coding system if it
4017    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4018    it's impossible to distinguish some coding systems accurately
4019    because they use the same range of codes.  So, at first, coding
4020    systems are grouped into the following categories:
4021
4022    o coding-category-emacs-mule
4023
4024         The category for a coding system which has the same code range
4025         as Emacs' internal format.  Assigned the coding-system (Lisp
4026         symbol) `emacs-mule' by default.
4027
4028    o coding-category-sjis
4029
4030         The category for a coding system which has the same code range
4031         as SJIS.  Assigned the coding-system (Lisp
4032         symbol) `japanese-shift-jis' by default.
4033
4034    o coding-category-iso-7
4035
4036         The category for a coding system which has the same code range
4037         as ISO2022 of 7-bit environment.  This doesn't use any locking
4038         shift and single shift functions.  This can encode/decode all
4039         charsets.  Assigned the coding-system (Lisp symbol)
4040         `iso-2022-7bit' by default.
4041
4042    o coding-category-iso-7-tight
4043
4044         Same as coding-category-iso-7 except that this can
4045         encode/decode only the specified charsets.
4046
4047    o coding-category-iso-8-1
4048
4049         The category for a coding system which has the same code range
4050         as ISO2022 of 8-bit environment and graphic plane 1 used only
4051         for DIMENSION1 charset.  This doesn't use any locking shift
4052         and single shift functions.  Assigned the coding-system (Lisp
4053         symbol) `iso-latin-1' by default.
4054
4055    o coding-category-iso-8-2
4056
4057         The category for a coding system which has the same code range
4058         as ISO2022 of 8-bit environment and graphic plane 1 used only
4059         for DIMENSION2 charset.  This doesn't use any locking shift
4060         and single shift functions.  Assigned the coding-system (Lisp
4061         symbol) `japanese-iso-8bit' by default.
4062
4063    o coding-category-iso-7-else
4064
4065         The category for a coding system which has the same code range
4066         as ISO2022 of 7-bit environment but uses locking shift or
4067         single shift functions.  Assigned the coding-system (Lisp
4068         symbol) `iso-2022-7bit-lock' by default.
4069
4070    o coding-category-iso-8-else
4071
4072         The category for a coding system which has the same code range
4073         as ISO2022 of 8-bit environment but uses locking shift or
4074         single shift functions.  Assigned the coding-system (Lisp
4075         symbol) `iso-2022-8bit-ss2' by default.
4076
4077    o coding-category-big5
4078
4079         The category for a coding system which has the same code range
4080         as BIG5.  Assigned the coding-system (Lisp symbol)
4081         `cn-big5' by default.
4082
4083    o coding-category-utf-8
4084
4085         The category for a coding system which has the same code range
4086         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4087         symbol) `utf-8' by default.
4088
4089    o coding-category-utf-16-be
4090
4091         The category for a coding system in which a text has an
4092         Unicode signature (cf. Unicode Standard) in the order of BIG
4093         endian at the head.  Assigned the coding-system (Lisp symbol)
4094         `utf-16-be' by default.
4095
4096    o coding-category-utf-16-le
4097
4098         The category for a coding system in which a text has an
4099         Unicode signature (cf. Unicode Standard) in the order of
4100         LITTLE endian at the head.  Assigned the coding-system (Lisp
4101         symbol) `utf-16-le' by default.
4102
4103    o coding-category-ccl
4104
4105         The category for a coding system of which encoder/decoder is
4106         written in CCL programs.  The default value is nil, i.e., no
4107         coding system is assigned.
4108
4109    o coding-category-binary
4110
4111         The category for a coding system not categorized in any of the
4112         above.  Assigned the coding-system (Lisp symbol)
4113         `no-conversion' by default.
4114
4115    Each of them is a Lisp symbol and the value is an actual
4116    `coding-system' (this is also a Lisp symbol) assigned by a user.
4117    What Emacs does actually is to detect a category of coding system.
4118    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4119    decide a single possible category, it selects a category of the
4120    highest priority.  Priorities of categories are also specified by a
4121    user in a Lisp variable `coding-category-list'.
4122
4123 */
4124
/* Table indexed by byte value.  detect_coding_mask skips leading
   bytes whose entry is nonzero, and toggles the entries for
   ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC while it runs.
   NOTE(review): the code that fills in the remaining entries is not
   in this part of the file -- confirm where it is initialized.  */
static int ascii_skip_code[256];
4127
4128 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4129    If it detects possible coding systems, return an integer in which
4130    appropriate flag bits are set.  Flag bits are defined by macros
4131    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4132    it should point the table `coding_priorities'.  In that case, only
4133    the flag bit for a coding system of the highest priority is set in
4134    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4135    range 0x80..0x9F are in multibyte form.
4136
4137    How many ASCII characters are at the head is returned as *SKIP.  */
4138
4139 static int
4140 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4141      unsigned char *source;
4142      int src_bytes, *priorities, *skip;
4143      int multibytep;
4144 {
4145   register unsigned char c;
4146   unsigned char *src = source, *src_end = source + src_bytes;
4147   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4148   int i;
4149   int null_byte_found;
4150   int latin_extra_code_state = 1;
4151
4152   /* At first, skip all ASCII characters and control characters except
4153      for three ISO2022 specific control characters.  */
4154   ascii_skip_code[ISO_CODE_SO] = 0;
4155   ascii_skip_code[ISO_CODE_SI] = 0;
4156   ascii_skip_code[ISO_CODE_ESC] = 0;
4157
4158  label_loop_detect_coding:
4159   null_byte_found = 0;
4160   /* We stop this loop before the last byte because it may be a NULL
4161      anchor byte.  */
4162   while (src < src_end - 1 && ascii_skip_code[*src])
4163     null_byte_found |= (! *src++);
4164   if (ascii_skip_code[*src])
4165     src++;
4166   else if (! null_byte_found)
4167     {
4168       unsigned char *p = src + 1;
4169       while (p < src_end - 1)
4170         null_byte_found |= (! *p++);
4171     }
4172   *skip = src - source;
4173
4174   if (src >= src_end)
4175     /* We found nothing other than ASCII (and NULL byte).  There's
4176        nothing to do.  */
4177     return 0;
4178
4179   c = *src;
4180   /* The text seems to be encoded in some multilingual coding system.
4181      Now, try to find in which coding system the text is encoded.  */
4182   if (! null_byte_found && c < 0x80)
4183     {
4184       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4185       /* C is an ISO2022 specific control code of C0.  */
4186       latin_extra_code_state = 1;
4187       mask = detect_coding_iso2022 (src, src_end, multibytep,
4188                                     &latin_extra_code_state);
4189       if (mask == 0)
4190         {
4191           /* No valid ISO2022 code follows C.  Try again.  */
4192           src++;
4193           if (c == ISO_CODE_ESC)
4194             ascii_skip_code[ISO_CODE_ESC] = 1;
4195           else
4196             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4197           goto label_loop_detect_coding;
4198         }
4199       if (priorities)
4200         {
4201           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4202             {
4203               if (mask & priorities[i])
4204                 return priorities[i];
4205             }
4206           return CODING_CATEGORY_MASK_RAW_TEXT;
4207         }
4208     }
4209   else
4210     {
4211       int try;
4212
4213       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4214         c = src[1] - 0x20;
4215
4216       if (null_byte_found)
4217         {
4218           try = (CODING_CATEGORY_MASK_UTF_16_BE
4219                  | CODING_CATEGORY_MASK_UTF_16_LE);
4220         }
4221       else if (c < 0xA0)
4222         {
4223           /* C is the first byte of SJIS character code,
4224              or a leading-code of Emacs' internal format (emacs-mule),
4225              or the first byte of UTF-16.  */
4226           try = (CODING_CATEGORY_MASK_SJIS
4227                  | CODING_CATEGORY_MASK_EMACS_MULE
4228                  | CODING_CATEGORY_MASK_UTF_16_BE
4229                  | CODING_CATEGORY_MASK_UTF_16_LE);
4230
4231           /* Or, if C is a special latin extra code,
4232              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4233              or is an ISO2022 control-sequence-introducer (CSI),
4234              we should also consider the possibility of ISO2022 codings.  */
4235           if ((latin_extra_code_state
4236                && VECTORP (Vlatin_extra_code_table)
4237                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4238               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4239               || (c == ISO_CODE_CSI
4240                   && (src < src_end
4241                       && (*src == ']'
4242                           || ((*src == '0' || *src == '1' || *src == '2')
4243                               && src + 1 < src_end
4244                               && src[1] == ']')))))
4245             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4246                     | CODING_CATEGORY_MASK_ISO_8BIT);
4247         }
4248       else
4249         /* C is a character of ISO2022 in graphic plane right,
4250            or a SJIS's 1-byte character code (i.e. JISX0201),
4251            or the first byte of BIG5's 2-byte code,
4252            or the first byte of UTF-8/16.  */
4253         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4254                | CODING_CATEGORY_MASK_ISO_8BIT
4255                | CODING_CATEGORY_MASK_SJIS
4256                | CODING_CATEGORY_MASK_BIG5
4257                | CODING_CATEGORY_MASK_UTF_8
4258                | CODING_CATEGORY_MASK_UTF_16_BE
4259                | CODING_CATEGORY_MASK_UTF_16_LE);
4260
4261       /* Or, we may have to consider the possibility of CCL.  */
4262       if (! null_byte_found
4263           && coding_system_table[CODING_CATEGORY_IDX_CCL]
4264           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4265               ->spec.ccl.valid_codes)[c])
4266         try |= CODING_CATEGORY_MASK_CCL;
4267
4268       mask = 0;
4269       if (priorities)
4270         {
4271           /* At first try detection with Latin extra codes not-allowed.
4272              If no proper coding system is found because of Latin extra
4273              codes, try detection with Latin extra codes allowed.  */
4274           latin_extra_code_state = 0;
4275         label_retry:
4276           utf16_examined_p = iso2022_examined_p = 0;
4277           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4278             {
4279               if (!iso2022_examined_p
4280                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4281                 {
4282                   mask |= detect_coding_iso2022 (src, src_end, multibytep,
4283                                                  &latin_extra_code_state);
4284                   iso2022_examined_p = 1;
4285                 }
4286               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4287                 mask |= detect_coding_sjis (src, src_end, multibytep);
4288               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4289                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4290               else if (!utf16_examined_p
4291                        && (priorities[i] & try &
4292                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4293                 {
4294                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4295                   utf16_examined_p = 1;
4296                 }
4297               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4298                 mask |= detect_coding_big5 (src, src_end, multibytep);
4299               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4300                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4301               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4302                 mask |= detect_coding_ccl (src, src_end, multibytep);
4303               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4304                 {
4305                   if (latin_extra_code_state == 1)
4306                     {
4307                       /* Detection of ISO-2022 based coding system
4308                          failed because of Latin extra codes.  Before
4309                          falling back to raw-text, try again with
4310                          Latin extra codes allowed.  */
4311                       latin_extra_code_state = 2;
4312                       try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
4313                              | CODING_CATEGORY_MASK_ISO_8BIT);
4314                       goto label_retry;
4315                     }
4316                   mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4317                 }
4318               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4319                 {
4320                   if (latin_extra_code_state == 1)
4321                     {
4322                       /* See the above comment.  */
4323                       latin_extra_code_state = 2;
4324                       try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
4325                              | CODING_CATEGORY_MASK_ISO_8BIT);
4326                       goto label_retry;
4327                     }
4328                   mask |= CODING_CATEGORY_MASK_BINARY;
4329                 }
4330               if (mask & priorities[i])
4331                 return priorities[i];
4332             }
4333           return CODING_CATEGORY_MASK_RAW_TEXT;
4334         }
4335       if (try & CODING_CATEGORY_MASK_ISO)
4336         mask |= detect_coding_iso2022 (src, src_end, multibytep,
4337                                        &latin_extra_code_state);
4338       if (try & CODING_CATEGORY_MASK_SJIS)
4339         mask |= detect_coding_sjis (src, src_end, multibytep);
4340       if (try & CODING_CATEGORY_MASK_BIG5)
4341         mask |= detect_coding_big5 (src, src_end, multibytep);
4342       if (try & CODING_CATEGORY_MASK_UTF_8)
4343         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4344       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4345         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4346       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4347         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4348       if (try & CODING_CATEGORY_MASK_CCL)
4349         mask |= detect_coding_ccl (src, src_end, multibytep);
4350     }
4351   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4352 }
4353
4354 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4355    The information of the detected coding system is set in CODING.  */
4356
4357 void
4358 detect_coding (coding, src, src_bytes)
4359      struct coding_system *coding;
4360      const unsigned char *src;
4361      int src_bytes;
4362 {
4363   unsigned int idx;
4364   int skip, mask;
4365   Lisp_Object val;
4366
4367   val = Vcoding_category_list;
4368   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4369                              coding->src_multibyte);
4370   coding->heading_ascii = skip;
4371
4372   if (!mask) return;
4373
4374   /* We found a single coding system of the highest priority in MASK.  */
4375   idx = 0;
4376   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4377   if (! mask)
4378     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4379
4380   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4381
4382   if (coding->eol_type != CODING_EOL_UNDECIDED)
4383     {
4384       Lisp_Object tmp;
4385
4386       tmp = Fget (val, Qeol_type);
4387       if (VECTORP (tmp))
4388         val = XVECTOR (tmp)->contents[coding->eol_type];
4389     }
4390
4391   /* Setup this new coding system while preserving some slots.  */
4392   {
4393     int src_multibyte = coding->src_multibyte;
4394     int dst_multibyte = coding->dst_multibyte;
4395
4396     setup_coding_system (val, coding);
4397     coding->src_multibyte = src_multibyte;
4398     coding->dst_multibyte = dst_multibyte;
4399     coding->heading_ascii = skip;
4400   }
4401 }
4402
4403 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4404    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4405    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4406
4407    How many non-eol characters are at the head is returned as *SKIP.  */
4408
4409 #define MAX_EOL_CHECK_COUNT 3
4410
4411 static int
4412 detect_eol_type (source, src_bytes, skip)
4413      unsigned char *source;
4414      int src_bytes, *skip;
4415 {
4416   unsigned char *src = source, *src_end = src + src_bytes;
4417   unsigned char c;
4418   int total = 0;                /* How many end-of-lines are found so far.  */
4419   int eol_type = CODING_EOL_UNDECIDED;
4420   int this_eol_type;
4421
4422   *skip = 0;
4423
4424   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4425     {
4426       c = *src++;
4427       if (c == '\n' || c == '\r')
4428         {
4429           if (*skip == 0)
4430             *skip = src - 1 - source;
4431           total++;
4432           if (c == '\n')
4433             this_eol_type = CODING_EOL_LF;
4434           else if (src >= src_end || *src != '\n')
4435             this_eol_type = CODING_EOL_CR;
4436           else
4437             this_eol_type = CODING_EOL_CRLF, src++;
4438
4439           if (eol_type == CODING_EOL_UNDECIDED)
4440             /* This is the first end-of-line.  */
4441             eol_type = this_eol_type;
4442           else if (eol_type != this_eol_type)
4443             {
4444               /* The found type is different from what found before.  */
4445               eol_type = CODING_EOL_INCONSISTENT;
4446               break;
4447             }
4448         }
4449     }
4450
4451   if (*skip == 0)
4452     *skip = src_end - source;
4453   return eol_type;
4454 }
4455
4456 /* Like detect_eol_type, but detect EOL type in 2-octet
4457    big-endian/little-endian format for coding systems utf-16-be and
4458    utf-16-le.  */
4459
4460 static int
4461 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4462      unsigned char *source;
4463      int src_bytes, *skip, big_endian_p;
4464 {
4465   unsigned char *src = source, *src_end = src + src_bytes;
4466   unsigned int c1, c2;
4467   int total = 0;                /* How many end-of-lines are found so far.  */
4468   int eol_type = CODING_EOL_UNDECIDED;
4469   int this_eol_type;
4470   int msb, lsb;
4471
4472   if (big_endian_p)
4473     msb = 0, lsb = 1;
4474   else
4475     msb = 1, lsb = 0;
4476
4477   *skip = 0;
4478
4479   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4480     {
4481       c1 = (src[msb] << 8) | (src[lsb]);
4482       src += 2;
4483
4484       if (c1 == '\n' || c1 == '\r')
4485         {
4486           if (*skip == 0)
4487             *skip = src - 2 - source;
4488           total++;
4489           if (c1 == '\n')
4490             {
4491               this_eol_type = CODING_EOL_LF;
4492             }
4493           else
4494             {
4495               if ((src + 1) >= src_end)
4496                 {
4497                   this_eol_type = CODING_EOL_CR;
4498                 }
4499               else
4500                 {
4501                   c2 = (src[msb] << 8) | (src[lsb]);
4502                   if (c2 == '\n')
4503                     this_eol_type = CODING_EOL_CRLF, src += 2;
4504                   else
4505                     this_eol_type = CODING_EOL_CR;
4506                 }
4507             }
4508
4509           if (eol_type == CODING_EOL_UNDECIDED)
4510             /* This is the first end-of-line.  */
4511             eol_type = this_eol_type;
4512           else if (eol_type != this_eol_type)
4513             {
4514               /* The found type is different from what found before.  */
4515               eol_type = CODING_EOL_INCONSISTENT;
4516               break;
4517             }
4518         }
4519     }
4520
4521   if (*skip == 0)
4522     *skip = src_end - source;
4523   return eol_type;
4524 }
4525
4526 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4527    is encoded.  If it detects an appropriate format of end-of-line, it
4528    sets the information in *CODING.  */
4529
4530 void
4531 detect_eol (coding, src, src_bytes)
4532      struct coding_system *coding;
4533      const unsigned char *src;
4534      int src_bytes;
4535 {
4536   Lisp_Object val;
4537   int skip;
4538   int eol_type;
4539
4540   switch (coding->category_idx)
4541     {
4542     case CODING_CATEGORY_IDX_UTF_16_BE:
4543       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4544       break;
4545     case CODING_CATEGORY_IDX_UTF_16_LE:
4546       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4547       break;
4548     default:
4549       eol_type = detect_eol_type (src, src_bytes, &skip);
4550       break;
4551     }
4552
4553   if (coding->heading_ascii > skip)
4554     coding->heading_ascii = skip;
4555   else
4556     skip = coding->heading_ascii;
4557
4558   if (eol_type == CODING_EOL_UNDECIDED)
4559     return;
4560   if (eol_type == CODING_EOL_INCONSISTENT)
4561     {
4562 #if 0
4563       /* This code is suppressed until we find a better way to
4564          distinguish raw text file and binary file.  */
4565
4566       /* If we have already detected that the coding is raw-text, the
4567          coding should actually be no-conversion.  */
4568       if (coding->type == coding_type_raw_text)
4569         {
4570           setup_coding_system (Qno_conversion, coding);
4571           return;
4572         }
4573       /* Else, let's decode only text code anyway.  */
4574 #endif /* 0 */
4575       eol_type = CODING_EOL_LF;
4576     }
4577
4578   val = Fget (coding->symbol, Qeol_type);
4579   if (VECTORP (val) && XVECTOR (val)->size == 3)
4580     {
4581       int src_multibyte = coding->src_multibyte;
4582       int dst_multibyte = coding->dst_multibyte;
4583       struct composition_data *cmp_data = coding->cmp_data;
4584
4585       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4586       coding->src_multibyte = src_multibyte;
4587       coding->dst_multibyte = dst_multibyte;
4588       coding->heading_ascii = skip;
4589       coding->cmp_data = cmp_data;
4590     }
4591 }
4592
/* Extra bytes added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the decoded text may be larger than the source
   for CODING (a pointer to struct coding_system): 3 for ISO-2022,
   the CCL decoder's own buf_magnification for CCL, and 2 for
   everything else.  CODING is parenthesized so that any pointer
   expression may be given; it may be evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                     \
  ((coding)->type == coding_type_iso2022                \
   ? 3                                                  \
   : ((coding)->type == coding_type_ccl                 \
      ? (coding)->spec.ccl.decoder.buf_magnification    \
      : 2))
4601
4602 /* Return maximum size (bytes) of a buffer enough for decoding
4603    SRC_BYTES of text encoded in CODING.  */
4604
4605 int
4606 decoding_buffer_size (coding, src_bytes)
4607      struct coding_system *coding;
4608      int src_bytes;
4609 {
4610   return (src_bytes * DECODING_BUFFER_MAG (coding)
4611           + CONVERSION_BUFFER_EXTRA_ROOM);
4612 }
4613
4614 /* Return maximum size (bytes) of a buffer enough for encoding
4615    SRC_BYTES of text to CODING.  */
4616
4617 int
4618 encoding_buffer_size (coding, src_bytes)
4619      struct coding_system *coding;
4620      int src_bytes;
4621 {
4622   int magnification;
4623
4624   if (coding->type == coding_type_ccl)
4625     {
4626       magnification = coding->spec.ccl.encoder.buf_magnification;
4627       if (coding->eol_type == CODING_EOL_CRLF)
4628         magnification *= 2;
4629     }
4630   else if (CODING_REQUIRE_ENCODING (coding))
4631     magnification = 3;
4632   else
4633     magnification = 1;
4634
4635   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4636 }
4637
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* Number of bytes allocated at DATA.  */
  int on_stack;                 /* 1 if DATA was allocated by alloca,
                                   0 if by xmalloc.  */
  unsigned char *data;          /* The storage itself.  */
};

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer, not
   a pointer to one).  Requests smaller than MAX_ALLOCA are served by
   alloca, others by xmalloc; BUF.on_stack records which.  This has
   to be a macro because alloca'ed memory belongs to the frame of the
   function that expands it.

   Both arguments are parenthesized so that arbitrary expressions may
   be given, but each is evaluated more than once, so they must not
   have side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4661
4662 /* Double the allocated memory for *BUF.  */
4663 static void
4664 extend_conversion_buffer (buf)
4665      struct conversion_buffer *buf;
4666 {
4667   if (buf->on_stack)
4668     {
4669       unsigned char *save = buf->data;
4670       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4671       bcopy (save, buf->data, buf->size);
4672       buf->on_stack = 0;
4673     }
4674   else
4675     {
4676       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4677     }
4678   buf->size *= 2;
4679 }
4680
4681 /* Free the allocated memory for BUF if it is not on stack.  */
4682 static void
4683 free_conversion_buffer (buf)
4684      struct conversion_buffer *buf;
4685 {
4686   if (!buf->on_stack)
4687     xfree (buf->data);
4688 }
4689
4690 int
4691 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4692      struct coding_system *coding;
4693      unsigned char *source, *destination;
4694      int src_bytes, dst_bytes, encodep;
4695 {
4696   struct ccl_program *ccl
4697     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4698   unsigned char *dst = destination;
4699
4700   ccl->suppress_error = coding->suppress_error;
4701   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4702   if (encodep)
4703     {
4704       /* On encoding, EOL format is converted within ccl_driver.  For
4705          that, setup proper information in the structure CCL.  */
4706       ccl->eol_type = coding->eol_type;
4707       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4708         ccl->eol_type = CODING_EOL_LF;
4709       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4710       ccl->eight_bit_control = coding->dst_multibyte;
4711     }
4712   else
4713     ccl->eight_bit_control = 1;
4714   ccl->multibyte = coding->src_multibyte;
4715   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4716     {
4717       /* Move carryover bytes to DESTINATION.  */
4718       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4719       while (*p)
4720         *dst++ = *p++;
4721       coding->spec.ccl.eight_bit_carryover[0] = 0;
4722       if (dst_bytes)
4723         dst_bytes -= dst - destination;
4724     }
4725
4726   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4727                                   &(coding->consumed))
4728                       + dst - destination);
4729
4730   if (encodep)
4731     {
4732       coding->produced_char = coding->produced;
4733       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4734     }
4735   else if (!ccl->eight_bit_control)
4736     {
4737       /* The produced bytes forms a valid multibyte sequence. */
4738       coding->produced_char
4739         = multibyte_chars_in_text (destination, coding->produced);
4740       coding->spec.ccl.eight_bit_carryover[0] = 0;
4741     }
4742   else
4743     {
4744       /* On decoding, the destination should always multibyte.  But,
4745          CCL program might have been generated an invalid multibyte
4746          sequence.  Here we make such a sequence valid as
4747          multibyte.  */
4748       int bytes
4749         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4750
4751       if ((coding->consumed < src_bytes
4752            || !ccl->last_block)
4753           && coding->produced >= 1
4754           && destination[coding->produced - 1] >= 0x80)
4755         {
4756           /* We should not convert the tailing 8-bit codes to
4757              multibyte form even if they doesn't form a valid
4758              multibyte sequence.  They may form a valid sequence in
4759              the next call.  */
4760           int carryover = 0;
4761
4762           if (destination[coding->produced - 1] < 0xA0)
4763             carryover = 1;
4764           else if (coding->produced >= 2)
4765             {
4766               if (destination[coding->produced - 2] >= 0x80)
4767                 {
4768                   if (destination[coding->produced - 2] < 0xA0)
4769                     carryover = 2;
4770                   else if (coding->produced >= 3
4771                            && destination[coding->produced - 3] >= 0x80
4772                            && destination[coding->produced - 3] < 0xA0)
4773                     carryover = 3;
4774                 }
4775             }
4776           if (carryover > 0)
4777             {
4778               BCOPY_SHORT (destination + coding->produced - carryover,
4779                            coding->spec.ccl.eight_bit_carryover,
4780                            carryover);
4781               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4782               coding->produced -= carryover;
4783             }
4784         }
4785       coding->produced = str_as_multibyte (destination, bytes,
4786                                            coding->produced,
4787                                            &(coding->produced_char));
4788     }
4789
4790   switch (ccl->status)
4791     {
4792     case CCL_STAT_SUSPEND_BY_SRC:
4793       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4794       break;
4795     case CCL_STAT_SUSPEND_BY_DST:
4796       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4797       break;
4798     case CCL_STAT_QUIT:
4799     case CCL_STAT_INVALID_CMD:
4800       coding->result = CODING_FINISH_INTERRUPT;
4801       break;
4802     default:
4803       coding->result = CODING_FINISH_NORMAL;
4804       break;
4805     }
4806   return coding->result;
4807 }
4808
4809 /* Decode EOL format of the text at PTR of BYTES length destructively
4810    according to CODING->eol_type.  This is called after the CCL
4811    program produced a decoded text at PTR.  If we do CRLF->LF
4812    conversion, update CODING->produced and CODING->produced_char.  */
4813
4814 static void
4815 decode_eol_post_ccl (coding, ptr, bytes)
4816      struct coding_system *coding;
4817      unsigned char *ptr;
4818      int bytes;
4819 {
4820   Lisp_Object val, saved_coding_symbol;
4821   unsigned char *pend = ptr + bytes;
4822   int dummy;
4823
4824   /* Remember the current coding system symbol.  We set it back when
4825      an inconsistent EOL is found so that `last-coding-system-used' is
4826      set to the coding system that doesn't specify EOL conversion.  */
4827   saved_coding_symbol = coding->symbol;
4828
4829   coding->spec.ccl.cr_carryover = 0;
4830   if (coding->eol_type == CODING_EOL_UNDECIDED)
4831     {
4832       /* Here, to avoid the call of setup_coding_system, we directly
4833          call detect_eol_type.  */
4834       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4835       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4836         coding->eol_type = CODING_EOL_LF;
4837       if (coding->eol_type != CODING_EOL_UNDECIDED)
4838         {
4839           val = Fget (coding->symbol, Qeol_type);
4840           if (VECTORP (val) && XVECTOR (val)->size == 3)
4841             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4842         }
4843       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4844     }
4845
4846   if (coding->eol_type == CODING_EOL_LF
4847       || coding->eol_type == CODING_EOL_UNDECIDED)
4848     {
4849       /* We have nothing to do.  */
4850       ptr = pend;
4851     }
4852   else if (coding->eol_type == CODING_EOL_CRLF)
4853     {
4854       unsigned char *pstart = ptr, *p = ptr;
4855
4856       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4857           && *(pend - 1) == '\r')
4858         {
4859           /* If the last character is CR, we can't handle it here
4860              because LF will be in the not-yet-decoded source text.
4861              Record that the CR is not yet processed.  */
4862           coding->spec.ccl.cr_carryover = 1;
4863           coding->produced--;
4864           coding->produced_char--;
4865           pend--;
4866         }
4867       while (ptr < pend)
4868         {
4869           if (*ptr == '\r')
4870             {
4871               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4872                 {
4873                   *p++ = '\n';
4874                   ptr += 2;
4875                 }
4876               else
4877                 {
4878                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4879                     goto undo_eol_conversion;
4880                   *p++ = *ptr++;
4881                 }
4882             }
4883           else if (*ptr == '\n'
4884                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4885             goto undo_eol_conversion;
4886           else
4887             *p++ = *ptr++;
4888           continue;
4889
4890         undo_eol_conversion:
4891           /* We have faced with inconsistent EOL format at PTR.
4892              Convert all LFs before PTR back to CRLFs.  */
4893           for (p--, ptr--; p >= pstart; p--)
4894             {
4895               if (*p == '\n')
4896                 *ptr-- = '\n', *ptr-- = '\r';
4897               else
4898                 *ptr-- = *p;
4899             }
4900           /*  If carryover is recorded, cancel it because we don't
4901               convert CRLF anymore.  */
4902           if (coding->spec.ccl.cr_carryover)
4903             {
4904               coding->spec.ccl.cr_carryover = 0;
4905               coding->produced++;
4906               coding->produced_char++;
4907               pend++;
4908             }
4909           p = ptr = pend;
4910           coding->eol_type = CODING_EOL_LF;
4911           coding->symbol = saved_coding_symbol;
4912         }
4913       if (p < pend)
4914         {
4915           /* As each two-byte sequence CRLF was converted to LF, (PEND
4916              - P) is the number of deleted characters.  */
4917           coding->produced -= pend - p;
4918           coding->produced_char -= pend - p;
4919         }
4920     }
4921   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4922     {
4923       unsigned char *p = ptr;
4924
4925       for (; ptr < pend; ptr++)
4926         {
4927           if (*ptr == '\r')
4928             *ptr = '\n';
4929           else if (*ptr == '\n'
4930                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4931             {
4932               for (; p < ptr; p++)
4933                 {
4934                   if (*p == '\n')
4935                     *p = '\r';
4936                 }
4937               ptr = pend;
4938               coding->eol_type = CODING_EOL_LF;
4939               coding->symbol = saved_coding_symbol;
4940             }
4941         }
4942     }
4943 }
4944
4945 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4946    decoding, it may detect coding system and format of end-of-line if
4947    those are not yet decided.  The source should be unibyte, the
4948    result is multibyte if CODING->dst_multibyte is nonzero, else
4949    unibyte.  */
4950
4951 int
4952 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4953      struct coding_system *coding;
4954      const unsigned char *source;
4955      unsigned char *destination;
4956      int src_bytes, dst_bytes;
4957 {
4958   int extra = 0;
4959
4960   if (coding->type == coding_type_undecided)
4961     detect_coding (coding, source, src_bytes);
4962
4963   if (coding->eol_type == CODING_EOL_UNDECIDED
4964       && coding->type != coding_type_ccl)
4965     {
4966       detect_eol (coding, source, src_bytes);
4967       /* We had better recover the original eol format if we
4968          encounter an inconsistent eol format while decoding.  */
4969       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4970     }
4971
4972   coding->produced = coding->produced_char = 0;
4973   coding->consumed = coding->consumed_char = 0;
4974   coding->errors = 0;
4975   coding->result = CODING_FINISH_NORMAL;
4976
4977   switch (coding->type)
4978     {
4979     case coding_type_sjis:
4980       decode_coding_sjis_big5 (coding, source, destination,
4981                                src_bytes, dst_bytes, 1);
4982       break;
4983
4984     case coding_type_iso2022:
4985       decode_coding_iso2022 (coding, source, destination,
4986                              src_bytes, dst_bytes);
4987       break;
4988
4989     case coding_type_big5:
4990       decode_coding_sjis_big5 (coding, source, destination,
4991                                src_bytes, dst_bytes, 0);
4992       break;
4993
4994     case coding_type_emacs_mule:
4995       decode_coding_emacs_mule (coding, source, destination,
4996                                 src_bytes, dst_bytes);
4997       break;
4998
4999     case coding_type_ccl:
5000       if (coding->spec.ccl.cr_carryover)
5001         {
5002           /* Put the CR which was not processed by the previous call
5003              of decode_eol_post_ccl in DESTINATION.  It will be
5004              decoded together with the following LF by the call to
5005              decode_eol_post_ccl below.  */
5006           *destination = '\r';
5007           coding->produced++;
5008           coding->produced_char++;
5009           dst_bytes--;
5010           extra = coding->spec.ccl.cr_carryover;
5011         }
5012       ccl_coding_driver (coding, source, destination + extra,
5013                          src_bytes, dst_bytes, 0);
5014       if (coding->eol_type != CODING_EOL_LF)
5015         {
5016           coding->produced += extra;
5017           coding->produced_char += extra;
5018           decode_eol_post_ccl (coding, destination, coding->produced);
5019         }
5020       break;
5021
5022     default:
5023       decode_eol (coding, source, destination, src_bytes, dst_bytes);
5024     }
5025
5026   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5027       && coding->mode & CODING_MODE_LAST_BLOCK
5028       && coding->consumed == src_bytes)
5029     coding->result = CODING_FINISH_NORMAL;
5030
5031   if (coding->mode & CODING_MODE_LAST_BLOCK
5032       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5033     {
5034       const unsigned char *src = source + coding->consumed;
5035       unsigned char *dst = destination + coding->produced;
5036
5037       src_bytes -= coding->consumed;
5038       coding->errors++;
5039       if (COMPOSING_P (coding))
5040         DECODE_COMPOSITION_END ('1');
5041       while (src_bytes--)
5042         {
5043           int c = *src++;
5044           dst += CHAR_STRING (c, dst);
5045           coding->produced_char++;
5046         }
5047       coding->consumed = coding->consumed_char = src - source;
5048       coding->produced = dst - destination;
5049       coding->result = CODING_FINISH_NORMAL;
5050     }
5051
5052   if (!coding->dst_multibyte)
5053     {
5054       coding->produced = str_as_unibyte (destination, coding->produced);
5055       coding->produced_char = coding->produced;
5056     }
5057
5058   return coding->result;
5059 }
5060
5061 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
5062    multibyteness of the source is CODING->src_multibyte, the
5063    multibyteness of the result is always unibyte.  */
5064
5065 int
5066 encode_coding (coding, source, destination, src_bytes, dst_bytes)
5067      struct coding_system *coding;
5068      const unsigned char *source;
5069      unsigned char *destination;
5070      int src_bytes, dst_bytes;
5071 {
5072   coding->produced = coding->produced_char = 0;
5073   coding->consumed = coding->consumed_char = 0;
5074   coding->errors = 0;
5075   coding->result = CODING_FINISH_NORMAL;
5076   if (coding->eol_type == CODING_EOL_UNDECIDED)
5077     coding->eol_type = CODING_EOL_LF;
5078
5079   switch (coding->type)
5080     {
5081     case coding_type_sjis:
5082       encode_coding_sjis_big5 (coding, source, destination,
5083                                src_bytes, dst_bytes, 1);
5084       break;
5085
5086     case coding_type_iso2022:
5087       encode_coding_iso2022 (coding, source, destination,
5088                              src_bytes, dst_bytes);
5089       break;
5090
5091     case coding_type_big5:
5092       encode_coding_sjis_big5 (coding, source, destination,
5093                                src_bytes, dst_bytes, 0);
5094       break;
5095
5096     case coding_type_emacs_mule:
5097       encode_coding_emacs_mule (coding, source, destination,
5098                                 src_bytes, dst_bytes);
5099       break;
5100
5101     case coding_type_ccl:
5102       ccl_coding_driver (coding, source, destination,
5103                          src_bytes, dst_bytes, 1);
5104       break;
5105
5106     default:
5107       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5108     }
5109
5110   if (coding->mode & CODING_MODE_LAST_BLOCK
5111       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5112     {
5113       const unsigned char *src = source + coding->consumed;
5114       unsigned char *dst = destination + coding->produced;
5115
5116       if (coding->type == coding_type_iso2022)
5117         ENCODE_RESET_PLANE_AND_REGISTER;
5118       if (COMPOSING_P (coding))
5119         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5120       if (coding->consumed < src_bytes)
5121         {
5122           int len = src_bytes - coding->consumed;
5123
5124           BCOPY_SHORT (src, dst, len);
5125           if (coding->src_multibyte)
5126             len = str_as_unibyte (dst, len);
5127           dst += len;
5128           coding->consumed = src_bytes;
5129         }
5130       coding->produced = coding->produced_char = dst - destination;
5131       coding->result = CODING_FINISH_NORMAL;
5132     }
5133
5134   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5135       && coding->consumed == src_bytes)
5136     coding->result = CODING_FINISH_NORMAL;
5137
5138   return coding->result;
5139 }
5140
5141 /* Scan text in the region between *BEG and *END (byte positions),
5142    skip characters which we don't have to decode by coding system
5143    CODING at the head and tail, then set *BEG and *END to the region
5144    of the text we actually have to convert.  The caller should move
5145    the gap out of the region in advance if the region is from a
5146    buffer.
5147
5148    If STR is not NULL, *BEG and *END are indices into STR.  */
5149
5150 static void
5151 shrink_decoding_region (beg, end, coding, str)
5152      int *beg, *end;
5153      struct coding_system *coding;
5154      unsigned char *str;
5155 {
5156   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5157   int eol_conversion;
5158   Lisp_Object translation_table;
5159
5160   if (coding->type == coding_type_ccl
5161       || coding->type == coding_type_undecided
5162       || coding->eol_type != CODING_EOL_LF
5163       || !NILP (coding->post_read_conversion)
5164       || coding->composing != COMPOSITION_DISABLED)
5165     {
5166       /* We can't skip any data.  */
5167       return;
5168     }
5169   if (coding->type == coding_type_no_conversion
5170       || coding->type == coding_type_raw_text
5171       || coding->type == coding_type_emacs_mule)
5172     {
5173       /* We need no conversion, but don't have to skip any data here.
5174          Decoding routine handles them effectively anyway.  */
5175       return;
5176     }
5177
5178   translation_table = coding->translation_table_for_decode;
5179   if (NILP (translation_table) && !NILP (Venable_character_translation))
5180     translation_table = Vstandard_translation_table_for_decode;
5181   if (CHAR_TABLE_P (translation_table))
5182     {
5183       int i;
5184       for (i = 0; i < 128; i++)
5185         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5186           break;
5187       if (i < 128)
5188         /* Some ASCII character should be translated.  We give up
5189            shrinking.  */
5190         return;
5191     }
5192
5193   if (coding->heading_ascii >= 0)
5194     /* Detection routine has already found how much we can skip at the
5195        head.  */
5196     *beg += coding->heading_ascii;
5197
5198   if (str)
5199     {
5200       begp_orig = begp = str + *beg;
5201       endp_orig = endp = str + *end;
5202     }
5203   else
5204     {
5205       begp_orig = begp = BYTE_POS_ADDR (*beg);
5206       endp_orig = endp = begp + *end - *beg;
5207     }
5208
5209   eol_conversion = (coding->eol_type == CODING_EOL_CR
5210                     || coding->eol_type == CODING_EOL_CRLF);
5211
5212   switch (coding->type)
5213     {
5214     case coding_type_sjis:
5215     case coding_type_big5:
5216       /* We can skip all ASCII characters at the head.  */
5217       if (coding->heading_ascii < 0)
5218         {
5219           if (eol_conversion)
5220             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5221           else
5222             while (begp < endp && *begp < 0x80) begp++;
5223         }
5224       /* We can skip all ASCII characters at the tail except for the
5225          second byte of SJIS or BIG5 code.  */
5226       if (eol_conversion)
5227         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5228       else
5229         while (begp < endp && endp[-1] < 0x80) endp--;
5230       /* Do not consider LF as ascii if preceded by CR, since that
5231          confuses eol decoding. */
5232       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5233         endp++;
5234       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5235         endp++;
5236       break;
5237
5238     case coding_type_iso2022:
5239       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5240         /* We can't skip any data.  */
5241         break;
5242       if (coding->heading_ascii < 0)
5243         {
5244           /* We can skip all ASCII characters at the head except for a
5245              few control codes.  */
5246           while (begp < endp && (c = *begp) < 0x80
5247                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5248                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5249                  && (!eol_conversion || c != ISO_CODE_LF))
5250             begp++;
5251         }
5252       switch (coding->category_idx)
5253         {
5254         case CODING_CATEGORY_IDX_ISO_8_1:
5255         case CODING_CATEGORY_IDX_ISO_8_2:
5256           /* We can skip all ASCII characters at the tail.  */
5257           if (eol_conversion)
5258             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5259           else
5260             while (begp < endp && endp[-1] < 0x80) endp--;
5261           /* Do not consider LF as ascii if preceded by CR, since that
5262              confuses eol decoding. */
5263           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5264             endp++;
5265           break;
5266
5267         case CODING_CATEGORY_IDX_ISO_7:
5268         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5269           {
5270             /* We can skip all characters at the tail except for 8-bit
5271                codes and ESC and the following 2-byte at the tail.  */
5272             unsigned char *eight_bit = NULL;
5273
5274             if (eol_conversion)
5275               while (begp < endp
5276                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5277                 {
5278                   if (!eight_bit && c & 0x80) eight_bit = endp;
5279                   endp--;
5280                 }
5281             else
5282               while (begp < endp
5283                      && (c = endp[-1]) != ISO_CODE_ESC)
5284                 {
5285                   if (!eight_bit && c & 0x80) eight_bit = endp;
5286                   endp--;
5287                 }
5288             /* Do not consider LF as ascii if preceded by CR, since that
5289                confuses eol decoding. */
5290             if (begp < endp && endp < endp_orig
5291                 && endp[-1] == '\r' && endp[0] == '\n')
5292               endp++;
5293             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5294               {
5295                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5296                   /* This is an ASCII designation sequence.  We can
5297                      surely skip the tail.  But, if we have
5298                      encountered an 8-bit code, skip only the codes
5299                      after that.  */
5300                   endp = eight_bit ? eight_bit : endp + 2;
5301                 else
5302                   /* Hmmm, we can't skip the tail.  */
5303                   endp = endp_orig;
5304               }
5305             else if (eight_bit)
5306               endp = eight_bit;
5307           }
5308         }
5309       break;
5310
5311     default:
5312       abort ();
5313     }
5314   *beg += begp - begp_orig;
5315   *end += endp - endp_orig;
5316   return;
5317 }
5318
5319 /* Like shrink_decoding_region but for encoding.  */
5320
5321 static void
5322 shrink_encoding_region (beg, end, coding, str)
5323      int *beg, *end;
5324      struct coding_system *coding;
5325      unsigned char *str;
5326 {
5327   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5328   int eol_conversion;
5329   Lisp_Object translation_table;
5330
5331   if (coding->type == coding_type_ccl
5332       || coding->eol_type == CODING_EOL_CRLF
5333       || coding->eol_type == CODING_EOL_CR
5334       || (coding->cmp_data && coding->cmp_data->used > 0))
5335     {
5336       /* We can't skip any data.  */
5337       return;
5338     }
5339   if (coding->type == coding_type_no_conversion
5340       || coding->type == coding_type_raw_text
5341       || coding->type == coding_type_emacs_mule
5342       || coding->type == coding_type_undecided)
5343     {
5344       /* We need no conversion, but don't have to skip any data here.
5345          Encoding routine handles them effectively anyway.  */
5346       return;
5347     }
5348
5349   translation_table = coding->translation_table_for_encode;
5350   if (NILP (translation_table) && !NILP (Venable_character_translation))
5351     translation_table = Vstandard_translation_table_for_encode;
5352   if (CHAR_TABLE_P (translation_table))
5353     {
5354       int i;
5355       for (i = 0; i < 128; i++)
5356         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5357           break;
5358       if (i < 128)
5359         /* Some ASCII character should be translated.  We give up
5360            shrinking.  */
5361         return;
5362     }
5363
5364   if (str)
5365     {
5366       begp_orig = begp = str + *beg;
5367       endp_orig = endp = str + *end;
5368     }
5369   else
5370     {
5371       begp_orig = begp = BYTE_POS_ADDR (*beg);
5372       endp_orig = endp = begp + *end - *beg;
5373     }
5374
5375   eol_conversion = (coding->eol_type == CODING_EOL_CR
5376                     || coding->eol_type == CODING_EOL_CRLF);
5377
5378   /* Here, we don't have to check coding->pre_write_conversion because
5379      the caller is expected to have handled it already.  */
5380   switch (coding->type)
5381     {
5382     case coding_type_iso2022:
5383       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5384         /* We can't skip any data.  */
5385         break;
5386       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5387         {
5388           unsigned char *bol = begp;
5389           while (begp < endp && *begp < 0x80)
5390             {
5391               begp++;
5392               if (begp[-1] == '\n')
5393                 bol = begp;
5394             }
5395           begp = bol;
5396           goto label_skip_tail;
5397         }
5398       /* fall down ... */
5399
5400     case coding_type_sjis:
5401     case coding_type_big5:
5402       /* We can skip all ASCII characters at the head and tail.  */
5403       if (eol_conversion)
5404         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5405       else
5406         while (begp < endp && *begp < 0x80) begp++;
5407     label_skip_tail:
5408       if (eol_conversion)
5409         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5410       else
5411         while (begp < endp && *(endp - 1) < 0x80) endp--;
5412       break;
5413
5414     default:
5415       abort ();
5416     }
5417
5418   *beg += begp - begp_orig;
5419   *end += endp - endp_orig;
5420   return;
5421 }
5422
/* Shrinking a conversion region has some overhead of its own, so it
   is attempted only when the length of the region exceeds this
   value.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region bounded by the positions BEG and END point to,
   provided it is longer than shrink_conversion_region_threshhold.
   A nonzero ENCODEP selects shrink_encoding_region, zero selects
   shrink_decoding_region; BEG, END, CODING and STR are handed over
   to that function unchanged.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region ((beg), (end), (coding), (str));       \
        else                                                            \
          shrink_decoding_region ((beg), (end), (coding), (str));       \
      }                                                                 \
  } while (0)
5436
5437 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5438    Vlast_coding_system_used and the remaining elements are buffers to
5439    kill.  */
5440 static Lisp_Object
5441 code_convert_region_unwind (arg)
5442      Lisp_Object arg;
5443 {
5444   struct gcpro gcpro1;
5445   GCPRO1 (arg);
5446
5447   inhibit_pre_post_conversion = 0;
5448   Vlast_coding_system_used = XCAR (arg);
5449   for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5450     Fkill_buffer (XCAR (arg));
5451
5452   UNGCPRO;
5453   return Qnil;
5454 }
5455
5456 /* Store information about all compositions in the range FROM and TO
5457    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5458    buffer or a string, defaults to the current buffer.  */
5459
5460 void
5461 coding_save_composition (coding, from, to, obj)
5462      struct coding_system *coding;
5463      int from, to;
5464      Lisp_Object obj;
5465 {
5466   Lisp_Object prop;
5467   int start, end;
5468
5469   if (coding->composing == COMPOSITION_DISABLED)
5470     return;
5471   if (!coding->cmp_data)
5472     coding_allocate_composition_data (coding, from);
5473   if (!find_composition (from, to, &start, &end, &prop, obj)
5474       || end > to)
5475     return;
5476   if (start < from
5477       && (!find_composition (end, to, &start, &end, &prop, obj)
5478           || end > to))
5479     return;
5480   coding->composing = COMPOSITION_NO;
5481   do
5482     {
5483       if (COMPOSITION_VALID_P (start, end, prop))
5484         {
5485           enum composition_method method = COMPOSITION_METHOD (prop);
5486           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5487               >= COMPOSITION_DATA_SIZE)
5488             coding_allocate_composition_data (coding, from);
5489           /* For relative composition, we remember start and end
5490              positions, for the other compositions, we also remember
5491              components.  */
5492           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5493           if (method != COMPOSITION_RELATIVE)
5494             {
5495               /* We must store a*/
5496               Lisp_Object val, ch;
5497
5498               val = COMPOSITION_COMPONENTS (prop);
5499               if (CONSP (val))
5500                 while (CONSP (val))
5501                   {
5502                     ch = XCAR (val), val = XCDR (val);
5503                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5504                   }
5505               else if (VECTORP (val) || STRINGP (val))
5506                 {
5507                   int len = (VECTORP (val)
5508                              ? XVECTOR (val)->size : SCHARS (val));
5509                   int i;
5510                   for (i = 0; i < len; i++)
5511                     {
5512                       ch = (STRINGP (val)
5513                             ? Faref (val, make_number (i))
5514                             : XVECTOR (val)->contents[i]);
5515                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5516                     }
5517                 }
5518               else              /* INTEGERP (val) */
5519                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5520             }
5521           CODING_ADD_COMPOSITION_END (coding, end - from);
5522         }
5523       start = end;
5524     }
5525   while (start < to
5526          && find_composition (start, to, &start, &end, &prop, obj)
5527          && end <= to);
5528
5529   /* Make coding->cmp_data point to the first memory block.  */
5530   while (coding->cmp_data->prev)
5531     coding->cmp_data = coding->cmp_data->prev;
5532   coding->cmp_data_start = 0;
5533 }
5534
5535 /* Reflect the saved information about compositions to OBJ.
5536    CODING->cmp_data points to a memory block for the information.  OBJ
5537    is a buffer or a string, defaults to the current buffer.  */
5538
5539 void
5540 coding_restore_composition (coding, obj)
5541      struct coding_system *coding;
5542      Lisp_Object obj;
5543 {
5544   struct composition_data *cmp_data = coding->cmp_data;
5545
5546   if (!cmp_data)
5547     return;
5548
5549   while (cmp_data->prev)
5550     cmp_data = cmp_data->prev;
5551
5552   while (cmp_data)
5553     {
5554       int i;
5555
5556       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5557            i += cmp_data->data[i])
5558         {
5559           int *data = cmp_data->data + i;
5560           enum composition_method method = (enum composition_method) data[3];
5561           Lisp_Object components;
5562
5563           if (data[0] < 0 || i + data[0] > cmp_data->used)
5564             /* Invalid composition data.  */
5565             break;
5566
5567           if (method == COMPOSITION_RELATIVE)
5568             components = Qnil;
5569           else
5570             {
5571               int len = data[0] - 4, j;
5572               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5573
5574               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5575                   && len % 2 == 0)
5576                 len --;
5577               if (len < 1)
5578                 /* Invalid composition data.  */
5579                 break;
5580               for (j = 0; j < len; j++)
5581                 args[j] = make_number (data[4 + j]);
5582               components = (method == COMPOSITION_WITH_ALTCHARS
5583                             ? Fstring (len, args)
5584                             : Fvector (len, args));
5585             }
5586           compose_text (data[1], data[2], components, Qnil, obj);
5587         }
5588       cmp_data = cmp_data->next;
5589     }
5590 }
5591
5592 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5593    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5594    coding system CODING, and return the status code of code conversion
5595    (currently, this value has no meaning).
5596
5597    How many characters (and bytes) are converted to how many
5598    characters (and bytes) are recorded in members of the structure
5599    CODING.
5600
5601    If REPLACE is nonzero, we do various things as if the original text
5602    is deleted and a new text is inserted.  See the comments in
5603    replace_range (insdel.c) to know what we are doing.
5604
5605    If REPLACE is zero, it is assumed that the source text is unibyte.
5606    Otherwise, it is assumed that the source text is multibyte.  */
5607
5608 int
5609 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5610      int from, from_byte, to, to_byte, encodep, replace;
5611      struct coding_system *coding;
5612 {
5613   int len = to - from, len_byte = to_byte - from_byte;
5614   int nchars_del = 0, nbytes_del = 0;
5615   int require, inserted, inserted_byte;
5616   int head_skip, tail_skip, total_skip = 0;
5617   Lisp_Object saved_coding_symbol;
5618   int first = 1;
5619   unsigned char *src, *dst;
5620   Lisp_Object deletion;
5621   int orig_point = PT, orig_len = len;
5622   int prev_Z;
5623   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5624
5625   deletion = Qnil;
5626   saved_coding_symbol = coding->symbol;
5627
5628   if (from < PT && PT < to)
5629     {
5630       TEMP_SET_PT_BOTH (from, from_byte);
5631       orig_point = from;
5632     }
5633
5634   if (replace)
5635     {
5636       int saved_from = from;
5637       int saved_inhibit_modification_hooks;
5638
5639       prepare_to_modify_buffer (from, to, &from);
5640       if (saved_from != from)
5641         {
5642           to = from + len;
5643           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5644           len_byte = to_byte - from_byte;
5645         }
5646
5647       /* The code conversion routine can not preserve text properties
5648          for now.  So, we must remove all text properties in the
5649          region.  Here, we must suppress all modification hooks.  */
5650       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5651       inhibit_modification_hooks = 1;
5652       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5653       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5654     }
5655
5656   coding->heading_ascii = 0;
5657
5658   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5659     {
5660       /* We must detect encoding of text and eol format.  */
5661
5662       if (from < GPT && to > GPT)
5663         move_gap_both (from, from_byte);
5664       if (coding->type == coding_type_undecided)
5665         {
5666           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5667           if (coding->type == coding_type_undecided)
5668             {
5669               /* It seems that the text contains only ASCII, but we
5670                  should not leave it undecided because the deeper
5671                  decoding routine (decode_coding) tries to detect the
5672                  encodings again in vain.  */
5673               coding->type = coding_type_emacs_mule;
5674               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5675               /* As emacs-mule decoder will handle composition, we
5676                  need this setting to allocate coding->cmp_data
5677                  later.  */
5678               coding->composing = COMPOSITION_NO;
5679             }
5680         }
5681       if (coding->eol_type == CODING_EOL_UNDECIDED
5682           && coding->type != coding_type_ccl)
5683         {
5684           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5685           if (coding->eol_type == CODING_EOL_UNDECIDED)
5686             coding->eol_type = CODING_EOL_LF;
5687           /* We had better recover the original eol format if we
5688              encounter an inconsistent eol format while decoding.  */
5689           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5690         }
5691     }
5692
5693   /* Now we convert the text.  */
5694
5695   /* For encoding, we must process pre-write-conversion in advance.  */
5696   if (! inhibit_pre_post_conversion
5697       && encodep
5698       && SYMBOLP (coding->pre_write_conversion)
5699       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5700     {
5701       /* The function in pre-write-conversion may put a new text in a
5702          new buffer.  */
5703       struct buffer *prev = current_buffer;
5704       Lisp_Object new;
5705
5706       record_unwind_protect (code_convert_region_unwind,
5707                              Fcons (Vlast_coding_system_used, Qnil));
5708       /* We should not call any more pre-write/post-read-conversion
5709          functions while this pre-write-conversion is running.  */
5710       inhibit_pre_post_conversion = 1;
5711       call2 (coding->pre_write_conversion,
5712              make_number (from), make_number (to));
5713       inhibit_pre_post_conversion = 0;
5714       /* Discard the unwind protect.  */
5715       specpdl_ptr--;
5716
5717       if (current_buffer != prev)
5718         {
5719           len = ZV - BEGV;
5720           new = Fcurrent_buffer ();
5721           set_buffer_internal_1 (prev);
5722           del_range_2 (from, from_byte, to, to_byte, 0);
5723           TEMP_SET_PT_BOTH (from, from_byte);
5724           insert_from_buffer (XBUFFER (new), 1, len, 0);
5725           Fkill_buffer (new);
5726           if (orig_point >= to)
5727             orig_point += len - orig_len;
5728           else if (orig_point > from)
5729             orig_point = from;
5730           orig_len = len;
5731           to = from + len;
5732           from_byte = CHAR_TO_BYTE (from);
5733           to_byte = CHAR_TO_BYTE (to);
5734           len_byte = to_byte - from_byte;
5735           TEMP_SET_PT_BOTH (from, from_byte);
5736         }
5737     }
5738
5739   if (replace)
5740     {
5741       if (! EQ (current_buffer->undo_list, Qt))
5742         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5743       else
5744         {
5745           nchars_del = to - from;
5746           nbytes_del = to_byte - from_byte;
5747         }
5748     }
5749
5750   if (coding->composing != COMPOSITION_DISABLED)
5751     {
5752       if (encodep)
5753         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5754       else
5755         coding_allocate_composition_data (coding, from);
5756     }
5757
5758   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
5759      if we must run CCL program or there are compositions to
5760      encode.  */
5761   if (coding->type != coding_type_ccl
5762       && (! coding->cmp_data || coding->cmp_data->used == 0))
5763     {
5764       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5765
5766       if (from < GPT && GPT < to)
5767         move_gap_both (from, from_byte);
5768       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5769       if (from_byte == to_byte
5770           && (encodep || NILP (coding->post_read_conversion))
5771           && ! CODING_REQUIRE_FLUSHING (coding))
5772         {
5773           coding->produced = len_byte;
5774           coding->produced_char = len;
5775           if (!replace)
5776             /* We must record and adjust for this new text now.  */
5777             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5778           coding_free_composition_data (coding);
5779           return 0;
5780         }
5781
5782       head_skip = from_byte - from_byte_orig;
5783       tail_skip = to_byte_orig - to_byte;
5784       total_skip = head_skip + tail_skip;
5785       from += head_skip;
5786       to -= tail_skip;
5787       len -= total_skip; len_byte -= total_skip;
5788     }
5789
5790   /* For conversion, we must put the gap before the text in addition to
5791      making the gap larger for efficient decoding.  The required gap
5792      size starts from 2000 which is the magic number used in make_gap.
5793      But, after one batch of conversion, it will be incremented if we
5794      find that it is not enough .  */
5795   require = 2000;
5796
5797   if (GAP_SIZE  < require)
5798     make_gap (require - GAP_SIZE);
5799   move_gap_both (from, from_byte);
5800
5801   inserted = inserted_byte = 0;
5802
5803   GAP_SIZE += len_byte;
5804   ZV -= len;
5805   Z -= len;
5806   ZV_BYTE -= len_byte;
5807   Z_BYTE -= len_byte;
5808
5809   if (GPT - BEG < BEG_UNCHANGED)
5810     BEG_UNCHANGED = GPT - BEG;
5811   if (Z - GPT < END_UNCHANGED)
5812     END_UNCHANGED = Z - GPT;
5813
5814   if (!encodep && coding->src_multibyte)
5815     {
5816       /* Decoding routines expects that the source text is unibyte.
5817          We must convert 8-bit characters of multibyte form to
5818          unibyte.  */
5819       int len_byte_orig = len_byte;
5820       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5821       if (len_byte < len_byte_orig)
5822         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5823                     len_byte);
5824       coding->src_multibyte = 0;
5825     }
5826
5827   for (;;)
5828     {
5829       int result;
5830
5831       /* The buffer memory is now:
5832          +--------+converted-text+---------+-------original-text-------+---+
5833          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5834                   |<---------------------- GAP ----------------------->|  */
5835       src = GAP_END_ADDR - len_byte;
5836       dst = GPT_ADDR + inserted_byte;
5837
5838       if (encodep)
5839         result = encode_coding (coding, src, dst, len_byte, 0);
5840       else
5841         {
5842           if (coding->composing != COMPOSITION_DISABLED)
5843             coding->cmp_data->char_offset = from + inserted;
5844           result = decode_coding (coding, src, dst, len_byte, 0);
5845         }
5846
5847       /* The buffer memory is now:
5848          +--------+-------converted-text----+--+------original-text----+---+
5849          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5850                   |<---------------------- GAP ----------------------->|  */
5851
5852       inserted += coding->produced_char;
5853       inserted_byte += coding->produced;
5854       len_byte -= coding->consumed;
5855
5856       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5857         {
5858           coding_allocate_composition_data (coding, from + inserted);
5859           continue;
5860         }
5861
5862       src += coding->consumed;
5863       dst += coding->produced;
5864
5865       if (result == CODING_FINISH_NORMAL)
5866         {
5867           src += len_byte;
5868           break;
5869         }
5870       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5871         {
5872           unsigned char *pend = dst, *p = pend - inserted_byte;
5873           Lisp_Object eol_type;
5874
5875           /* Encode LFs back to the original eol format (CR or CRLF).  */
5876           if (coding->eol_type == CODING_EOL_CR)
5877             {
5878               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5879             }
5880           else
5881             {
5882               int count = 0;
5883
5884               while (p < pend) if (*p++ == '\n') count++;
5885               if (src - dst < count)
5886                 {
5887                   /* We don't have sufficient room for encoding LFs
5888                      back to CRLF.  We must record converted and
5889                      not-yet-converted text back to the buffer
5890                      content, enlarge the gap, then record them out of
5891                      the buffer contents again.  */
5892                   int add = len_byte + inserted_byte;
5893
5894                   GAP_SIZE -= add;
5895                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5896                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5897                   make_gap (count - GAP_SIZE);
5898                   GAP_SIZE += add;
5899                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5900                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5901                   /* Don't forget to update SRC, DST, and PEND.  */
5902                   src = GAP_END_ADDR - len_byte;
5903                   dst = GPT_ADDR + inserted_byte;
5904                   pend = dst;
5905                 }
5906               inserted += count;
5907               inserted_byte += count;
5908               coding->produced += count;
5909               p = dst = pend + count;
5910               while (count)
5911                 {
5912                   *--p = *--pend;
5913                   if (*p == '\n') count--, *--p = '\r';
5914                 }
5915             }
5916
5917           /* Suppress eol-format conversion in the further conversion.  */
5918           coding->eol_type = CODING_EOL_LF;
5919
5920           /* Set the coding system symbol to that for Unix-like EOL.  */
5921           eol_type = Fget (saved_coding_symbol, Qeol_type);
5922           if (VECTORP (eol_type)
5923               && XVECTOR (eol_type)->size == 3
5924               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5925             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5926           else
5927             coding->symbol = saved_coding_symbol;
5928
5929           continue;
5930         }
5931       if (len_byte <= 0)
5932         {
5933           if (coding->type != coding_type_ccl
5934               || coding->mode & CODING_MODE_LAST_BLOCK)
5935             break;
5936           coding->mode |= CODING_MODE_LAST_BLOCK;
5937           continue;
5938         }
5939       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5940         {
5941           /* The source text ends in invalid codes.  Let's just
5942              make them valid buffer contents, and finish conversion.  */
5943           if (multibyte_p)
5944             {
5945               unsigned char *start = dst;
5946
5947               inserted += len_byte;
5948               while (len_byte--)
5949                 {
5950                   int c = *src++;
5951                   dst += CHAR_STRING (c, dst);
5952                 }
5953
5954               inserted_byte += dst - start;
5955             }
5956           else
5957             {
5958               inserted += len_byte;
5959               inserted_byte += len_byte;
5960               while (len_byte--)
5961                 *dst++ = *src++;
5962             }
5963           break;
5964         }
5965       if (result == CODING_FINISH_INTERRUPT)
5966         {
5967           /* The conversion procedure was interrupted by a user.  */
5968           break;
5969         }
5970       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5971       if (coding->consumed < 1)
5972         {
5973           /* It's quite strange to require more memory without
5974              consuming any bytes.  Perhaps CCL program bug.  */
5975           break;
5976         }
5977       if (first)
5978         {
5979           /* We have just done the first batch of conversion which was
5980              stopped because of insufficient gap.  Let's reconsider the
5981              required gap size (i.e. SRT - DST) now.
5982
5983              We have converted ORIG bytes (== coding->consumed) into
5984              NEW bytes (coding->produced).  To convert the remaining
5985              LEN bytes, we may need REQUIRE bytes of gap, where:
5986                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5987                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5988              Here, we are sure that NEW >= ORIG.  */
5989
5990           if (coding->produced <= coding->consumed)
5991             {
5992               /* This happens because of CCL-based coding system with
5993                  eol-type CRLF.  */
5994               require = 0;
5995             }
5996           else
5997             {
5998               float ratio = coding->produced - coding->consumed;
5999               ratio /= coding->consumed;
6000               require = len_byte * ratio;
6001             }
6002           first = 0;
6003         }
6004       if ((src - dst) < (require + 2000))
6005         {
6006           /* See the comment above the previous call of make_gap.  */
6007           int add = len_byte + inserted_byte;
6008
6009           GAP_SIZE -= add;
6010           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
6011           GPT += inserted_byte; GPT_BYTE += inserted_byte;
6012           make_gap (require + 2000);
6013           GAP_SIZE += add;
6014           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
6015           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
6016         }
6017     }
6018   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
6019
6020   if (encodep && coding->dst_multibyte)
6021     {
6022       /* The output is unibyte.  We must convert 8-bit characters to
6023          multibyte form.  */
6024       if (inserted_byte * 2 > GAP_SIZE)
6025         {
6026           GAP_SIZE -= inserted_byte;
6027           ZV += inserted_byte; Z += inserted_byte;
6028           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
6029           GPT += inserted_byte; GPT_BYTE += inserted_byte;
6030           make_gap (inserted_byte - GAP_SIZE);
6031           GAP_SIZE += inserted_byte;
6032           ZV -= inserted_byte; Z -= inserted_byte;
6033           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
6034           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
6035         }
6036       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
6037     }
6038
6039   /* If we shrank the conversion area, adjust it now.  */
6040   if (total_skip > 0)
6041     {
6042       if (tail_skip > 0)
6043         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
6044       inserted += total_skip; inserted_byte += total_skip;
6045       GAP_SIZE += total_skip;
6046       GPT -= head_skip; GPT_BYTE -= head_skip;
6047       ZV -= total_skip; ZV_BYTE -= total_skip;
6048       Z -= total_skip; Z_BYTE -= total_skip;
6049       from -= head_skip; from_byte -= head_skip;
6050       to += tail_skip; to_byte += tail_skip;
6051     }
6052
6053   prev_Z = Z;
6054   if (! EQ (current_buffer->undo_list, Qt))
6055     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6056   else
6057     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
6058                                  inserted, inserted_byte);
6059   inserted = Z - prev_Z;
6060
6061   if (!encodep && coding->cmp_data && coding->cmp_data->used)
6062     coding_restore_composition (coding, Fcurrent_buffer ());
6063   coding_free_composition_data (coding);
6064
6065   if (! inhibit_pre_post_conversion
6066       && ! encodep && ! NILP (coding->post_read_conversion))
6067     {
6068       Lisp_Object val;
6069       Lisp_Object saved_coding_system;
6070
6071       if (from != PT)
6072         TEMP_SET_PT_BOTH (from, from_byte);
6073       prev_Z = Z;
6074       record_unwind_protect (code_convert_region_unwind,
6075                              Fcons (Vlast_coding_system_used, Qnil));
6076       saved_coding_system = Vlast_coding_system_used;
6077       Vlast_coding_system_used = coding->symbol;
6078       /* We should not call any more pre-write/post-read-conversion
6079          functions while this post-read-conversion is running.  */
6080       inhibit_pre_post_conversion = 1;
6081       val = call1 (coding->post_read_conversion, make_number (inserted));
6082       inhibit_pre_post_conversion = 0;
6083       coding->symbol = Vlast_coding_system_used;
6084       Vlast_coding_system_used = saved_coding_system;
6085       /* Discard the unwind protect.  */
6086       specpdl_ptr--;
6087       CHECK_NUMBER (val);
6088       inserted += Z - prev_Z;
6089     }
6090
6091   if (orig_point >= from)
6092     {
6093       if (orig_point >= from + orig_len)
6094         orig_point += inserted - orig_len;
6095       else
6096         orig_point = from;
6097       TEMP_SET_PT (orig_point);
6098     }
6099
6100   if (replace)
6101     {
6102       signal_after_change (from, to - from, inserted);
6103       update_compositions (from, from + inserted, CHECK_BORDER);
6104     }
6105
6106   {
6107     coding->consumed = to_byte - from_byte;
6108     coding->consumed_char = to - from;
6109     coding->produced = inserted_byte;
6110     coding->produced_char = inserted;
6111   }
6112
6113   return 0;
6114 }
6115
6116 /* Name (or base name) of work buffer for code conversion.
        set_conversion_work_buffer uses it as the buffer name, or as the
        base of a generated name when that buffer is already current.  */
6117 static Lisp_Object Vcode_conversion_workbuf_name;
6118
6119 /* Set the current buffer to the working buffer prepared for
6120    code-conversion.  MULTIBYTE specifies the multibyteness of the
6121    buffer.  Return the buffer we set if it must be killed after use.
6122    Otherwise return Qnil.  */
6123
6124 static Lisp_Object
6125 set_conversion_work_buffer (multibyte)
6126      int multibyte;
6127 {
6128   Lisp_Object buffer, buffer_to_kill;
6129   struct buffer *buf;
6130
       /* Look up, or create, the buffer named by
          Vcode_conversion_workbuf_name.  */
6131   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6132   buf = XBUFFER (buffer);
6133   if (buf == current_buffer)
6134     {
6135       /* As we are already in the work buffer, we must generate a new
6136          buffer for the work.  */
6137       Lisp_Object name;
6138
6139       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
           /* Only this extra buffer is reported back to the caller for
              killing.  */
6140       buffer = buffer_to_kill = Fget_buffer_create (name);
6141       buf = XBUFFER (buffer);
6142     }
6143   else
6144     buffer_to_kill = Qnil;
6145
       /* Reset the state of the work buffer: no overlays, the directory
          of the buffer that is current now, writable, not visiting a
          file, and undo_list set to Qt.  */
6146   delete_all_overlays (buf);
6147   buf->directory = current_buffer->directory;
6148   buf->read_only = Qnil;
6149   buf->filename = Qnil;
6150   buf->undo_list = Qt;
6151   eassert (buf->overlays_before == NULL);
6152   eassert (buf->overlays_after == NULL);
6153   set_buffer_internal (buf);
       /* Remove any narrowing so that the deletion below covers the whole
          text.  */
6154   if (BEG != BEGV || Z != ZV)
6155     Fwiden ();
6156   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
       /* The multibyteness flag is changed only after the buffer has been
          emptied.  */
6157   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6158   return buffer_to_kill;
6159 }
6160
6161 Lisp_Object
6162 run_pre_post_conversion_on_str (str, coding, encodep)
6163      Lisp_Object str;
6164      struct coding_system *coding;
6165      int encodep;
6166 {
       /* Run a conversion function of CODING on the text of the string STR
          and return the resulting text as a string.  If ENCODEP is
          nonzero, call CODING->pre_write_conversion with the bounds of the
          text; otherwise call CODING->post_read_conversion with the
          number of characters.  The function runs in a work buffer set up
          by set_conversion_work_buffer.  */
6167   int count = SPECPDL_INDEX ();
6168   struct gcpro gcpro1, gcpro2;
6169   int multibyte = STRING_MULTIBYTE (str);
6170   Lisp_Object old_deactivate_mark;
6171   Lisp_Object buffer_to_kill;
6172   Lisp_Object unwind_arg;
6173
       /* Make the buffer that is current now current again when the
          specpdl stack is unwound back to COUNT.  */
6174   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6175   /* It is not crucial to specbind this.  */
6176   old_deactivate_mark = Vdeactivate_mark;
6177   GCPRO2 (str, old_deactivate_mark);
6178
6179   /* We must insert the contents of STR as is without
6180      unibyte<->multibyte conversion.  For that, we adjust the
6181      multibyteness of the working buffer to that of STR.  */
6182   buffer_to_kill = set_conversion_work_buffer (multibyte);
       /* UNWIND_ARG is a list whose first element is the saved
          Vlast_coding_system_used and whose remaining elements are buffers
          to kill.  NOTE(review): code_convert_region_unwind is defined
          outside this chunk; presumably it restores the former and kills
          the latter -- confirm.  */
6183   if (NILP (buffer_to_kill))
6184     unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6185   else
6186     unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6187   record_unwind_protect (code_convert_region_unwind, unwind_arg);
6188
6189   insert_from_string (str, 0, 0,
6190                       SCHARS (str), SBYTES (str), 0);
6191   UNGCPRO;
       /* Keep nested conversions from running pre-write or post-read
          functions while the function below is running.  */
6192   inhibit_pre_post_conversion = 1;
6193   if (encodep)
6194     {
6195       struct buffer *prev = current_buffer;
6196
6197       call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
           /* The function may leave a different buffer current; in that
              case the result is read from that buffer below.  */
6198       if (prev != current_buffer)
6199         /* We must kill the current buffer too.  */
6200         Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6201     }
6202   else
6203     {
           /* Expose the coding system to the function through
              Vlast_coding_system_used, and take back whatever value that
              variable holds after the call.  */
6204       Vlast_coding_system_used = coding->symbol;
6205       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6206       call1 (coding->post_read_conversion, make_number (Z - BEG));
6207       coding->symbol = Vlast_coding_system_used;
6208     }
6209   inhibit_pre_post_conversion = 0;
6210   Vdeactivate_mark = old_deactivate_mark;
       /* Extract the whole text of the current buffer before the unwind
          forms run.  */
6211   str = make_buffer_string (BEG, Z, 1);
6212   return unbind_to (count, str);
6213 }
6214
6215
6216 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6217    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6218    is intended that this function is called from encode_terminal_code,
6219    the pre-write-conversion function is run by safe_call and thus
6220    "Error during redisplay: ..." is logged when an error occurs.
6221
6222    Store the resulting text in *STR and set CODING->produced_char and
6223    CODING->produced to the number of characters and bytes
6224    respectively.  If the size of *STR is too small, enlarge it by
6225    xrealloc and update *STR and *SIZE.  */
6226
6227 void
6228 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6229      unsigned char **str;
6230      int *size, nchars, nbytes;
6231      struct coding_system *coding;
6232 {
6233   struct gcpro gcpro1, gcpro2;
6234   struct buffer *cur = current_buffer;
6235   struct buffer *prev;
6236   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6237   Lisp_Object args[3];
6238   Lisp_Object buffer_to_kill;
6239
6240   /* It is not crucial to specbind this.  */
6241   old_deactivate_mark = Vdeactivate_mark;
6242   old_last_coding_system_used = Vlast_coding_system_used;
6243   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6244
6245   /* We must insert the contents of STR as is without
6246      unibyte<->multibyte conversion.  For that, we adjust the
6247      multibyteness of the working buffer to that of STR.  */
6248   buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6249   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6250   UNGCPRO;
       /* Keep nested conversions from running pre-write or post-read
          functions while the function below is running.  */
6251   inhibit_pre_post_conversion = 1;
6252   prev = current_buffer;
       /* Call the function on the whole text of the work buffer.  */
6253   args[0] = coding->pre_write_conversion;
6254   args[1] = make_number (BEG);
6255   args[2] = make_number (Z);
6256   safe_call (3, args);
6257   inhibit_pre_post_conversion = 0;
6258   Vdeactivate_mark = old_deactivate_mark;
6259   Vlast_coding_system_used = old_last_coding_system_used;
       /* The result is the whole text of whichever buffer is current
          now.  */
6260   coding->produced_char = Z - BEG;
6261   coding->produced = Z_BYTE - BEG_BYTE;
6262   if (coding->produced > *size)
6263     {
6264       *size = coding->produced;
6265       *str = xrealloc (*str, *size);
6266     }
       /* Make the text contiguous by moving the gap out of its middle.
          After that the gap is either at BEG or at Z, so the text does
          not necessarily start at BEG_ADDR.  BYTE_POS_ADDR accounts for a
          gap that precedes BEG_BYTE and so gives the address of the first
          text byte in both cases.  */
6267   if (BEG < GPT && GPT < Z)
6268     move_gap (BEG);
6269   bcopy (BYTE_POS_ADDR (BEG_BYTE), *str, coding->produced);
       /* Record the multibyteness of the buffer the result was read
          from.  */
6270   coding->src_multibyte
6271     = ! NILP (current_buffer->enable_multibyte_characters);
       /* Kill the buffer the function switched to, if any, then go back
          to the original buffer and kill the extra work buffer, if one
          had to be generated.  */
6272   if (prev != current_buffer)
6273     Fkill_buffer (Fcurrent_buffer ());
6274   set_buffer_internal (cur);
6275   if (! NILP (buffer_to_kill))
6276     Fkill_buffer (buffer_to_kill);
6277 }
6278
6279
6280 Lisp_Object
6281 decode_coding_string (str, coding, nocopy)
6282      Lisp_Object str;
6283      struct coding_system *coding;
6284      int nocopy;
6285 {
       /* Decode the text of the string STR according to CODING and return
          the result as a string.  When no decoding is required and CODING
          has no fbound post-read-conversion symbol, the string is returned
          without Fcopy_sequence if NOCOPY is nonzero or if a fresh string
          was already made here.  CODING->consumed, CODING->consumed_char,
          CODING->produced and CODING->produced_char are set before
          returning and include the head and tail bytes that were skipped
          rather than converted.  */
6286   int len;
6287   struct conversion_buffer buf;
6288   int from, to_byte;
6289   Lisp_Object saved_coding_symbol;
6290   int result;
6291   int require_decoding;
6292   int shrinked_bytes = 0;
6293   Lisp_Object newstr;
6294   int consumed, consumed_char, produced, produced_char;
6295
       /* FROM and TO_BYTE are byte offsets into STR that delimit the part
          to convert; start with the whole string.  */
6296   from = 0;
6297   to_byte = SBYTES (str);
6298
6299   saved_coding_symbol = coding->symbol;
6300   coding->src_multibyte = STRING_MULTIBYTE (str);
6301   coding->dst_multibyte = 1;
6302   coding->heading_ascii = 0;
6303
6304   if (CODING_REQUIRE_DETECTION (coding))
6305     {
6306       /* See the comments in code_convert_region.  */
6307       if (coding->type == coding_type_undecided)
6308         {
6309           detect_coding (coding, SDATA (str), to_byte);
6310           if (coding->type == coding_type_undecided)
6311             {
6312               coding->type = coding_type_emacs_mule;
6313               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6314               /* As emacs-mule decoder will handle composition, we
6315                  need this setting to allocate coding->cmp_data
6316                  later.  */
6317               coding->composing = COMPOSITION_NO;
6318             }
6319         }
6320       if (coding->eol_type == CODING_EOL_UNDECIDED
6321           && coding->type != coding_type_ccl)
6322         {
6323           saved_coding_symbol = coding->symbol;
6324           detect_eol (coding, SDATA (str), to_byte);
6325           if (coding->eol_type == CODING_EOL_UNDECIDED)
6326             coding->eol_type = CODING_EOL_LF;
6327           /* We had better recover the original eol format if we
6328              encounter an inconsistent eol format while decoding.  */
6329           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6330         }
6331     }
6332
       /* For these two types the result string is unibyte.  */
6333   if (coding->type == coding_type_no_conversion
6334       || coding->type == coding_type_raw_text)
6335     coding->dst_multibyte = 0;
6336
6337   require_decoding = CODING_REQUIRE_DECODING (coding);
6338
6339   if (STRING_MULTIBYTE (str))
6340     {
6341       /* Decoding routines expect the source text to be unibyte.  */
6342       str = Fstring_as_unibyte (str);
6343       to_byte = SBYTES (str);
           /* STR was replaced above, so the caller no longer needs a copy
              to be made.  */
6344       nocopy = 1;
6345       coding->src_multibyte = 0;
6346     }
6347
6348   /* Try to skip the heading and tailing ASCIIs.  */
6349   if (require_decoding && coding->type != coding_type_ccl)
6350     {
6351       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6352                                 0);
6353       if (from == to_byte)
6354         require_decoding = 0;
           /* Bytes skipped at the head (FROM) plus bytes skipped at the
              tail.  */
6355       shrinked_bytes = from + (SBYTES (str) - to_byte);
6356     }
6357
       /* Nothing to decode and no post-read function to run: return the
          text as is, only adjusting its multibyteness.  */
6358   if (!require_decoding
6359       && !(SYMBOLP (coding->post_read_conversion)
6360            && !NILP (Ffboundp (coding->post_read_conversion))))
6361     {
6362       coding->consumed = SBYTES (str);
6363       coding->consumed_char = SCHARS (str);
6364       if (coding->dst_multibyte)
6365         {
6366           str = Fstring_as_multibyte (str);
6367           nocopy = 1;
6368         }
6369       coding->produced = SBYTES (str);
6370       coding->produced_char = SCHARS (str);
6371       return (nocopy ? str : Fcopy_sequence (str));
6372     }
6373
6374   if (coding->composing != COMPOSITION_DISABLED)
6375     coding_allocate_composition_data (coding, from);
6376   len = decoding_buffer_size (coding, to_byte - from);
6377   allocate_conversion_buffer (buf, len);
6378
       /* Decode in as many passes as needed.  Each pass continues at the
          first unconsumed source byte and appends to BUF; the four
          counters accumulate the per-pass figures that decode_coding
          leaves in CODING.  */
6379   consumed = consumed_char = produced = produced_char = 0;
6380   while (1)
6381     {
6382       result = decode_coding (coding, SDATA (str) + from + consumed,
6383                               buf.data + produced, to_byte - from - consumed,
6384                               buf.size - produced);
6385       consumed += coding->consumed;
6386       consumed_char += coding->consumed_char;
6387       produced += coding->produced;
6388       produced_char += coding->produced_char;
           /* Stop when done or interrupted, or when the source is reported
              insufficient and this pass consumed nothing.  */
6389       if (result == CODING_FINISH_NORMAL
6390           || result == CODING_FINISH_INTERRUPT
6391           || (result == CODING_FINISH_INSUFFICIENT_SRC
6392               && coding->consumed == 0))
6393         break;
6394       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6395         coding_allocate_composition_data (coding, from + produced_char);
6396       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6397         extend_conversion_buffer (&buf);
6398       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6399         {
6400           Lisp_Object eol_type;
6401
6402           /* Recover the original EOL format.  */
6403           if (coding->eol_type == CODING_EOL_CR)
6404             {
                   /* Turn every LF produced so far back into CR.  */
6405               unsigned char *p;
6406               for (p = buf.data; p < buf.data + produced; p++)
6407                 if (*p == '\n') *p = '\r';
6408             }
6409           else if (coding->eol_type == CODING_EOL_CRLF)
6410             {
                   /* Count the LFs produced so far, then copy the output
                      backward from its end, NUM_EOL bytes higher, putting
                      a CR in front of each LF.  */
6411               int num_eol = 0;
6412               unsigned char *p0, *p1;
6413               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6414                 if (*p0 == '\n') num_eol++;
6415               if (produced + num_eol >= buf.size)
6416                 extend_conversion_buffer (&buf);
6417               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6418                 {
6419                   *--p1 = *--p0;
6420                   if (*p0 == '\n') *--p1 = '\r';
6421                 }
6422               produced += num_eol;
6423               produced_char += num_eol;
6424             }
6425           /* Suppress eol-format conversion in the further conversion.  */
6426           coding->eol_type = CODING_EOL_LF;
6427
6428           /* Set the coding system symbol to that for Unix-like EOL.  */
6429           eol_type = Fget (saved_coding_symbol, Qeol_type);
6430           if (VECTORP (eol_type)
6431               && XVECTOR (eol_type)->size == 3
6432               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6433             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6434           else
6435             coding->symbol = saved_coding_symbol;
6436
6437
6438         }
6439     }
6440
6441   coding->consumed = consumed;
6442   coding->consumed_char = consumed_char;
6443   coding->produced = produced;
6444   coding->produced_char = produced_char;
6445
       /* Build the result: the skipped head of STR, the decoded text, and
          the skipped tail of STR.  Each skipped byte is counted as one
          character.  */
6446   if (coding->dst_multibyte)
6447     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6448                                            produced + shrinked_bytes);
6449   else
6450     newstr = make_uninit_string (produced + shrinked_bytes);
6451   if (from > 0)
6452     STRING_COPYIN (newstr, 0, SDATA (str), from);
6453   STRING_COPYIN (newstr, from, buf.data, produced);
6454   if (shrinked_bytes > from)
6455     STRING_COPYIN (newstr, from + produced,
6456                    SDATA (str) + to_byte,
6457                    shrinked_bytes - from);
6458   free_conversion_buffer (&buf);
6459
6460   coding->consumed += shrinked_bytes;
6461   coding->consumed_char += shrinked_bytes;
6462   coding->produced += shrinked_bytes;
6463   coding->produced_char += shrinked_bytes;
6464
6465   if (coding->cmp_data && coding->cmp_data->used)
6466     coding_restore_composition (coding, newstr);
6467   coding_free_composition_data (coding);
6468
       /* Finally let an fbound post-read-conversion symbol process the
          decoded string.  */
6469   if (SYMBOLP (coding->post_read_conversion)
6470       && !NILP (Ffboundp (coding->post_read_conversion)))
6471     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6472
6473   return newstr;
6474 }
6475
6476 Lisp_Object
6477 encode_coding_string (str, coding, nocopy)
6478      Lisp_Object str;
6479      struct coding_system *coding;
6480      int nocopy;
6481 {
       /* Encode the text of the string STR according to CODING and return
          the result as a unibyte string.  An fbound pre-write-conversion
          symbol of CODING is run on STR first.  When no encoding is
          needed, the string is returned without Fcopy_sequence if NOCOPY
          is nonzero or if a fresh string was already made here.
          CODING->consumed, CODING->consumed_char, CODING->produced and
          CODING->produced_char are set before returning.  */
6482   int len;
6483   struct conversion_buffer buf;
6484   int from, to, to_byte;
6485   int result;
6486   int shrinked_bytes = 0;
6487   Lisp_Object newstr;
6488   int consumed, consumed_char, produced, produced_char;
6489
6490   if (SYMBOLP (coding->pre_write_conversion)
6491       && !NILP (Ffboundp (coding->pre_write_conversion)))
6492     {
6493       str = run_pre_post_conversion_on_str (str, coding, 1);
6494       /* As STR is just newly generated, we don't have to copy it
6495          anymore.  */
6496       nocopy = 1;
6497     }
6498
       /* FROM and TO_BYTE are byte offsets into STR that delimit the part
          to convert; TO is the length of STR in characters.  */
6499   from = 0;
6500   to = SCHARS (str);
6501   to_byte = SBYTES (str);
6502
6503   /* Encoding routines determine the multibyteness of the source text
6504      by coding->src_multibyte.  */
6505   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6506   coding->dst_multibyte = 0;
6507   if (! CODING_REQUIRE_ENCODING (coding))
6508     goto no_need_of_encoding;
6509
6510   if (coding->composing != COMPOSITION_DISABLED)
6511     coding_save_composition (coding, from, to, str);
6512
6513   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6514      if we must run CCL program or there are compositions to
6515      encode.  */
6516   coding->heading_ascii = 0;
6517   if (coding->type != coding_type_ccl
6518       && (! coding->cmp_data || coding->cmp_data->used == 0))
6519     {
6520       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6521                                 1);
6522       if (from == to_byte)
6523         {
6524           coding_free_composition_data (coding);
6525           goto no_need_of_encoding;
6526         }
           /* Bytes skipped at the head (FROM) plus bytes skipped at the
              tail.  */
6527       shrinked_bytes = from + (SBYTES (str) - to_byte);
6528     }
6529
6530   len = encoding_buffer_size (coding, to_byte - from);
6531   allocate_conversion_buffer (buf, len);
6532
       /* Encode in as many passes as needed.  Each pass continues at the
          first unconsumed source byte and appends to BUF; the four
          counters accumulate the per-pass figures that encode_coding
          leaves in CODING.  */
6533   consumed = consumed_char = produced = produced_char = 0;
6534   while (1)
6535     {
6536       result = encode_coding (coding, SDATA (str) + from + consumed,
6537                               buf.data + produced, to_byte - from - consumed,
6538                               buf.size - produced);
6539       consumed += coding->consumed;
6540       consumed_char += coding->consumed_char;
6541       produced += coding->produced;
6542       produced_char += coding->produced_char;
6543       if (result == CODING_FINISH_NORMAL
6544           || result == CODING_FINISH_INTERRUPT
6545           || (result == CODING_FINISH_INSUFFICIENT_SRC
6546               && coding->consumed == 0))
6547         break;
6548       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6549       extend_conversion_buffer (&buf);
6550     }
6551
       /* Unlike decode_coding_string, the skipped bytes are not added to
          these four counters.  */
6552   coding->consumed = consumed;
6553   coding->consumed_char = consumed_char;
6554   coding->produced = produced;
6555   coding->produced_char = produced_char;
6556
       /* Build the result: the skipped head of STR, the encoded text, and
          the skipped tail of STR.  */
6557   newstr = make_uninit_string (produced + shrinked_bytes);
6558   if (from > 0)
6559     STRING_COPYIN (newstr, 0, SDATA (str), from);
6560   STRING_COPYIN (newstr, from, buf.data, produced);
6561   if (shrinked_bytes > from)
6562     STRING_COPYIN (newstr, from + produced,
6563                    SDATA (str) + to_byte,
6564                    shrinked_bytes - from);
6565
6566   free_conversion_buffer (&buf);
6567   coding_free_composition_data (coding);
6568
6569   return newstr;
6570
       /* Reached when CODING requires no encoding or the whole text was
          skipped: return the text itself, made unibyte.  */
6571  no_need_of_encoding:
6572   coding->consumed = SBYTES (str);
6573   coding->consumed_char = SCHARS (str);
6574   if (STRING_MULTIBYTE (str))
6575     {
6576       if (nocopy)
6577         /* We are sure that STR doesn't contain a multibyte
6578            character.  */
6579         STRING_SET_UNIBYTE (str);
6580       else
6581         {
6582           str = Fstring_as_unibyte (str);
6583           nocopy = 1;
6584         }
6585     }
6586   coding->produced = SBYTES (str);
6587   coding->produced_char = SCHARS (str);
6588   return (nocopy ? str : Fcopy_sequence (str));
6589 }
6590
6591 \f
6592 #ifdef emacs
6593 /*** 8. Emacs Lisp library functions ***/
6594
6595 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6596        doc: /* Return t if OBJECT is nil or a coding-system.
6597 See the documentation of `make-coding-system' for information
6598 about coding-system objects.  */)
6599      (obj)
6600      Lisp_Object obj;
6601 {
       /* nil is accepted; any other non-symbol is rejected.  */
6602   if (NILP (obj))
6603     return Qt;
6604   if (!SYMBOLP (obj))
6605     return Qnil;
       /* A symbol that still carries a define form is accepted without
          looking at its coding-spec; Fcheck_coding_system is the place
          where such a form gets evaluated.  */
6606   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6607     return Qt;
6608   /* Get coding-spec vector for OBJ.  */
6609   obj = Fget (obj, Qcoding_system);
       /* The coding-spec must be a vector of exactly five elements.  */
6610   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6611           ? Qt : Qnil);
6612 }
6613
6614 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6615        Sread_non_nil_coding_system, 1, 1, 0,
6616        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6617      (prompt)
6618      Lisp_Object prompt;
6619 {
6620   Lisp_Object val;
       /* Keep asking until the user enters a non-empty name; completion
          is done against Vcoding_system_alist.  */
6621   do
6622     {
6623       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6624                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6625     }
6626   while (SCHARS (val) == 0);
       /* Return the symbol with the entered name.  */
6627   return (Fintern (val, Qnil));
6628 }
6629
6630 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6631        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6632 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
6633 Ignores case when completing coding systems (all Emacs coding systems
6634 are lower-case).  */)
6635      (prompt, default_coding_system)
6636      Lisp_Object prompt, default_coding_system;
6637 {
6638   Lisp_Object val;
6639   int count = SPECPDL_INDEX ();
6640
       /* Fcompleting_read is given the default as a string, so replace a
          symbol by its name.  */
6641   if (SYMBOLP (default_coding_system))
6642     default_coding_system = SYMBOL_NAME (default_coding_system);
       /* Bind completion-ignore-case to t for the duration of the read
          only; unbind_to below undoes the binding.  */
6643   specbind (Qcompletion_ignore_case, Qt);
6644   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6645                           Qt, Qnil, Qcoding_system_history,
6646                           default_coding_system, Qnil);
6647   unbind_to (count, Qnil);
       /* An empty result yields nil; anything else is interned.  */
6648   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6649 }
6650
6651 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6652        1, 1, 0,
6653        doc: /* Check validity of CODING-SYSTEM.
6654 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6655 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6656 The value of this property should be a vector of length 5.  */)
6657      (coding_system)
6658      Lisp_Object coding_system;
6659 {
6660   Lisp_Object define_form;
6661
       /* If CODING-SYSTEM still has a pending define form, evaluate it
          now.  The property is cleared before the evaluation, so the form
          is not found again by a later or nested check.  */
6662   define_form = Fget (coding_system, Qcoding_system_define_form);
6663   if (! NILP (define_form))
6664     {
6665       Fput (coding_system, Qcoding_system_define_form, Qnil);
6666       safe_eval (define_form);
6667     }
6668   if (!NILP (Fcoding_system_p (coding_system)))
6669     return coding_system;
       /* NOTE(review): no return statement follows, so xsignal1 is
          presumably a function that never returns -- confirm.  */
6670   xsignal1 (Qcoding_system_error, coding_system);
6671 }
6672 \f
6673 Lisp_Object
6674 detect_coding_system (src, src_bytes, highest, multibytep)
6675      const unsigned char *src;
6676      int src_bytes, highest;
6677      int multibytep;
6678 {
       /* Detect how the SRC_BYTES bytes at SRC are encoded.  If HIGHEST is
          nonzero, return the single coding system of highest priority, or
          nil if none is found; otherwise return a list of the possible
          coding systems in the order of Vcoding_category_list.
          MULTIBYTEP is handed on to detect_coding_mask.  When an
          end-of-line format is detected, each coding system is replaced
          by the matching element of its eol-type vector, if it has
          one.  */
6679   int coding_mask, eol_type;
6680   Lisp_Object val, tmp;
6681   int dummy;
6682
6683   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6684   eol_type  = detect_eol_type (src, src_bytes, &dummy);
       /* An inconsistent end-of-line format is treated as an undecided
          one from here on.  */
6685   if (eol_type == CODING_EOL_INCONSISTENT)
6686     eol_type = CODING_EOL_UNDECIDED;
6687
       /* No category matched: answer with undecided, or with its
          subsidiary for the detected end-of-line format.  */
6688   if (!coding_mask)
6689     {
6690       val = Qundecided;
6691       if (eol_type != CODING_EOL_UNDECIDED)
6692         {
6693           Lisp_Object val2;
6694           val2 = Fget (Qundecided, Qeol_type);
6695           if (VECTORP (val2))
6696             val = XVECTOR (val2)->contents[eol_type];
6697         }
6698       return (highest ? val : Fcons (val, Qnil));
6699     }
6700
6701   /* At first, gather possible coding systems in VAL.  */
6702   val = Qnil;
6703   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6704     {
6705       Lisp_Object category_val, category_index;
6706
6707       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6708       category_val = Fsymbol_value (XCAR (tmp));
           /* Take the category only if it has a non-nil value and its
              index bit is set in CODING_MASK.  */
6709       if (!NILP (category_val)
6710           && NATNUMP (category_index)
6711           && (coding_mask & (1 << XFASTINT (category_index))))
6712         {
6713           val = Fcons (category_val, val);
               /* The first match has the highest priority.  */
6714           if (highest)
6715             break;
6716         }
6717     }
       /* The list was built by consing at the front, so put it back into
          priority order.  */
6718   if (!highest)
6719     val = Fnreverse (val);
6720
6721   /* Then, replace the elements with subsidiary coding systems.  */
6722   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6723     {
6724       if (eol_type != CODING_EOL_UNDECIDED
6725           && eol_type != CODING_EOL_INCONSISTENT)
6726         {
6727           Lisp_Object eol;
6728           eol = Fget (XCAR (tmp), Qeol_type);
6729           if (VECTORP (eol))
6730             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6731         }
6732     }
       /* VAL is still nil when no category in Vcoding_category_list
          qualified above, so it must be checked before XCAR is applied
          to it.  */
6733   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
6734 }
6735
6736 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6737        2, 3, 0,
6738        doc: /* Detect how the byte sequence in the region is encoded.
6739 Return a list of possible coding systems used on decoding a byte
6740 sequence containing the bytes in the region between START and END when
6741 the coding system `undecided' is specified.  The list is ordered by
6742 priority decided in the current language environment.
6743
6744 If only ASCII characters are found (except for such ISO-2022 control
6745 characters ISO-2022 as ESC), it returns a list of single element
6746 `undecided' or its subsidiary coding system according to a detected
6747 end-of-line format.
6748
6749 If optional argument HIGHEST is non-nil, return the coding system of
6750 highest priority.  */)
6751      (start, end, highest)
6752      Lisp_Object start, end, highest;
6753 {
6754   int from, to;
6755   int from_byte, to_byte;
6756   int include_anchor_byte = 0;
6757
6758   CHECK_NUMBER_COERCE_MARKER (start);
6759   CHECK_NUMBER_COERCE_MARKER (end);
6760
6761   validate_region (&start, &end);
6762   from = XINT (start), to = XINT (end);
6763   from_byte = CHAR_TO_BYTE (from);
6764   to_byte = CHAR_TO_BYTE (to);
6765
       /* If the gap lies inside the region, move it to the end of the
          region so that the bytes of the region are contiguous.  */
6766   if (from < GPT && to >= GPT)
6767     move_gap_both (to, to_byte);
6768   /* If an anchor byte `\0' follows the region, we include it in
6769      the detecting source.  Then code detectors can handle the trailing
6770      byte sequence more accurately.
6771
6772      Fix me: This is not a perfect solution.  It is better that we
6773      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6774   */
       /* The test covers a region that ends at the end of the buffer or
          right before a non-empty gap.  */
6775   if (to == Z || (to == GPT && GAP_SIZE > 0))
6776     include_anchor_byte = 1;
6777   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6778                                to_byte - from_byte + include_anchor_byte,
6779                                !NILP (highest),
6780                                !NILP (current_buffer
6781                                       ->enable_multibyte_characters));
6782 }
6783
6784 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6785        1, 2, 0,
6786        doc: /* Detect how the byte sequence in STRING is encoded.
6787 Return a list of possible coding systems used on decoding a byte
6788 sequence containing the bytes in STRING when the coding system
6789 `undecided' is specified.  The list is ordered by priority decided in
6790 the current language environment.
6791
6792 If only ASCII characters are found (except for such ISO-2022 control
6793 characters ISO-2022 as ESC), it returns a list of single element
6794 `undecided' or its subsidiary coding system according to a detected
6795 end-of-line format.
6796
6797 If optional argument HIGHEST is non-nil, return the coding system of
6798 highest priority.  */)
6799      (string, highest)
6800      Lisp_Object string, highest;
6801 {
6802   CHECK_STRING (string);
6803
       /* Run the detection on the bytes of STRING, passing on whether the
          string is multibyte.  */
6804   return detect_coding_system (SDATA (string),
6805                                /* "+ 1" is to include the anchor byte
6806                                   `\0'.  With this, code detectors can
6807                                   handle the tailing bytes more
6808                                   accurately.  */
6809                                SBYTES (string) + 1,
6810                                !NILP (highest),
6811                                STRING_MULTIBYTE (string));
6812 }
6813
6814 /*  Subroutine for Ffind_coding_systems_region_internal.
6815
6816     Return a list of coding systems that safely encode the multibyte
6817     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6818     possible coding systems.  If it is nil, it means that we have not
6819     yet found any coding systems.
6820
6821     WORK_TABLE a char-table of which element is set to t once the
6822     element is looked up.
6823
6824     If a non-ASCII single byte char is found, set
6825     *single_byte_char_found to 1.  */
6826
6827 static Lisp_Object
6828 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6829      unsigned char *p, *pend;
6830      Lisp_Object safe_codings, work_table;
6831      int *single_byte_char_found;
6832 {
6833   int c, len;
6834   Lisp_Object val, ch;
6835   Lisp_Object prev, tail;
6836
6837   if (NILP (safe_codings))
6838     goto done_safe_codings;
6839   while (p < pend)
6840     {
6841       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6842       p += len;
6843       if (ASCII_BYTE_P (c))
6844         /* We can ignore ASCII characters here.  */
6845         continue;
6846       if (SINGLE_BYTE_CHAR_P (c))
6847         *single_byte_char_found = 1;
6848       /* Check the safe coding systems for C.  */
6849       ch = make_number (c);
6850       val = Faref (work_table, ch);
6851       if (EQ (val, Qt))
6852         /* This element was already checked.  Ignore it.  */
6853         continue;
6854       /* Remember that we checked this element.  */
6855       Faset (work_table, ch, Qt);
6856
6857       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6858         {
6859           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6860           int encodable;
6861
6862           elt = XCAR (tail);
6863           if (CONSP (XCDR (elt)))
6864             {
6865               /* This entry has this format now:
6866                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6867                           ACCEPT-LATIN-EXTRA ) */
6868               val = XCDR (elt);
6869               encodable = ! NILP (Faref (XCAR (val), ch));
6870               if (! encodable)
6871                 {
6872                   val = XCDR (val);
6873                   translation_table = XCAR (val);
6874                   hash_table = XCAR (XCDR (val));
6875                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6876                 }
6877             }
6878           else
6879             {
6880               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6881               encodable = ! NILP (Faref (XCDR (elt), ch));
6882               if (! encodable)
6883                 {
6884                   /* Transform the format to:
6885                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6886                        ACCEPT-LATIN-EXTRA )  */
6887                   val = Fget (XCAR (elt), Qcoding_system);
6888                   translation_table
6889                     = Fplist_get (AREF (val, 3),
6890                                   Qtranslation_table_for_encode);
6891                   if (SYMBOLP (translation_table))
6892                     translation_table = Fget (translation_table,
6893                                               Qtranslation_table);
6894                   hash_table
6895                     = (CHAR_TABLE_P (translation_table)
6896                        ? XCHAR_TABLE (translation_table)->extras[1]
6897                        : Qnil);
6898                   accept_latin_extra
6899                     = ((EQ (AREF (val, 0), make_number (2))
6900                         && VECTORP (AREF (val, 4)))
6901                        ? AREF (AREF (val, 4), 16)
6902                        : Qnil);
6903                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6904                                         translation_table, hash_table,
6905                                         accept_latin_extra));
6906                 }
6907             }
6908
6909           if (! encodable
6910               && ((CHAR_TABLE_P (translation_table)
6911                    && ! NILP (Faref (translation_table, ch)))
6912                   || (HASH_TABLE_P (hash_table)
6913                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6914                   || (SINGLE_BYTE_CHAR_P (c)
6915                       && ! NILP (accept_latin_extra)
6916                       && VECTORP (Vlatin_extra_code_table)
6917                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6918             encodable = 1;
6919           if (encodable)
6920             prev = tail;
6921           else
6922             {
6923               /* Exclude this coding system from SAFE_CODINGS.  */
6924               if (EQ (tail, safe_codings))
6925                 {
6926                   safe_codings = XCDR (safe_codings);
6927                   if (NILP (safe_codings))
6928                     goto done_safe_codings;
6929                 }
6930               else
6931                 XSETCDR (prev, XCDR (tail));
6932             }
6933         }
6934     }
6935
6936  done_safe_codings:
6937   /* If the above loop was terminated before P reaches PEND, it means
6938      SAFE_CODINGS was set to nil.  If we have not yet found an
6939      non-ASCII single-byte char, check it now.  */
6940   if (! *single_byte_char_found)
6941     while (p < pend)
6942       {
6943         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6944         p += len;
6945         if (! ASCII_BYTE_P (c)
6946             && SINGLE_BYTE_CHAR_P (c))
6947           {
6948             *single_byte_char_found = 1;
6949             break;
6950           }
6951       }
6952   return safe_codings;
6953 }
6954
6955 DEFUN ("find-coding-systems-region-internal",
6956        Ffind_coding_systems_region_internal,
6957        Sfind_coding_systems_region_internal, 2, 2, 0,
6958        doc: /* Internal use only.  */)
6959      (start, end)
6960      Lisp_Object start, end;
6961 {
6962   Lisp_Object work_table, safe_codings;
6963   int non_ascii_p = 0;
6964   int single_byte_char_found = 0;
6965   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6966
6967   if (STRINGP (start))
6968     {
6969       if (!STRING_MULTIBYTE (start))
6970         return Qt;
6971       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6972       p2 = p2end = p1end;
6973       if (SCHARS (start) != SBYTES (start))
6974         non_ascii_p = 1;
6975     }
6976   else
6977     {
6978       int from, to, stop;
6979
6980       CHECK_NUMBER_COERCE_MARKER (start);
6981       CHECK_NUMBER_COERCE_MARKER (end);
6982       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6983         args_out_of_range (start, end);
6984       if (NILP (current_buffer->enable_multibyte_characters))
6985         return Qt;
6986       from = CHAR_TO_BYTE (XINT (start));
6987       to = CHAR_TO_BYTE (XINT (end));
6988       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6989       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6990       if (stop == to)
6991         p2 = p2end = p1end;
6992       else
6993         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6994       if (XINT (end) - XINT (start) != to - from)
6995         non_ascii_p = 1;
6996     }
6997
6998   if (!non_ascii_p)
6999     {
7000       /* We are sure that the text contains no multibyte character.
7001          Check if it contains eight-bit-graphic.  */
7002       p = p1;
7003       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
7004       if (p == p1end)
7005         {
7006           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
7007           if (p == p2end)
7008             return Qt;
7009         }
7010     }
7011
7012   /* The text contains non-ASCII characters.  */
7013
7014   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
7015   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
7016
7017   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
7018                                     &single_byte_char_found);
7019   if (p2 < p2end)
7020     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
7021                                       &single_byte_char_found);
7022   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
7023     safe_codings = Qt;
7024   else
7025     {
7026       /* Turn safe_codings to a list of coding systems... */
7027       Lisp_Object val;
7028
7029       if (single_byte_char_found)
7030         /* ... and append these for eight-bit chars.  */
7031         val = Fcons (Qraw_text,
7032                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
7033       else
7034         /* ... and append generic coding systems.  */
7035         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
7036
7037       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
7038         val = Fcons (XCAR (XCAR (safe_codings)), val);
7039       safe_codings = val;
7040     }
7041
7042   return safe_codings;
7043 }
7044
7045
7046 /* Search from position POS for such characters that are unencodable
7047    accoding to SAFE_CHARS, and return a list of their positions.  P
7048    points where in the memory the character at POS exists.  Limit the
7049    search at PEND or when Nth unencodable characters are found.
7050
7051    If SAFE_CHARS is a char table, an element for an unencodable
7052    character is nil.
7053
7054    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
7055
7056    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
7057    eight-bit-graphic characters are unencodable.  */
7058
7059 static Lisp_Object
7060 unencodable_char_position (safe_chars, pos, p, pend, n)
7061      Lisp_Object safe_chars;
7062      int pos;
7063      unsigned char *p, *pend;
7064      int n;
7065 {
7066   Lisp_Object pos_list;
7067
7068   pos_list = Qnil;
7069   while (p < pend)
7070     {
7071       int len;
7072       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7073
7074       if (c >= 128
7075           && (CHAR_TABLE_P (safe_chars)
7076               ? NILP (CHAR_TABLE_REF (safe_chars, c))
7077               : (NILP (safe_chars) || c < 256)))
7078         {
7079           pos_list = Fcons (make_number (pos), pos_list);
7080           if (--n <= 0)
7081             break;
7082         }
7083       pos++;
7084       p += len;
7085     }
7086   return Fnreverse (pos_list);
7087 }
7088
7089
7090 DEFUN ("unencodable-char-position", Funencodable_char_position,
7091        Sunencodable_char_position, 3, 5, 0,
7092        doc: /*
7093 Return position of first un-encodable character in a region.
7094 START and END specfiy the region and CODING-SYSTEM specifies the
7095 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7096
7097 If optional 4th argument COUNT is non-nil, it specifies at most how
7098 many un-encodable characters to search.  In this case, the value is a
7099 list of positions.
7100
7101 If optional 5th argument STRING is non-nil, it is a string to search
7102 for un-encodable characters.  In that case, START and END are indexes
7103 to the string.  */)
7104      (start, end, coding_system, count, string)
7105      Lisp_Object start, end, coding_system, count, string;
7106 {
7107   int n;
7108   Lisp_Object safe_chars;
7109   struct coding_system coding;
7110   Lisp_Object positions;
7111   int from, to;
7112   unsigned char *p, *pend;
7113
7114   if (NILP (string))
7115     {
7116       validate_region (&start, &end);
7117       from = XINT (start);
7118       to = XINT (end);
7119       if (NILP (current_buffer->enable_multibyte_characters))
7120         return Qnil;
7121       p = CHAR_POS_ADDR (from);
7122       if (to == GPT)
7123         pend = GPT_ADDR;
7124       else
7125         pend = CHAR_POS_ADDR (to);
7126     }
7127   else
7128     {
7129       CHECK_STRING (string);
7130       CHECK_NATNUM (start);
7131       CHECK_NATNUM (end);
7132       from = XINT (start);
7133       to = XINT (end);
7134       if (from > to
7135           || to > SCHARS (string))
7136         args_out_of_range_3 (string, start, end);
7137       if (! STRING_MULTIBYTE (string))
7138         return Qnil;
7139       p = SDATA (string) + string_char_to_byte (string, from);
7140       pend = SDATA (string) + string_char_to_byte (string, to);
7141     }
7142
7143   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7144
7145   if (NILP (count))
7146     n = 1;
7147   else
7148     {
7149       CHECK_NATNUM (count);
7150       n = XINT (count);
7151     }
7152
7153   if (coding.type == coding_type_no_conversion
7154       || coding.type == coding_type_raw_text)
7155     return Qnil;
7156
7157   if (coding.type == coding_type_undecided)
7158     safe_chars = Qnil;
7159   else
7160     safe_chars = coding_safe_chars (coding_system);
7161
7162   if (STRINGP (string)
7163       || from >= GPT || to <= GPT)
7164     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7165   else
7166     {
7167       Lisp_Object args[2];
7168
7169       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7170       n -= XINT (Flength (args[0]));
7171       if (n <= 0)
7172         positions = args[0];
7173       else
7174         {
7175           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7176                                                pend, n);
7177           positions = Fappend (2, args);
7178         }
7179     }
7180
7181   return  (NILP (count) ? Fcar (positions) : positions);
7182 }
7183
7184
7185 Lisp_Object
7186 code_convert_region1 (start, end, coding_system, encodep)
7187      Lisp_Object start, end, coding_system;
7188      int encodep;
7189 {
7190   struct coding_system coding;
7191   int from, to;
7192
7193   CHECK_NUMBER_COERCE_MARKER (start);
7194   CHECK_NUMBER_COERCE_MARKER (end);
7195   CHECK_SYMBOL (coding_system);
7196
7197   validate_region (&start, &end);
7198   from = XFASTINT (start);
7199   to = XFASTINT (end);
7200
7201   if (NILP (coding_system))
7202     return make_number (to - from);
7203
7204   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7205     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7206
7207   coding.mode |= CODING_MODE_LAST_BLOCK;
7208   coding.src_multibyte = coding.dst_multibyte
7209     = !NILP (current_buffer->enable_multibyte_characters);
7210   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7211                        &coding, encodep, 1);
7212   Vlast_coding_system_used = coding.symbol;
7213   return make_number (coding.produced_char);
7214 }
7215
7216 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7217        3, 3, "r\nzCoding system: ",
7218        doc: /* Decode the current region from the specified coding system.
7219 When called from a program, takes three arguments:
7220 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7221 This function sets `last-coding-system-used' to the precise coding system
7222 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7223 not fully specified.)
7224 It returns the length of the decoded text.  */)
7225      (start, end, coding_system)
7226      Lisp_Object start, end, coding_system;
7227 {
7228   return code_convert_region1 (start, end, coding_system, 0);
7229 }
7230
7231 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7232        3, 3, "r\nzCoding system: ",
7233        doc: /* Encode the current region into the specified coding system.
7234 When called from a program, takes three arguments:
7235 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7236 This function sets `last-coding-system-used' to the precise coding system
7237 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7238 not fully specified.)
7239 It returns the length of the encoded text.  */)
7240      (start, end, coding_system)
7241      Lisp_Object start, end, coding_system;
7242 {
7243   return code_convert_region1 (start, end, coding_system, 1);
7244 }
7245
7246 Lisp_Object
7247 code_convert_string1 (string, coding_system, nocopy, encodep)
7248      Lisp_Object string, coding_system, nocopy;
7249      int encodep;
7250 {
7251   struct coding_system coding;
7252
7253   CHECK_STRING (string);
7254   CHECK_SYMBOL (coding_system);
7255
7256   if (NILP (coding_system))
7257     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7258
7259   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7260     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7261
7262   coding.mode |= CODING_MODE_LAST_BLOCK;
7263   string = (encodep
7264             ? encode_coding_string (string, &coding, !NILP (nocopy))
7265             : decode_coding_string (string, &coding, !NILP (nocopy)));
7266   Vlast_coding_system_used = coding.symbol;
7267
7268   return string;
7269 }
7270
7271 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7272        2, 3, 0,
7273        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7274 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7275 if the decoding operation is trivial.
7276 This function sets `last-coding-system-used' to the precise coding system
7277 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7278 not fully specified.)  */)
7279      (string, coding_system, nocopy)
7280      Lisp_Object string, coding_system, nocopy;
7281 {
7282   return code_convert_string1 (string, coding_system, nocopy, 0);
7283 }
7284
7285 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7286        2, 3, 0,
7287        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7288 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7289 if the encoding operation is trivial.
7290 This function sets `last-coding-system-used' to the precise coding system
7291 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7292 not fully specified.)  */)
7293      (string, coding_system, nocopy)
7294      Lisp_Object string, coding_system, nocopy;
7295 {
7296   return code_convert_string1 (string, coding_system, nocopy, 1);
7297 }
7298
7299 /* Encode or decode STRING according to CODING_SYSTEM.
7300    Do not set Vlast_coding_system_used.
7301
7302    This function is called only from macros DECODE_FILE and
7303    ENCODE_FILE, thus we ignore character composition.  */
7304
7305 Lisp_Object
7306 code_convert_string_norecord (string, coding_system, encodep)
7307      Lisp_Object string, coding_system;
7308      int encodep;
7309 {
7310   struct coding_system coding;
7311
7312   CHECK_STRING (string);
7313   CHECK_SYMBOL (coding_system);
7314
7315   if (NILP (coding_system))
7316     return string;
7317
7318   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7319     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7320
7321   coding.composing = COMPOSITION_DISABLED;
7322   coding.mode |= CODING_MODE_LAST_BLOCK;
7323   return (encodep
7324           ? encode_coding_string (string, &coding, 1)
7325           : decode_coding_string (string, &coding, 1));
7326 }
7327 \f
7328 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7329        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7330 Return the corresponding character.  */)
7331      (code)
7332      Lisp_Object code;
7333 {
7334   unsigned char c1, c2, s1, s2;
7335   Lisp_Object val;
7336
7337   CHECK_NUMBER (code);
7338   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7339   if (s1 == 0)
7340     {
7341       if (s2 < 0x80)
7342         XSETFASTINT (val, s2);
7343       else if (s2 >= 0xA0 || s2 <= 0xDF)
7344         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7345       else
7346         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7347     }
7348   else
7349     {
7350       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7351           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7352         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7353       DECODE_SJIS (s1, s2, c1, c2);
7354       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7355     }
7356   return val;
7357 }
7358
7359 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7360        doc: /* Encode a Japanese character CH to shift_jis encoding.
7361 Return the corresponding code in SJIS.  */)
7362      (ch)
7363      Lisp_Object ch;
7364 {
7365   int charset, c1, c2, s1, s2;
7366   Lisp_Object val;
7367
7368   CHECK_NUMBER (ch);
7369   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7370   if (charset == CHARSET_ASCII)
7371     {
7372       val = ch;
7373     }
7374   else if (charset == charset_jisx0208
7375            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7376     {
7377       ENCODE_SJIS (c1, c2, s1, s2);
7378       XSETFASTINT (val, (s1 << 8) | s2);
7379     }
7380   else if (charset == charset_katakana_jisx0201
7381            && c1 > 0x20 && c2 < 0xE0)
7382     {
7383       XSETFASTINT (val, c1 | 0x80);
7384     }
7385   else
7386     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7387   return val;
7388 }
7389
7390 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7391        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7392 Return the corresponding character.  */)
7393      (code)
7394      Lisp_Object code;
7395 {
7396   int charset;
7397   unsigned char b1, b2, c1, c2;
7398   Lisp_Object val;
7399
7400   CHECK_NUMBER (code);
7401   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7402   if (b1 == 0)
7403     {
7404       if (b2 >= 0x80)
7405         error ("Invalid BIG5 code: %x", XFASTINT (code));
7406       val = code;
7407     }
7408   else
7409     {
7410       if ((b1 < 0xA1 || b1 > 0xFE)
7411           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7412         error ("Invalid BIG5 code: %x", XFASTINT (code));
7413       DECODE_BIG5 (b1, b2, charset, c1, c2);
7414       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7415     }
7416   return val;
7417 }
7418
7419 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7420        doc: /* Encode the Big5 character CH to BIG5 coding system.
7421 Return the corresponding character code in Big5.  */)
7422      (ch)
7423      Lisp_Object ch;
7424 {
7425   int charset, c1, c2, b1, b2;
7426   Lisp_Object val;
7427
7428   CHECK_NUMBER (ch);
7429   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7430   if (charset == CHARSET_ASCII)
7431     {
7432       val = ch;
7433     }
7434   else if ((charset == charset_big5_1
7435             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7436            || (charset == charset_big5_2
7437                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7438     {
7439       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7440       XSETFASTINT (val, (b1 << 8) | b2);
7441     }
7442   else
7443     error ("Can't encode to Big5: %d", XFASTINT (ch));
7444   return val;
7445 }
7446 \f
7447 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7448        Sset_terminal_coding_system_internal, 1, 1, 0,
7449        doc: /* Internal use only.  */)
7450      (coding_system)
7451      Lisp_Object coding_system;
7452 {
7453   CHECK_SYMBOL (coding_system);
7454   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7455   /* We had better not send unsafe characters to terminal.  */
7456   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7457   /* Character composition should be disabled.  */
7458   terminal_coding.composing = COMPOSITION_DISABLED;
7459   /* Error notification should be suppressed.  */
7460   terminal_coding.suppress_error = 1;
7461   terminal_coding.src_multibyte = 1;
7462   terminal_coding.dst_multibyte = 0;
7463   return Qnil;
7464 }
7465
7466 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7467        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7468        doc: /* Internal use only.  */)
7469      (coding_system)
7470      Lisp_Object coding_system;
7471 {
7472   CHECK_SYMBOL (coding_system);
7473   setup_coding_system (Fcheck_coding_system (coding_system),
7474                        &safe_terminal_coding);
7475   /* Character composition should be disabled.  */
7476   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7477   /* Error notification should be suppressed.  */
7478   safe_terminal_coding.suppress_error = 1;
7479   safe_terminal_coding.src_multibyte = 1;
7480   safe_terminal_coding.dst_multibyte = 0;
7481   return Qnil;
7482 }
7483
7484 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7485        Sterminal_coding_system, 0, 0, 0,
7486        doc: /* Return coding system specified for terminal output.  */)
7487      ()
7488 {
7489   return terminal_coding.symbol;
7490 }
7491
7492 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7493        Sset_keyboard_coding_system_internal, 1, 1, 0,
7494        doc: /* Internal use only.  */)
7495      (coding_system)
7496      Lisp_Object coding_system;
7497 {
7498   CHECK_SYMBOL (coding_system);
7499   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7500   /* Character composition should be disabled.  */
7501   keyboard_coding.composing = COMPOSITION_DISABLED;
7502   return Qnil;
7503 }
7504
7505 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7506        Skeyboard_coding_system, 0, 0, 0,
7507        doc: /* Return coding system specified for decoding keyboard input.  */)
7508      ()
7509 {
7510   return keyboard_coding.symbol;
7511 }
7512
7513 \f
7514 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7515        Sfind_operation_coding_system,  1, MANY, 0,
7516        doc: /* Choose a coding system for an operation based on the target name.
7517 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7518 DECODING-SYSTEM is the coding system to use for decoding
7519 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7520 for encoding (in case OPERATION does encoding).
7521
7522 The first argument OPERATION specifies an I/O primitive:
7523   For file I/O, `insert-file-contents' or `write-region'.
7524   For process I/O, `call-process', `call-process-region', or `start-process'.
7525   For network I/O, `open-network-stream'.
7526
7527 The remaining arguments should be the same arguments that were passed
7528 to the primitive.  Depending on which primitive, one of those arguments
7529 is selected as the TARGET.  For example, if OPERATION does file I/O,
7530 whichever argument specifies the file name is TARGET.
7531
7532 TARGET has a meaning which depends on OPERATION:
7533   For file I/O, TARGET is a file name (except for the special case below).
7534   For process I/O, TARGET is a process name.
7535   For network I/O, TARGET is a service name or a port number
7536
7537 This function looks up what specified for TARGET in,
7538 `file-coding-system-alist', `process-coding-system-alist',
7539 or `network-coding-system-alist' depending on OPERATION.
7540 They may specify a coding system, a cons of coding systems,
7541 or a function symbol to call.
7542 In the last case, we call the function with one argument,
7543 which is a list of all the arguments given to this function.
7544
7545 If OPERATION is `insert-file-contents', the argument corresponding to
7546 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
7547 file name to look up, and BUFFER is a buffer that contains the file's
7548 contents (not yet decoded).  If `file-coding-system-alist' specifies a
7549 function to call for FILENAME, that function should examine the
7550 contents of BUFFER instead of reading the file.
7551
7552 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
7553      (nargs, args)
7554      int nargs;
7555      Lisp_Object *args;
7556 {
7557   Lisp_Object operation, target_idx, target, val;
7558   register Lisp_Object chain;
7559
7560   if (nargs < 2)
7561     error ("Too few arguments");
7562   operation = args[0];
7563   if (!SYMBOLP (operation)
7564       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7565     error ("Invalid first argument");
7566   if (nargs < 1 + XINT (target_idx))
7567     error ("Too few arguments for operation: %s",
7568            SDATA (SYMBOL_NAME (operation)));
7569   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7570      argument to write-region) is string, it must be treated as a
7571      target file name.  */
7572   if (EQ (operation, Qwrite_region)
7573       && nargs > 5
7574       && STRINGP (args[5]))
7575     target_idx = make_number (4);
7576   target = args[XINT (target_idx) + 1];
7577   if (!(STRINGP (target)
7578         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7579             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7580         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7581     error ("Invalid argument %d", XINT (target_idx) + 1);
7582   if (CONSP (target))
7583     target = XCAR (target);
7584
7585   chain = ((EQ (operation, Qinsert_file_contents)
7586             || EQ (operation, Qwrite_region))
7587            ? Vfile_coding_system_alist
7588            : (EQ (operation, Qopen_network_stream)
7589               ? Vnetwork_coding_system_alist
7590               : Vprocess_coding_system_alist));
7591   if (NILP (chain))
7592     return Qnil;
7593
7594   for (; CONSP (chain); chain = XCDR (chain))
7595     {
7596       Lisp_Object elt;
7597       elt = XCAR (chain);
7598
7599       if (CONSP (elt)
7600           && ((STRINGP (target)
7601                && STRINGP (XCAR (elt))
7602                && fast_string_match (XCAR (elt), target) >= 0)
7603               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7604         {
7605           val = XCDR (elt);
7606           /* Here, if VAL is both a valid coding system and a valid
7607              function symbol, we return VAL as a coding system.  */
7608           if (CONSP (val))
7609             return val;
7610           if (! SYMBOLP (val))
7611             return Qnil;
7612           if (! NILP (Fcoding_system_p (val)))
7613             return Fcons (val, val);
7614           if (! NILP (Ffboundp (val)))
7615             {
7616               /* We use call1 rather than safe_call1
7617                  so as to get bug reports about functions called here
7618                  which don't handle the current interface.  */
7619               val = call1 (val, Flist (nargs, args));
7620               if (CONSP (val))
7621                 return val;
7622               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7623                 return Fcons (val, val);
7624             }
7625           return Qnil;
7626         }
7627     }
7628   return Qnil;
7629 }
7630
7631 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7632        Supdate_coding_systems_internal, 0, 0, 0,
7633        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7634 When values of any coding categories are changed, you must
7635 call this function.  */)
7636      ()
7637 {
7638   int i;
7639
7640   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7641     {
7642       Lisp_Object val;
7643
7644       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7645       if (!NILP (val))
7646         {
7647           if (! coding_system_table[i])
7648             coding_system_table[i] = ((struct coding_system *)
7649                                       xmalloc (sizeof (struct coding_system)));
7650           setup_coding_system (val, coding_system_table[i]);
7651         }
7652       else if (coding_system_table[i])
7653         {
7654           xfree (coding_system_table[i]);
7655           coding_system_table[i] = NULL;
7656         }
7657     }
7658
7659   return Qnil;
7660 }
7661
7662 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7663        Sset_coding_priority_internal, 0, 0, 0,
7664        doc: /* Update internal database for the current value of `coding-category-list'.
7665 This function is internal use only.  */)
7666      ()
7667 {
7668   int i = 0, idx;
7669   Lisp_Object val;
7670
7671   val = Vcoding_category_list;
7672
7673   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7674     {
7675       if (! SYMBOLP (XCAR (val)))
7676         break;
7677       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7678       if (idx >= CODING_CATEGORY_IDX_MAX)
7679         break;
7680       coding_priorities[i++] = (1 << idx);
7681       val = XCDR (val);
7682     }
7683   /* If coding-category-list is valid and contains all coding
7684      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7685      the following code saves Emacs from crashing.  */
7686   while (i < CODING_CATEGORY_IDX_MAX)
7687     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7688
7689   return Qnil;
7690 }
7691
7692 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7693        Sdefine_coding_system_internal, 1, 1, 0,
7694        doc: /* Register CODING-SYSTEM as a base coding system.
7695 This function is internal use only.  */)
7696      (coding_system)
7697      Lisp_Object coding_system;
7698 {
7699   Lisp_Object safe_chars, slot;
7700
7701   if (NILP (Fcheck_coding_system (coding_system)))
7702     xsignal1 (Qcoding_system_error, coding_system);
7703
7704   safe_chars = coding_safe_chars (coding_system);
7705   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7706     error ("No valid safe-chars property for %s",
7707            SDATA (SYMBOL_NAME (coding_system)));
7708
7709   if (EQ (safe_chars, Qt))
7710     {
7711       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7712         XSETCAR (Vcoding_system_safe_chars,
7713                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7714     }
7715   else
7716     {
7717       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7718       if (NILP (slot))
7719         XSETCDR (Vcoding_system_safe_chars,
7720                  nconc2 (XCDR (Vcoding_system_safe_chars),
7721                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7722       else
7723         XSETCDR (slot, safe_chars);
7724     }
7725   return Qnil;
7726 }
7727
7728 #endif /* emacs */
7729
7730 \f
7731 /*** 9. Post-amble ***/
7732
7733 void
7734 init_coding_once ()
7735 {
7736   int i;
7737
7738   /* Emacs' internal format specific initialize routine.  */
7739   for (i = 0; i <= 0x20; i++)
7740     emacs_code_class[i] = EMACS_control_code;
7741   emacs_code_class[0x0A] = EMACS_linefeed_code;
7742   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7743   for (i = 0x21 ; i < 0x7F; i++)
7744     emacs_code_class[i] = EMACS_ascii_code;
7745   emacs_code_class[0x7F] = EMACS_control_code;
7746   for (i = 0x80; i < 0xFF; i++)
7747     emacs_code_class[i] = EMACS_invalid_code;
7748   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7749   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7750   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7751   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7752
7753   /* ISO2022 specific initialize routine.  */
7754   for (i = 0; i < 0x20; i++)
7755     iso_code_class[i] = ISO_control_0;
7756   for (i = 0x21; i < 0x7F; i++)
7757     iso_code_class[i] = ISO_graphic_plane_0;
7758   for (i = 0x80; i < 0xA0; i++)
7759     iso_code_class[i] = ISO_control_1;
7760   for (i = 0xA1; i < 0xFF; i++)
7761     iso_code_class[i] = ISO_graphic_plane_1;
7762   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7763   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7764   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7765   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7766   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7767   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7768   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7769   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7770   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7771   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7772
7773   setup_coding_system (Qnil, &keyboard_coding);
7774   setup_coding_system (Qnil, &terminal_coding);
7775   setup_coding_system (Qnil, &safe_terminal_coding);
7776   setup_coding_system (Qnil, &default_buffer_file_coding);
7777
7778   bzero (coding_system_table, sizeof coding_system_table);
7779
7780   bzero (ascii_skip_code, sizeof ascii_skip_code);
7781   for (i = 0; i < 128; i++)
7782     ascii_skip_code[i] = 1;
7783
7784 #if defined (MSDOS) || defined (WINDOWSNT)
7785   system_eol_type = CODING_EOL_CRLF;
7786 #else
7787   system_eol_type = CODING_EOL_LF;
7788 #endif
7789
7790   inhibit_pre_post_conversion = 0;
7791 }
7792
7793 #ifdef emacs
7794
7795 void
7796 syms_of_coding ()
7797 {
7798   staticpro (&Vcode_conversion_workbuf_name);
7799   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7800
7801   Qtarget_idx = intern ("target-idx");
7802   staticpro (&Qtarget_idx);
7803
7804   Qcoding_system_history = intern ("coding-system-history");
7805   staticpro (&Qcoding_system_history);
7806   Fset (Qcoding_system_history, Qnil);
7807
7808   /* Target FILENAME is the first argument.  */
7809   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7810   /* Target FILENAME is the third argument.  */
7811   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7812
7813   Qcall_process = intern ("call-process");
7814   staticpro (&Qcall_process);
7815   /* Target PROGRAM is the first argument.  */
7816   Fput (Qcall_process, Qtarget_idx, make_number (0));
7817
7818   Qcall_process_region = intern ("call-process-region");
7819   staticpro (&Qcall_process_region);
7820   /* Target PROGRAM is the third argument.  */
7821   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7822
7823   Qstart_process = intern ("start-process");
7824   staticpro (&Qstart_process);
7825   /* Target PROGRAM is the third argument.  */
7826   Fput (Qstart_process, Qtarget_idx, make_number (2));
7827
7828   Qopen_network_stream = intern ("open-network-stream");
7829   staticpro (&Qopen_network_stream);
7830   /* Target SERVICE is the fourth argument.  */
7831   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7832
7833   Qcoding_system = intern ("coding-system");
7834   staticpro (&Qcoding_system);
7835
7836   Qeol_type = intern ("eol-type");
7837   staticpro (&Qeol_type);
7838
7839   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7840   staticpro (&Qbuffer_file_coding_system);
7841
7842   Qpost_read_conversion = intern ("post-read-conversion");
7843   staticpro (&Qpost_read_conversion);
7844
7845   Qpre_write_conversion = intern ("pre-write-conversion");
7846   staticpro (&Qpre_write_conversion);
7847
7848   Qno_conversion = intern ("no-conversion");
7849   staticpro (&Qno_conversion);
7850
7851   Qundecided = intern ("undecided");
7852   staticpro (&Qundecided);
7853
7854   Qcoding_system_p = intern ("coding-system-p");
7855   staticpro (&Qcoding_system_p);
7856
7857   Qcoding_system_error = intern ("coding-system-error");
7858   staticpro (&Qcoding_system_error);
7859
7860   Fput (Qcoding_system_error, Qerror_conditions,
7861         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7862   Fput (Qcoding_system_error, Qerror_message,
7863         build_string ("Invalid coding system"));
7864
7865   Qcoding_category = intern ("coding-category");
7866   staticpro (&Qcoding_category);
7867   Qcoding_category_index = intern ("coding-category-index");
7868   staticpro (&Qcoding_category_index);
7869
7870   Vcoding_category_table
7871     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7872   staticpro (&Vcoding_category_table);
7873   {
7874     int i;
7875     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7876       {
7877         XVECTOR (Vcoding_category_table)->contents[i]
7878           = intern (coding_category_name[i]);
7879         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7880               Qcoding_category_index, make_number (i));
7881       }
7882   }
7883
7884   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7885   staticpro (&Vcoding_system_safe_chars);
7886
7887   Qtranslation_table = intern ("translation-table");
7888   staticpro (&Qtranslation_table);
7889   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7890
7891   Qtranslation_table_id = intern ("translation-table-id");
7892   staticpro (&Qtranslation_table_id);
7893
7894   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7895   staticpro (&Qtranslation_table_for_decode);
7896
7897   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7898   staticpro (&Qtranslation_table_for_encode);
7899
7900   Qsafe_chars = intern ("safe-chars");
7901   staticpro (&Qsafe_chars);
7902
7903   Qchar_coding_system = intern ("char-coding-system");
7904   staticpro (&Qchar_coding_system);
7905
7906   /* Intern this now in case it isn't already done.
7907      Setting this variable twice is harmless.
7908      But don't staticpro it here--that is done in alloc.c.  */
7909   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7910   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7911   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7912
7913   Qvalid_codes = intern ("valid-codes");
7914   staticpro (&Qvalid_codes);
7915
7916   Qascii_incompatible = intern ("ascii-incompatible");
7917   staticpro (&Qascii_incompatible);
7918
7919   Qemacs_mule = intern ("emacs-mule");
7920   staticpro (&Qemacs_mule);
7921
7922   Qraw_text = intern ("raw-text");
7923   staticpro (&Qraw_text);
7924
7925   Qutf_8 = intern ("utf-8");
7926   staticpro (&Qutf_8);
7927
7928   Qcoding_system_define_form = intern ("coding-system-define-form");
7929   staticpro (&Qcoding_system_define_form);
7930
7931   defsubr (&Scoding_system_p);
7932   defsubr (&Sread_coding_system);
7933   defsubr (&Sread_non_nil_coding_system);
7934   defsubr (&Scheck_coding_system);
7935   defsubr (&Sdetect_coding_region);
7936   defsubr (&Sdetect_coding_string);
7937   defsubr (&Sfind_coding_systems_region_internal);
7938   defsubr (&Sunencodable_char_position);
7939   defsubr (&Sdecode_coding_region);
7940   defsubr (&Sencode_coding_region);
7941   defsubr (&Sdecode_coding_string);
7942   defsubr (&Sencode_coding_string);
7943   defsubr (&Sdecode_sjis_char);
7944   defsubr (&Sencode_sjis_char);
7945   defsubr (&Sdecode_big5_char);
7946   defsubr (&Sencode_big5_char);
7947   defsubr (&Sset_terminal_coding_system_internal);
7948   defsubr (&Sset_safe_terminal_coding_system_internal);
7949   defsubr (&Sterminal_coding_system);
7950   defsubr (&Sset_keyboard_coding_system_internal);
7951   defsubr (&Skeyboard_coding_system);
7952   defsubr (&Sfind_operation_coding_system);
7953   defsubr (&Supdate_coding_systems_internal);
7954   defsubr (&Sset_coding_priority_internal);
7955   defsubr (&Sdefine_coding_system_internal);
7956
7957   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7958                doc: /* List of coding systems.
7959
7960 Do not alter the value of this variable manually.  This variable should be
7961 updated by the functions `make-coding-system' and
7962 `define-coding-system-alias'.  */);
7963   Vcoding_system_list = Qnil;
7964
7965   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7966                doc: /* Alist of coding system names.
7967 Each element is one element list of coding system name.
7968 This variable is given to `completing-read' as TABLE argument.
7969
7970 Do not alter the value of this variable manually.  This variable should be
7971 updated by the functions `make-coding-system' and
7972 `define-coding-system-alias'.  */);
7973   Vcoding_system_alist = Qnil;
7974
7975   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7976                doc: /* List of coding-categories (symbols) ordered by priority.
7977
7978 On detecting a coding system, Emacs tries code detection algorithms
7979 associated with each coding-category one by one in this order.  When
7980 one algorithm agrees with a byte sequence of source text, the coding
7981 system bound to the corresponding coding-category is selected.
7982
7983 Don't modify this variable directly, but use `set-coding-priority'.  */);
7984   {
7985     int i;
7986
7987     Vcoding_category_list = Qnil;
7988     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7989       Vcoding_category_list
7990         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7991                  Vcoding_category_list);
7992   }
7993
7994   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7995                doc: /* Specify the coding system for read operations.
7996 It is useful to bind this variable with `let', but do not set it globally.
7997 If the value is a coding system, it is used for decoding on read operation.
7998 If not, an appropriate element is used from one of the coding system alists:
7999 There are three such tables, `file-coding-system-alist',
8000 `process-coding-system-alist', and `network-coding-system-alist'.  */);
8001   Vcoding_system_for_read = Qnil;
8002
8003   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8004                doc: /* Specify the coding system for write operations.
8005 Programs bind this variable with `let', but you should not set it globally.
8006 If the value is a coding system, it is used for encoding of output,
8007 when writing it to a file and when sending it to a file or subprocess.
8008
8009 If this does not specify a coding system, an appropriate element
8010 is used from one of the coding system alists:
8011 There are three such tables, `file-coding-system-alist',
8012 `process-coding-system-alist', and `network-coding-system-alist'.
8013 For output to files, if the above procedure does not specify a coding system,
8014 the value of `buffer-file-coding-system' is used.  */);
8015   Vcoding_system_for_write = Qnil;
8016
8017   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8018                doc: /* Coding system used in the latest file or process I/O.
8019 Also set by `encode-coding-region', `decode-coding-region',
8020 `encode-coding-string' and `decode-coding-string'.  */);
8021   Vlast_coding_system_used = Qnil;
8022
8023   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8024                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
8025 See info node `Coding Systems' and info node `Text and Binary' concerning
8026 such conversion.  */);
8027   inhibit_eol_conversion = 0;
8028
8029   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8030                doc: /* Non-nil means process buffer inherits coding system of process output.
8031 Bind it to t if the process output is to be treated as if it were a file
8032 read from some filesystem.  */);
8033   inherit_process_coding_system = 0;
8034
8035   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8036                doc: /* Alist to decide a coding system to use for a file I/O operation.
8037 The format is ((PATTERN . VAL) ...),
8038 where PATTERN is a regular expression matching a file name,
8039 VAL is a coding system, a cons of coding systems, or a function symbol.
8040 If VAL is a coding system, it is used for both decoding and encoding
8041 the file contents.
8042 If VAL is a cons of coding systems, the car part is used for decoding,
8043 and the cdr part is used for encoding.
8044 If VAL is a function symbol, the function must return a coding system
8045 or a cons of coding systems which are used as above.  The function is
8046 called with an argument that is a list of the arguments with which
8047 `find-operation-coding-system' was called.
8048
8049 See also the function `find-operation-coding-system'
8050 and the variable `auto-coding-alist'.  */);
8051   Vfile_coding_system_alist = Qnil;
8052
8053   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8054     doc: /* Alist to decide a coding system to use for a process I/O operation.
8055 The format is ((PATTERN . VAL) ...),
8056 where PATTERN is a regular expression matching a program name,
8057 VAL is a coding system, a cons of coding systems, or a function symbol.
8058 If VAL is a coding system, it is used for both decoding what received
8059 from the program and encoding what sent to the program.
8060 If VAL is a cons of coding systems, the car part is used for decoding,
8061 and the cdr part is used for encoding.
8062 If VAL is a function symbol, the function must return a coding system
8063 or a cons of coding systems which are used as above.
8064
8065 See also the function `find-operation-coding-system'.  */);
8066   Vprocess_coding_system_alist = Qnil;
8067
8068   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8069     doc: /* Alist to decide a coding system to use for a network I/O operation.
8070 The format is ((PATTERN . VAL) ...),
8071 where PATTERN is a regular expression matching a network service name
8072 or is a port number to connect to,
8073 VAL is a coding system, a cons of coding systems, or a function symbol.
8074 If VAL is a coding system, it is used for both decoding what received
8075 from the network stream and encoding what sent to the network stream.
8076 If VAL is a cons of coding systems, the car part is used for decoding,
8077 and the cdr part is used for encoding.
8078 If VAL is a function symbol, the function must return a coding system
8079 or a cons of coding systems which are used as above.
8080
8081 See also the function `find-operation-coding-system'.  */);
8082   Vnetwork_coding_system_alist = Qnil;
8083
8084   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8085                doc: /* Coding system to use with system messages.
8086 Also used for decoding keyboard input on X Window system.  */);
8087   Vlocale_coding_system = Qnil;
8088
8089   /* The eol mnemonics are reset in startup.el system-dependently.  */
8090   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8091                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8092   eol_mnemonic_unix = build_string (":");
8093
8094   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8095                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8096   eol_mnemonic_dos = build_string ("\\");
8097
8098   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8099                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8100   eol_mnemonic_mac = build_string ("/");
8101
8102   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8103                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
8104   eol_mnemonic_undecided = build_string (":");
8105
8106   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8107                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
8108   Venable_character_translation = Qt;
8109
8110   DEFVAR_LISP ("standard-translation-table-for-decode",
8111                &Vstandard_translation_table_for_decode,
8112                doc: /* Table for translating characters while decoding.  */);
8113   Vstandard_translation_table_for_decode = Qnil;
8114
8115   DEFVAR_LISP ("standard-translation-table-for-encode",
8116                &Vstandard_translation_table_for_encode,
8117                doc: /* Table for translating characters while encoding.  */);
8118   Vstandard_translation_table_for_encode = Qnil;
8119
8120   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8121                doc: /* Alist of charsets vs revision numbers.
8122 While encoding, if a charset (car part of an element) is found,
8123 designate it with the escape sequence identifying revision (cdr part of the element).  */);
8124   Vcharset_revision_alist = Qnil;
8125
8126   DEFVAR_LISP ("default-process-coding-system",
8127                &Vdefault_process_coding_system,
8128                doc: /* Cons of coding systems used for process I/O by default.
8129 The car part is used for decoding a process output,
8130 the cdr part is used for encoding a text to be sent to a process.  */);
8131   Vdefault_process_coding_system = Qnil;
8132
8133   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8134                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8135 This is a vector of length 256.
8136 If Nth element is non-nil, the existence of code N in a file
8137 \(or output of subprocess) doesn't prevent it to be detected as
8138 a coding system of ISO 2022 variant which has a flag
8139 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8140 or reading output of a subprocess.
8141 Only 128th through 159th elements has a meaning.  */);
8142   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8143
8144   DEFVAR_LISP ("select-safe-coding-system-function",
8145                &Vselect_safe_coding_system_function,
8146                doc: /* Function to call to select safe coding system for encoding a text.
8147
8148 If set, this function is called to force a user to select a proper
8149 coding system which can encode the text in the case that a default
8150 coding system used in each operation can't encode the text.
8151
8152 The default value is `select-safe-coding-system' (which see).  */);
8153   Vselect_safe_coding_system_function = Qnil;
8154
8155   DEFVAR_BOOL ("coding-system-require-warning",
8156                &coding_system_require_warning,
8157                doc: /* Internal use only.
8158 If non-nil, on writing a file, `select-safe-coding-system-function' is
8159 called even if `coding-system-for-write' is non-nil.  The command
8160 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8161   coding_system_require_warning = 0;
8162
8163
8164   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8165                &inhibit_iso_escape_detection,
8166                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8167
8168 By default, on reading a file, Emacs tries to detect how the text is
8169 encoded.  This code detection is sensitive to escape sequences.  If
8170 the sequence is valid as ISO2022, the code is determined as one of
8171 the ISO2022 encodings, and the file is decoded by the corresponding
8172 coding system (e.g. `iso-2022-7bit').
8173
8174 However, there may be a case that you want to read escape sequences in
8175 a file as is.  In such a case, you can set this variable to non-nil.
8176 Then, as the code detection ignores any escape sequences, no file is
8177 detected as encoded in some ISO2022 encoding.  The result is that all
8178 escape sequences become visible in a buffer.
8179
8180 The default value is nil, and it is strongly recommended not to change
8181 it.  That is because many Emacs Lisp source files that contain
8182 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8183 in Emacs's distribution, and they won't be decoded correctly on
8184 reading if you suppress escape sequence detection.
8185
8186 The other way to read escape sequences in a file without decoding is
8187 to explicitly specify some coding system that doesn't use ISO2022's
8188 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8189   inhibit_iso_escape_detection = 0;
8190
8191   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8192                doc: /* Char table for translating self-inserting characters.
8193 This is applied to the result of input methods, not their input.  See also
8194 `keyboard-translate-table'.  */);
8195     Vtranslation_table_for_input = Qnil;
8196 }
8197
8198 char *
8199 emacs_strerror (error_number)
8200      int error_number;
8201 {
8202   char *str;
8203
8204   synchronize_system_messages_locale ();
8205   str = strerror (error_number);
8206
8207   if (! NILP (Vlocale_coding_system))
8208     {
8209       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8210                                                       Vlocale_coding_system,
8211                                                       0);
8212       str = (char *) SDATA (dec);
8213     }
8214
8215   return str;
8216 }
8217
8218 #endif /* emacs */
8219
8220 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8221    (do not change this comment) */