src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 3, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  24 Boston, MA 02110-1301, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-mule) handlers
  31   3. ISO2022 handlers
  32   4. Shift-JIS and BIG5 handlers
  33   5. CCL handlers
  34   6. End-of-line handlers
  35   7. C library functions
  36   8. Emacs Lisp library functions
  37   9. Post-amble
  38
  39 */
  40
  41 /*** 0. General comments ***/
  42
  43
  44 /*** GENERAL NOTE on CODING SYSTEMS ***
  45
  46   A coding system is an encoding mechanism for one or more character
  47   sets.  Here's a list of coding systems which Emacs can handle.  When
  48   we say "decode", it means converting some other coding system to
  49   Emacs' internal format (emacs-mule), and when we say "encode",
  50   it means converting the coding system emacs-mule to some other
  51   coding system.
  52
  53   0. Emacs' internal format (emacs-mule)
  54
  55   Emacs itself holds a multi-lingual character in buffers and strings
  56   in a special format.  Details are described in section 2.
  57
  58   1. ISO2022
  59
  60   The most famous coding system for multiple character sets.  X's
  61   Compound Text, various EUCs (Extended Unix Code), and coding
  62   systems used in Internet communication such as ISO-2022-JP are
  63   all variants of ISO2022.  Details are described in section 3.
  64
  65   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  66
  67   A coding system to encode character sets: ASCII, JISX0201, and
  68   JISX0208.  Widely used for PC's in Japan.  Details are described in
  69   section 4.
  70
  71   3. BIG5
  72
  73   A coding system to encode the character sets ASCII and Big5.  Widely
  74   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  75   described in section 4.  In this file, when we write "BIG5"
  76   (all uppercase), we mean the coding system, and when we write
  77   "Big5" (capitalized), we mean the character set.
  78
  79   4. Raw text
  80
  81   A coding system for text containing random 8-bit code.  Emacs does
  82   no code conversion on such text except for end-of-line format.
  83
  84   5. Other
  85
  86   If a user wants to read/write text encoded in a coding system not
  87   listed above, he can supply a decoder and an encoder for it as CCL
  88   (Code Conversion Language) programs.  Emacs executes the CCL program
  89   while reading/writing.
  90
  91   Emacs represents a coding system by a Lisp symbol that has a property
  92   `coding-system'.  But, before actually using the coding system, the
  93   information about it is set in a structure of type `struct
  94   coding_system' for rapid processing.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  99
 100   How end-of-line of text is encoded depends on the operating system.
 101   For instance, Unix's format is just one byte of `line-feed' code,
 102   whereas DOS's format is two-byte sequence of `carriage-return' and
 103   `line-feed' codes.  MacOS's format is usually one byte of
 104   `carriage-return'.
 105
 106   Since text character encoding and end-of-line encoding are
 107   independent, any coding system described above can have any
 108   end-of-line format.  So Emacs has information about end-of-line
 109   format in each coding-system.  See section 6 for more details.
 110
 111 */
 112
 113 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 114
 115   These functions check if a text between SRC and SRC_END is encoded
 116   in the coding system category XXX.  Each returns an integer value in
 117   which appropriate flag bits for the category XXX are set.  The flag
 118   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 119   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 120   of the range 0x80..0x9F are in multibyte form.  */
 121 #if 0
 122 int
 123 detect_coding_emacs_mule (src, src_end, multibytep)
 124      unsigned char *src, *src_end;
 125      int multibytep;
 126 {
 127   ...
 128 }
 129 #endif
 130
 131 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 132
 133   These functions decode SRC_BYTES length of unibyte text at SOURCE
 134   encoded in CODING to Emacs' internal format.  The resulting
 135   multibyte text goes to a place pointed to by DESTINATION, the length
 136   of which should not exceed DST_BYTES.
 137
 138   These functions set the information about original and decoded texts
 139   in the members `produced', `produced_char', `consumed', and
 140   `consumed_char' of the structure *CODING.  They also set the member
 141   `result' to one of CODING_FINISH_XXX indicating how the decoding
 142   finished.
 143
 144   DST_BYTES zero means that the source area and destination area are
 145   overlapped, which means that we can produce a decoded text until it
 146   reaches the head of the not-yet-decoded source text.
 147
 148   Below is a template for these functions.  */
 149 #if 0
 150 static void
 151 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 152      struct coding_system *coding;
 153      const unsigned char *source;
 154      unsigned char *destination;
 155      int src_bytes, dst_bytes;
 156 {
 157   ...
 158 }
 159 #endif
 160
 161 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 162
 163   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 164   internal multibyte format to CODING.  The resulting unibyte text
 165   goes to a place pointed to by DESTINATION, the length of which
 166   should not exceed DST_BYTES.
 167
 168   These functions set the information about original and encoded texts
 169   in the members `produced', `produced_char', `consumed', and
 170   `consumed_char' of the structure *CODING.  They also set the member
 171   `result' to one of CODING_FINISH_XXX indicating how the encoding
 172   finished.
 173
 174   DST_BYTES zero means that the source area and destination area are
 175   overlapped, which means that we can produce encoded text until it
 176   reaches the head of the not-yet-encoded source text.
 177
 178   Below is a template for these functions.  */
 179 #if 0
 180 static void
 181 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 182      struct coding_system *coding;
 183      unsigned char *source, *destination;
 184      int src_bytes, dst_bytes;
 185 {
 186   ...
 187 }
 188 #endif
 189
 190 /*** COMMONLY USED MACROS ***/
 191
 192 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 193    get one and two bytes from the source text respectively.
 194    If there are not enough bytes in the source, they jump to
 195    `label_end_of_loop'.  The caller should set variables `coding',
 196    `src' and `src_end' to appropriate pointers in advance.  These
 197    macros are called from decoding routines `decode_coding_XXX', thus
 198    it is assumed that the source text is unibyte.  */
 199
 200 #define ONE_MORE_BYTE(c1)                                       \
 201   do { /* Fetch the next source byte into C1 and advance `src'.  */ \
 202     if (src >= src_end)                                         \
 203       { /* Source exhausted: record why and leave the loop.  */ \
 204         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 205         goto label_end_of_loop;                                 \
 206       }                                                         \
 207     c1 = *src++;                                                \
 208   } while (0)
 209
 210 #define TWO_MORE_BYTES(c1, c2)                                  \
 211   do { /* Fetch the next two source bytes into C1 and C2.  */   \
 212     if (src + 1 >= src_end)                                     \
 213       { /* Fewer than two bytes left: consume none of them.  */ \
 214         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 215         goto label_end_of_loop;                                 \
 216       }                                                         \
 217     c1 = *src++;                                                \
 218     c2 = *src++;                                                \
 219   } while (0)
 220
 221
 222 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 223    form if MULTIBYTEP is nonzero.  If SRC is not less than SRC_END, or
 224    such a form lacks its second byte before SRC_END, return with RET.  */
 225
 226 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)      \
 227   do {                                                          \
 228     if (src >= src_end                                          \
 229         || ((multibytep) && src + 1 >= src_end                  \
 230             && *src == LEADING_CODE_8_BIT_CONTROL))             \
 231       { coding->result = CODING_FINISH_INSUFFICIENT_SRC; return ret; } \
 232     /* A two-byte 8-bit control form, if any, is now fully available.  */ \
 233     c1 = *src++;                                                \
 234     if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)       \
 235       c1 = *src++ - 0x20;                                       \
 236   } while (0)
 237
 238 /* Set C to the next character at the source text pointed by `src'.
 239    If there are not enough characters in the source, jump to
 240    `label_end_of_loop'.  The caller should set variables `coding',
 241    `src', `src_end', and `translation_table' to appropriate pointers
 242    in advance.  This macro is used in encoding routines
 243    `encode_coding_XXX', thus it assumes that the source text is in
 244    multibyte form except for 8-bit characters.  8-bit characters are
 245    in multibyte form if coding->src_multibyte is nonzero, else they
 246    are represented by a single byte.  */
 247
 248 #define ONE_MORE_CHAR(c)                                        \
 249   do { /* LEN and BYTES below are local to this expansion.  */  \
 250     int len = src_end - src;    /* bytes left in the source */  \
 251     int bytes;                  /* bytes consumed for C */      \
 252     if (len <= 0)                                               \
 253       { /* Nothing left: record why and leave the loop.  */     \
 254         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 255         goto label_end_of_loop;                                 \
 256       }                                                         \
 257     if (coding->src_multibyte                                   \
 258         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 259       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 260     else /* Otherwise take a single byte as is.  */             \
 261       c = *src, bytes = 1;                                      \
 262     if (!NILP (translation_table)) /* nil: no translation */    \
 263       c = translate_char (translation_table, c, -1, 0, 0);      \
 264     src += bytes;                                               \
 265   } while (0)
 266
 267
 268 /* Produce a multibyte form of character C to `dst'.  Jump to
 269    `label_end_of_loop' if there's not enough space at `dst'.
 270
 271    If we are now in the middle of a composition sequence, the decoded
 272    character may be ALTCHAR (for the current composition).  In that
 273    case, the character goes to coding->cmp_data->data instead of
 274    `dst'.
 275
 276    This macro is used in decoding routines.  */
 277
 278 #define EMIT_CHAR(c)                                                    \
 279   do {                                                                  \
 280     if (! COMPOSING_P (coding)                                          \
 281         || coding->composing == COMPOSITION_RELATIVE                    \
 282         || coding->composing == COMPOSITION_WITH_RULE)                  \
 283       { /* In these three states C is written to `dst'.  */             \
 284         int bytes = CHAR_BYTES (c);                                     \
 285         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 286           { /* No room; the limit is `src' if dst_bytes is zero.  */    \
 287             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 288             goto label_end_of_loop;                                     \
 289           }                                                             \
 290         dst += CHAR_STRING (c, dst);                                    \
 291         coding->produced_char++;                                        \
 292       }                                                                 \
 293     /* While composing, except COMPOSITION_RELATIVE, also record C.  */ \
 294     if (COMPOSING_P (coding)                                            \
 295         && coding->composing != COMPOSITION_RELATIVE)                   \
 296       {                                                                 \
 297         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 298         coding->composition_rule_follows                                \
 299           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 300       }                                                                 \
 301   } while (0)
 302
 303
 304 #define EMIT_ONE_BYTE(c)                                        \
 305   do { /* Store byte C at `dst' and advance `dst'.  */          \
 306     if (dst >= (dst_bytes ? dst_end : src))                     \
 307       { /* No room; the limit is `src' if dst_bytes is zero.  */ \
 308         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 309         goto label_end_of_loop;                                 \
 310       }                                                         \
 311     *dst++ = c;                                                 \
 312   } while (0)
 313
 314 #define EMIT_TWO_BYTES(c1, c2)                                  \
 315   do { /* Store bytes C1 and C2 at `dst', or neither.  */       \
 316     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 317       { /* No room; the limit is `src' if dst_bytes is zero.  */ \
 318         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 319         goto label_end_of_loop;                                 \
 320       }                                                         \
 321     *dst++ = c1, *dst++ = c2;                                   \
 322   } while (0)
 323
 324 #define EMIT_BYTES(from, to)                                    \
 325   do { /* Copy bytes FROM..TO (TO excluded) to `dst', or none.  */ \
 326     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 327       { /* No room; the limit is `src' if dst_bytes is zero.  */ \
 328         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 329         goto label_end_of_loop;                                 \
 330       }                                                         \
 331     while (from < to) /* FROM itself is advanced up to TO.  */  \
 332       *dst++ = *from++;                                         \
 333   } while (0)
 334
 335 \f
 336 /*** 1. Preamble ***/
 337
 338 #ifdef emacs
 339 #include <config.h>
 340 #endif
 341
 342 #include <stdio.h>
 343
 344 #ifdef emacs
 345
 346 #include "lisp.h"
 347 #include "buffer.h"
 348 #include "charset.h"
 349 #include "composite.h"
 350 #include "ccl.h"
 351 #include "coding.h"
 352 #include "window.h"
 353 #include "intervals.h"
 354
 355 #else  /* not emacs */
 356
 357 #include "mulelib.h"
 358
 359 #endif /* not emacs */
 360
 361 Lisp_Object Qcoding_system, Qeol_type; /* Qcoding_system: property read by coding_safe_chars */
 362 Lisp_Object Qbuffer_file_coding_system;
 363 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 364 Lisp_Object Qno_conversion, Qundecided;
 365 Lisp_Object Qcoding_system_history;
 366 Lisp_Object Qsafe_chars; /* plist key looked up by coding_safe_chars */
 367 Lisp_Object Qvalid_codes;
 368 Lisp_Object Qascii_incompatible;
 369
 370 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 371 Lisp_Object Qcall_process, Qcall_process_region;
 372 Lisp_Object Qstart_process, Qopen_network_stream;
 373 Lisp_Object Qtarget_idx;
 374
 375 extern Lisp_Object Qcompletion_ignore_case;
 376
 377 /* If a symbol has this property, evaluate the value to define the
 378    symbol as a coding system.  */
 379 Lisp_Object Qcoding_system_define_form;
 380
 381 Lisp_Object Vselect_safe_coding_system_function;
 382
 383 int coding_system_require_warning;
 384
 385 /* Mnemonic string for each format of end-of-line.  */
 386 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 387 /* Mnemonic string indicating that the end-of-line format is not yet
 388    decided.  */
 389 Lisp_Object eol_mnemonic_undecided;
 390
 391 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 392    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
 393    This has an effect only for external encoding (i.e. for output to
 394    a file or a process), not for in-buffer or Lisp string encoding.  */
 395 int system_eol_type;
 396
 397 #ifdef emacs
 398
 399 /* Information about which coding system is safe for which chars.
 400    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 401
 402    GENERIC-LIST is a list of generic coding systems which can encode
 403    any character.
 404
 405    NON-GENERIC-ALIST is an alist of non-generic coding systems vs the
 406    corresponding char table that contains safe chars.  */
 407 Lisp_Object Vcoding_system_safe_chars;
 408
 409 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 410
 411 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 412
 413 /* Coding systems emacs-mule and raw-text convert only the
 414    end-of-line format.  */
 415 Lisp_Object Qemacs_mule, Qraw_text;
 416
 417 Lisp_Object Qutf_8;
 418
 419 /* Coding systems are passed between Emacs Lisp programs and C internal
 420    routines by the following three variables.  */
 421 /* Coding system for reading files and receiving data from a process.  */
 422 Lisp_Object Vcoding_system_for_read;
 423 /* Coding system for writing files and sending data to a process.  */
 424 Lisp_Object Vcoding_system_for_write;
 425 /* Coding system actually used in the latest I/O.  */
 426 Lisp_Object Vlast_coding_system_used;
 427
 428 /* A vector of length 256 which contains information about special
 429    Latin codes (especially for dealing with Microsoft codes).  */
 430 Lisp_Object Vlatin_extra_code_table;
 431
 432 /* Flag to inhibit code conversion of end-of-line format.  */
 433 int inhibit_eol_conversion;
 434
 435 /* Flag to inhibit ISO2022 escape sequence detection.  */
 436 int inhibit_iso_escape_detection;
 437
 438 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 439 int inherit_process_coding_system;
 440
 441 /* Coding system to be used to encode text for terminal display.  */
 442 struct coding_system terminal_coding;
 443
 444 /* Coding system to be used to encode text for terminal display when
 445    the terminal coding system is nil.  */
 446 struct coding_system safe_terminal_coding;
 447
 448 /* Coding system of what is sent from the terminal keyboard.  */
 449 struct coding_system keyboard_coding;
 450
 451 /* Default coding system to be used to write a file.  */
 452 struct coding_system default_buffer_file_coding;
 453
 454 Lisp_Object Vfile_coding_system_alist;
 455 Lisp_Object Vprocess_coding_system_alist;
 456 Lisp_Object Vnetwork_coding_system_alist;
 457
 458 Lisp_Object Vlocale_coding_system;
 459
 460 #endif /* emacs */
 461
 462 Lisp_Object Qcoding_category, Qcoding_category_index;
 463
 464 /* List of symbols `coding-category-xxx' ordered by priority.  */
 465 Lisp_Object Vcoding_category_list;
 466
 467 /* Table of coding categories (Lisp symbols).  */
 468 Lisp_Object Vcoding_category_table;
 469
 470 /* Table of the symbol name of each coding category.  */
 471 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 472   "coding-category-emacs-mule",
 473   "coding-category-sjis",
 474   "coding-category-iso-7",
 475   "coding-category-iso-7-tight",
 476   "coding-category-iso-8-1",
 477   "coding-category-iso-8-2",
 478   "coding-category-iso-7-else",
 479   "coding-category-iso-8-else",
 480   "coding-category-ccl",
 481   "coding-category-big5",
 482   "coding-category-utf-8",
 483   "coding-category-utf-16-be",
 484   "coding-category-utf-16-le",
 485   "coding-category-raw-text",
 486   "coding-category-binary"
 487 };
 488
 489 /* Table of pointers to the coding system corresponding to each
 490    coding category.  */
 491 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 492
 493 /* Table of coding category masks.  The Nth element is the mask of the
 494    coding category whose priority is Nth.  */
 495 static
 496 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 497
 498 /* Flag telling whether translation tables are looked up on character
 499    code conversion.  */
 500 Lisp_Object Venable_character_translation;
 501 /* Standard translation table to look up on decoding (reading).  */
 502 Lisp_Object Vstandard_translation_table_for_decode;
 503 /* Standard translation table to look up on encoding (writing).  */
 504 Lisp_Object Vstandard_translation_table_for_encode;
 505
 506 Lisp_Object Qtranslation_table;
 507 Lisp_Object Qtranslation_table_id;
 508 Lisp_Object Qtranslation_table_for_decode;
 509 Lisp_Object Qtranslation_table_for_encode;
 510
 511 /* Alist of charsets vs revision number.  */
 512 Lisp_Object Vcharset_revision_alist;
 513
 514 /* Default coding systems used for process I/O.  */
 515 Lisp_Object Vdefault_process_coding_system;
 516
 517 /* Char table for translating Quail and self-inserting input.  */
 518 Lisp_Object Vtranslation_table_for_input;
 519
 520 /* Global flag to tell that we can't call post-read-conversion and
 521    pre-write-conversion functions.  Usually the value is zero, but it
 522    is set to 1 temporarily while such functions are running.  This is
 523    to avoid infinite recursion.  */
 524 static int inhibit_pre_post_conversion;
 525
 526 Lisp_Object Qchar_coding_system;
 527
 528 /* Return the `safe-chars' char table of CODING_SYSTEM (a symbol), or Qt
 529    if that property is not a char table.  Validity is not checked.  */
 530
 531 Lisp_Object
 532 coding_safe_chars (coding_system)
 533      Lisp_Object coding_system;
 534 {
 535   Lisp_Object coding_spec, plist, safe_chars;
 536
 537   coding_spec = Fget (coding_system, Qcoding_system);
 538   plist = XVECTOR (coding_spec)->contents[3];  /* slot 3 is the plist */
 539   safe_chars = Fplist_get (plist, Qsafe_chars);
 540   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 541 }
 542 /* Nonzero if SAFE_CHARS is Qt or its entry for character C is non-nil.  */
 543 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 544   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 545
 546 \f
 547 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 548
 549 /* Emacs' internal format for representation of multiple character
 550    sets is a kind of multi-byte encoding, i.e. characters are
 551    represented by variable-length sequences of one-byte codes.
 552
 553    ASCII characters and control characters (e.g. `tab', `newline') are
 554    represented by one-byte sequences which are their ASCII codes, in
 555    the range 0x00 through 0x7F.
 556
 557    8-bit characters of the range 0x80..0x9F are represented by
 558    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 559    code + 0x20).
 560
 561    8-bit characters of the range 0xA0..0xFF are represented by
 562    one-byte sequences which are their 8-bit code.
 563
 564    The other characters are represented by a sequence of `base
 565    leading-code', optional `extended leading-code', and one or two
 566    `position-code's.  The length of the sequence is determined by the
 567    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 568    whereas extended leading-code and position-code take the range 0xA0
 569    through 0xFF.  See `charset.h' for more details about leading-code
 570    and position-code.
 571
 572    --- CODE RANGE of Emacs' internal format ---
 573    character set        range
 574    -------------        -----
 575    ascii                0x00..0x7F
 576    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 577    eight-bit-graphic    0xA0..0xFF
 578    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 579    ---------------------------------------------
 580
 581    As this is the internal character representation, the format is
 582    usually not used externally (i.e. in a file or in data sent to a
 583    process).  But, it is possible to have a text externally in this
 584    format (i.e. by encoding by the coding system `emacs-mule').
 585
 586    In that case, a sequence of one-byte codes has a slightly different
 587    form.
 588
 589    Firstly, all characters in eight-bit-control are represented by
 590    one-byte sequences which are their 8-bit code.
 591
 592    Next, character composition data are represented by the byte
 593    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 594    where,
 595         METHOD is 0xF0 plus one of composition method (enum
 596         composition_method),
 597
 598         BYTES is 0xA0 plus the byte length of these composition data,
 599
 600         CHARS is 0xA0 plus the number of characters composed by these
 601         data,
 602
 603         COMPONENTs are characters in multibyte form or composition
 604         rules encoded as two bytes of ASCII codes.
 605
 606    In addition, for backward compatibility, the following formats are
 607    also recognized as composition data on decoding.
 608
 609    0x80 MSEQ ...
 610    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 611
 612    Here,
 613         MSEQ is a multibyte form but in this special format:
 614           ASCII: 0xA0 ASCII_CODE+0x80,
 615           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 616         RULE is a one byte code of the range 0xA0..0xF0 that
 617         represents a composition rule.
 618   */
 619
 620 enum emacs_code_class_type emacs_code_class[256];  /* presumably indexed by byte value -- confirm */
 621
 622 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 623    Check if the text from SRC to SRC_END is in Emacs' internal format.
 624    If it is, return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 625
 626 static int
 627 detect_coding_emacs_mule (src, src_end, multibytep)
 628       unsigned char *src, *src_end;
 629       int multibytep;
 630 {
 631   unsigned char c;
 632   int composing = 0;
 633   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE, which sets coding->result.  */
 634   struct coding_system dummy_coding;
 635   struct coding_system *coding = &dummy_coding;
 636
 637   while (1)  /* left only by `return', including the macro's own */
 638     {
 639       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
 640                                      CODING_CATEGORY_MASK_EMACS_MULE);
 641       if (composing)
 642         { /* Undo the shifted bytes of an old-style composition.  */
 643           if (c < 0xA0)
 644             composing = 0;  /* a byte below 0xA0 ends the composition */
 645           else if (c == 0xA0)
 646             { /* 0xA0 X yields X & 0x7F; a missing X returns 0.  */
 647               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
 648               c &= 0x7F;
 649             }
 650           else
 651             c -= 0x20;
 652         }
 653
 654       if (c < 0x20)
 655         { /* ESC, SI and SO make the text fail this check.  */
 656           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 657             return 0;
 658         }
 659       else if (c >= 0x80 && c < 0xA0)
 660         {
 661           if (c == 0x80)
 662             /* Old leading code for a composite character.  */
 663             composing = 1;
 664           else
 665             { /* The bytes from C must pass the check below; skip them.  */
 666               unsigned char *src_base = src - 1;
 667               int bytes;
 668
 669               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 670                                                bytes))
 671                 return 0;
 672               src = src_base + bytes;
 673             }
 674         }
 675     }
 676 }
 677
 678
 679 /* Record the starting position START and METHOD of one composition.  */
 680
 681 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 682   do {                                                          \
 683     struct composition_data *cmp_data = coding->cmp_data;       \
 684     int *data = cmp_data->data + cmp_data->used;                \
 685     coding->cmp_data_start = cmp_data->used;                    \
 686     data[0] = -1; /* overwritten by CODING_ADD_COMPOSITION_END */ \
 687     data[1] = cmp_data->char_offset + start;                    \
 688     data[3] = (int) method;                                     \
 689     cmp_data->used += 4; /* data[2] is left unset here */       \
 690   } while (0)
 691
 692 /* Record the ending position END of the current composition.  */
 693
 694 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 695   do { /* DATA is the entry begun at coding->cmp_data_start.  */ \
 696     struct composition_data *cmp_data = coding->cmp_data;       \
 697     int *data = cmp_data->data + coding->cmp_data_start;        \
 698     data[0] = cmp_data->used - coding->cmp_data_start;          \
 699     data[2] = cmp_data->char_offset + end;                      \
 700   } while (0)
 701
 702 /* Record one COMPONENT (alternate character or composition rule).  */
 703
 704 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 705   do {                                                                  \
 706     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 707     if (coding->cmp_data->used - coding->cmp_data_start                 \
 708         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 709       { /* Entry reached the maximum length: end it here.  */           \
 710         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 711         coding->composing = COMPOSITION_NO;                             \
 712       }                                                                 \
 713   } while (0)
 714
 715
 716 /* Get one byte from the data pointed to by SRC and increment SRC.  If
 717    SRC is not less than SRC_END, return -1 without incrementing SRC.  */
 718
 719 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 720
 721
 722 /* Decode a character represented as a component of composition
 723    sequence of Emacs 20 style at SRC.  Set C to that character, store
 724    its multibyte form sequence at P, and set P to the end of that
 725    sequence.  If no valid character is found, set C to -1.  Bytes are
 726    fetched with SAFE_ONE_MORE_BYTE, which yields -1 at SRC_END.  */
 727 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 728   do {                                                          \
 729     int bytes;                                                  \
 730                                                                 \
 731     c = SAFE_ONE_MORE_BYTE ();                                  \
 732     if (c < 0)                  /* No byte left at SRC.  */     \
 733       break;                                                    \
 734     if (CHAR_HEAD_P (c))                                        \
 735       c = -1;                                                   \
 736     else if (c == 0xA0)  /* 0xA0 introduces one shifted byte.  */ \
 737       {                                                         \
 738         c = SAFE_ONE_MORE_BYTE ();                              \
 739         if (c < 0xA0)     /* Also true for -1 (end of source).  */ \
 740           c = -1;                                               \
 741         else                                                    \
 742           {                                                     \
 743             c -= 0x80;    /* The byte is stored minus 0x80.  */ \
 744             *p++ = c;                                           \
 745           }                                                     \
 746       }                                                         \
 747     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 748       {                                                         \
 749         unsigned char *p0 = p;  /* Start of this character.  */ \
 750                                                                 \
 751         c -= 0x20;        /* Recover the leading code.  */      \
 752         *p++ = c;                                               \
 753         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 754         while (--bytes)   /* Copy the BYTES - 1 trailing bytes.  */ \
 755           {                                                     \
 756             c = SAFE_ONE_MORE_BYTE ();                          \
 757             if (c < 0)                                          \
 758               break;                                            \
 759             *p++ = c;                                           \
 760           }                                                     \
 761         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 762             || (coding->flags /* We are recovering a file.  */  \
 763                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 764                 && ! CHAR_HEAD_P (p0[1])))                      \
 765           c = STRING_CHAR (p0, bytes);                          \
 766         else                                                    \
 767           c = -1;                                               \
 768       }                                                         \
 769     else                                                        \
 770       c = -1;                                                   \
 771   } while (0)
 772
 773
 774 /* Read one byte at SRC as an Emacs 20 style composition rule.  The
 775    byte minus 0xA0 must lie in 0..80 and is split into GREF (/ 9) and
 776    NREF (% 9).  Set C to the rule, or to -1 if no valid rule is found.  */
 777
 778 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 779   do {                                                  \
 780     c = SAFE_ONE_MORE_BYTE () - 0xA0;                   \
 781     if (c >= 0 && c < 81)                               \
 782       {                                                 \
 783         gref = c / 9;                                   \
 784         nref = c % 9;                                   \
 785         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 786       }                                                 \
 787     else                                                \
 788       c = -1;                                           \
 789   } while (0)
 790
 791
 792 /* Decode composition sequence encoded by `emacs-mule' at the source
 793    pointed by SRC.  SRC_END is the end of source.  Store information
 794    of the composition in CODING->cmp_data.
 795
 796    For backward compatibility, decode also a composition sequence of
 797    Emacs 20 style.  In that case, the composition sequence contains
 798    characters that should be extracted into a buffer or string.  Store
 799    those characters at *DESTINATION in multibyte form.
 800
 801    If we encounter an invalid byte sequence, return 0.
 802    If we encounter an insufficient source or destination, or
 803    insufficient space in CODING->cmp_data, return -1.
 804    Otherwise, return consumed bytes in the source.
 805    When CODING->cmp_data lacks space, CODING->result is set to
 806    CODING_FINISH_INSUFFICIENT_CMP as well.  */
 807 static INLINE int
 808 decode_composition_emacs_mule (coding, src, src_end,
 809                                destination, dst_end, dst_bytes)
 810      struct coding_system *coding;
 811      const unsigned char *src, *src_end;
 812      unsigned char **destination, *dst_end;
 813      int dst_bytes;
 814 {
 815   unsigned char *dst = *destination;
 816   int method, data_len, nchars;
 817   const unsigned char *src_base = src++;  /* Skip the leading byte.  */
 818   /* Store components of composition.  */
 819   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 820   int ncomponent;
 821   /* Store multibyte form of characters to be composed.  This is for
 822      Emacs 20 style composition sequence.  */
 823   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 824   unsigned char *bufp = buf;
 825   int c, i, gref, nref;
 826
 827   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 828       >= COMPOSITION_DATA_SIZE)
 829     {
 830       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 831       return -1;
 832     }
 833
 834   ONE_MORE_BYTE (c);
 835   if (c - 0xF0 >= COMPOSITION_RELATIVE
 836            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 837     {  /* Current format: 0x80 METHOD LENGTH NCHARS COMPONENT...  */
 838       int with_rule;
 839
 840       method = c - 0xF0;
 841       with_rule = (method == COMPOSITION_WITH_RULE
 842                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 843       ONE_MORE_BYTE (c);
 844       data_len = c - 0xA0;
 845       if (data_len < 4  /* LENGTH counts the four header bytes too.  */
 846           || src_base + data_len > src_end)
 847         return 0;
 848       ONE_MORE_BYTE (c);
 849       nchars = c - 0xA0;
 850       if (nchars < 1)  /* At least one character must be composed.  */
 851         return 0;
 852       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 853         {
 854           /* If it is longer than this, it can't be valid.  */
 855           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 856             return 0;
 857
 858           if (ncomponent % 2 && with_rule)  /* Odd slots hold rules.  */
 859             {
 860               ONE_MORE_BYTE (gref);
 861               gref -= 32;
 862               ONE_MORE_BYTE (nref);
 863               nref -= 32;
 864               c = COMPOSITION_ENCODE_RULE (gref, nref);
 865             }
 866           else
 867             {
 868               int bytes;
 869               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 870                   || (coding->flags /* We are recovering a file.  */
 871                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 872                       && ! CHAR_HEAD_P (src[1])))
 873                 c = STRING_CHAR (src, bytes);
 874               else
 875                 c = *src, bytes = 1;  /* Take the raw byte as is.  */
 876               src += bytes;
 877             }
 878           component[ncomponent] = c;
 879         }
 880     }
 881   else if (c >= 0x80)
 882     {
 883       /* This may be an old Emacs 20 style format.  See the comment at
 884          the section 2 of this file.  */
 885       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 886       if (src == src_end
 887           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 888         goto label_end_of_loop;
 889
 890       src_end = src;  /* Confine decoding to the bytes just scanned.  */
 891       src = src_base + 1;
 892       if (c < 0xC0)
 893         {
 894           method = COMPOSITION_RELATIVE;
 895           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 896             {
 897               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 898               if (c < 0)
 899                 break;
 900               component[ncomponent++] = c;
 901             }
 902           if (ncomponent < 2)
 903             return 0;
 904           nchars = ncomponent;
 905         }
 906       else if (c == 0xFF)
 907         {
 908           method = COMPOSITION_WITH_RULE;
 909           src++;  /* Step over the 0xFF marker.  */
 910           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 911           if (c < 0)
 912             return 0;
 913           component[0] = c;
 914           for (ncomponent = 1;
 915                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 916             {
 917               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 918               if (c < 0)
 919                 break;
 920               component[ncomponent++] = c;
 921               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 922               if (c < 0)
 923                 break;
 924               component[ncomponent++] = c;
 925             }
 926           if (ncomponent < 3)
 927             return 0;
 928           nchars = (ncomponent + 1) / 2;  /* Characters only, no rules.  */
 929         }
 930       else
 931         return 0;
 932     }
 933   else
 934     return 0;
 935
 936   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 937     {  /* Nothing to emit, or the extracted characters fit.  */
 938       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 939       for (i = 0; i < ncomponent; i++)
 940         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 941       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 942       if (buf < bufp)
 943         {
 944           unsigned char *p = buf;
 945           EMIT_BYTES (p, bufp);
 946           *destination += bufp - buf;
 947           coding->produced_char += nchars;
 948         }
 949       return (src - src_base);
 950     }
 951  label_end_of_loop:  /* NOTE(review): presumably also the target of ONE_MORE_BYTE on short source -- confirm.  */
 952   return -1;
 953 }
 954
 955 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
 956    Decodes emacs-mule bytes, converting EOLs and composition data.  */
 957 static void
 958 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 959      struct coding_system *coding;
 960      const unsigned char *source;
 961      unsigned char *destination;
 962      int src_bytes, dst_bytes;
 963 {
 964   const unsigned char *src = source;
 965   const unsigned char *src_end = source + src_bytes;
 966   unsigned char *dst = destination;
 967   unsigned char *dst_end = destination + dst_bytes;
 968   /* SRC_BASE remembers the start position in source in each loop.
 969      The loop will be exited when there's not enough source code, or
 970      when there's not enough destination area to produce a
 971      character.  */
 972   const unsigned char *src_base;
 973
 974   coding->produced_char = 0;
 975   while ((src_base = src) < src_end)
 976     {
 977       unsigned char tmp[MAX_MULTIBYTE_LENGTH];
 978       const unsigned char *p;
 979       int bytes;
 980
 981       if (*src == '\r')
 982         {
 983           int c = *src++;
 984
 985           if (coding->eol_type == CODING_EOL_CR)  /* CR alone ends a line.  */
 986             c = '\n';
 987           else if (coding->eol_type == CODING_EOL_CRLF)
 988             {
 989               ONE_MORE_BYTE (c);
 990               if (c != '\n')  /* CR not followed by LF stays a CR.  */
 991                 {
 992                   src--;
 993                   c = '\r';
 994                 }
 995             }
 996           *dst++ = c;
 997           coding->produced_char++;
 998           continue;
 999         }
1000       else if (*src == '\n')
1001         {
1002           if ((coding->eol_type == CODING_EOL_CR
1003                || coding->eol_type == CODING_EOL_CRLF)
1004               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1005             {
1006               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1007               goto label_end_of_loop;
1008             }
1009           *dst++ = *src++;
1010           coding->produced_char++;
1011           continue;
1012         }
1013       else if (*src == 0x80 && coding->cmp_data)
1014         {
1015           /* Start of composition data.  */
1016           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1017                                                          &dst, dst_end,
1018                                                          dst_bytes);
1019           if (consumed < 0)  /* Not enough source, destination or cmp_data.  */
1020             goto label_end_of_loop;
1021           else if (consumed > 0)
1022             {
1023               src += consumed;
1024               continue;
1025             }
1026           bytes = CHAR_STRING (*src, tmp);  /* Invalid sequence: emit the 0x80 byte by itself.  */
1027           p = tmp;
1028           src++;
1029         }
1030       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1031                || (coding->flags /* We are recovering a file.  */
1032                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1033                    && ! CHAR_HEAD_P (src[1])))
1034         {
1035           p = src;  /* Copy the BYTES source bytes unchanged.  */
1036           src += bytes;
1037         }
1038       else
1039         {
1040           int i, c;
1041
1042           bytes = BYTES_BY_CHAR_HEAD (*src);
1043           src++;
1044           for (i = 1; i < bytes; i++)
1045             {
1046               ONE_MORE_BYTE (c);
1047               if (CHAR_HEAD_P (c))
1048                 break;
1049             }
1050           if (i < bytes)  /* Sequence cut short by a character head.  */
1051             {
1052               bytes = CHAR_STRING (*src_base, tmp);
1053               p = tmp;
1054               src = src_base + 1;  /* Consume only the first byte.  */
1055             }
1056           else
1057             {
1058               p = src_base;
1059             }
1060         }
1061       if (dst + bytes >= (dst_bytes ? dst_end : src))
1062         {
1063           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1064           break;
1065         }
1066       while (bytes--) *dst++ = *p++;
1067       coding->produced_char++;
1068     }
1069  label_end_of_loop:
1070   coding->consumed = coding->consumed_char = src_base - source;  /* Up to the last loop start.  */
1071   coding->produced = dst - destination;
1072 }
1073
1074
1075 /* Encode composition data stored at DATA into a special byte sequence
1076    starting by 0x80.  Update CODING->cmp_data_start and maybe
1077    CODING->cmp_data for the next call.  The sequence is written at DST;
1078    if it does not fit, set CODING->result and goto label_end_of_loop.  */
1079 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1080   do {                                                                  \
1081     unsigned char buf[1024], *p0 = buf, *p;                             \
1082     int len = data[0];                                                  \
1083     int i;                                                              \
1084                                                                         \
1085     buf[0] = 0x80;                                                      \
1086     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1087     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1088     p = buf + 4;                /* Components follow the header.  */    \
1089     if (data[3] == COMPOSITION_WITH_RULE                                \
1090         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1091       {                                                                 \
1092         p += CHAR_STRING (data[4], p);  /* First character.  */         \
1093         for (i = 5; i < len; i += 2)    /* Then (rule, char) pairs.  */ \
1094           {                                                             \
1095             int gref, nref;                                             \
1096              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1097             *p++ = 0x20 + gref;                                         \
1098             *p++ = 0x20 + nref;                                         \
1099             p += CHAR_STRING (data[i + 1], p);                          \
1100           }                                                             \
1101       }                                                                 \
1102     else                                                                \
1103       {                                                                 \
1104         for (i = 4; i < len; i++)       /* Characters only.  */         \
1105           p += CHAR_STRING (data[i], p);                                \
1106       }                                                                 \
1107     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1108                                                                         \
1109     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1110       {                                                                 \
1111         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1112         goto label_end_of_loop;                                         \
1113       }                                                                 \
1114     while (p0 < p)              /* Copy BUF to the destination.  */     \
1115       *dst++ = *p0++;                                                   \
1116     coding->cmp_data_start += data[0];  /* Step past this entry.  */    \
1117     if (coding->cmp_data_start == coding->cmp_data->used                \
1118         && coding->cmp_data->next)                                      \
1119       {                                                                 \
1120         coding->cmp_data = coding->cmp_data->next;                      \
1121         coding->cmp_data_start = 0;                                     \
1122       }                                                                 \
1123   } while (0)
1124
1125
1126 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1127                             unsigned char *, int, int));
1128 /* Encode SRC_BYTES bytes at SOURCE as emacs-mule, inserting composition data.  */
1129 static void
1130 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1131      struct coding_system *coding;
1132      const unsigned char *source;
1133      unsigned char *destination;
1134      int src_bytes, dst_bytes;
1135 {
1136   const unsigned char *src = source;
1137   const unsigned char *src_end = source + src_bytes;
1138   unsigned char *dst = destination;
1139   unsigned char *dst_end = destination + dst_bytes;
1140   const unsigned char *src_base;
1141   int c;
1142   int char_offset;
1143   int *data;
1144
1145   Lisp_Object translation_table;  /* NOTE(review): unused directly here; presumably read by ONE_MORE_CHAR -- confirm.  */
1146
1147   translation_table = Qnil;
1148
1149   /* Optimization for the case that there's no composition.  */
1150   if (!coding->cmp_data || coding->cmp_data->used == 0)
1151     {
1152       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1153       return;
1154     }
1155
1156   char_offset = coding->cmp_data->char_offset;
1157   data = coding->cmp_data->data + coding->cmp_data_start;
1158   while (1)  /* NOTE(review): no exit here; presumably the macros jump to label_end_of_loop -- confirm.  */
1159     {
1160       src_base = src;
1161
1162       /* If SRC starts a composition, encode the information about the
1163          composition in advance.  */
1164       if (coding->cmp_data_start < coding->cmp_data->used
1165           && char_offset + coding->consumed_char == data[1])
1166         {
1167           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1168           char_offset = coding->cmp_data->char_offset;  /* The macro may have switched cmp_data.  */
1169           data = coding->cmp_data->data + coding->cmp_data_start;
1170         }
1171
1172       ONE_MORE_CHAR (c);
1173       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1174                         || coding->eol_type == CODING_EOL_CR))
1175         {
1176           if (coding->eol_type == CODING_EOL_CRLF)
1177             EMIT_TWO_BYTES ('\r', c);
1178           else
1179             EMIT_ONE_BYTE ('\r');  /* CODING_EOL_CR: newline becomes CR.  */
1180         }
1181       else if (SINGLE_BYTE_CHAR_P (c))
1182         {
1183           if (coding->flags && ! ASCII_BYTE_P (c))
1184             {
1185               /* As we are auto saving, retain the multibyte form for
1186                  8-bit chars.  */
1187               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1188               int bytes = CHAR_STRING (c, buf);
1189
1190               if (bytes == 1)
1191                 EMIT_ONE_BYTE (buf[0]);
1192               else
1193                 EMIT_TWO_BYTES (buf[0], buf[1]);
1194             }
1195           else
1196             EMIT_ONE_BYTE (c);
1197         }
1198       else
1199         EMIT_BYTES (src_base, src);  /* Copy the multibyte form as is.  */
1200       coding->consumed_char++;
1201     }
1202  label_end_of_loop:
1203   coding->consumed = src_base - source;  /* Up to the start of the last loop.  */
1204   coding->produced = coding->produced_char = dst - destination;
1205   return;
1206 }
1207
1208 \f
1209 /*** 3. ISO2022 handlers ***/
1210
1211 /* The following note describes the coding system ISO2022 briefly.
1212    Since the intention of this note is to help understand the
1213    functions in this file, some parts are NOT ACCURATE or are OVERLY
1214    SIMPLIFIED.  For thorough understanding, please refer to the
1215    original document of ISO2022.  This is equivalent to the standard
1216    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1217
1218    ISO2022 provides many mechanisms to encode several character sets
1219    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1220    is encoded using bytes less than 128.  This may make the encoded
1221    text a little bit longer, but the text passes more easily through
1222    several types of gateway, some of which strip off the MSB (Most
1223    Significant Bit).
1224
1225    There are two kinds of character sets: control character sets and
1226    graphic character sets.  The former contain control characters such
1227    as `newline' and `escape' to provide control functions (control
1228    functions are also provided by escape sequences).  The latter
1229    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1230    two control character sets and many graphic character sets.
1231
1232    Graphic character sets are classified into one of the following
1233    four classes, according to the number of bytes (DIMENSION) and
1234    number of characters in one dimension (CHARS) of the set:
1235    - DIMENSION1_CHARS94
1236    - DIMENSION1_CHARS96
1237    - DIMENSION2_CHARS94
1238    - DIMENSION2_CHARS96
1239
1240    In addition, each character set is assigned an identification tag,
1241    unique for each set, called the "final character" (denoted as <F>
1242    hereafter).  The <F> of each character set is decided by ECMA(*)
1243    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1244    (0x30..0x3F are for private use only).
1245
1246    Note (*): ECMA = European Computer Manufacturers Association
1247
1248    Here are examples of graphic character sets [NAME(<F>)]:
1249         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1250         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1251         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1252         o DIMENSION2_CHARS96 -- none for the moment
1253
1254    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1255         C0 [0x00..0x1F] -- control character plane 0
1256         GL [0x20..0x7F] -- graphic character plane 0
1257         C1 [0x80..0x9F] -- control character plane 1
1258         GR [0xA0..0xFF] -- graphic character plane 1
1259
1260    A control character set is directly designated and invoked to C0 or
1261    C1 by an escape sequence.  The most common case is that:
1262    - ISO646's  control character set is designated/invoked to C0, and
1263    - ISO6429's control character set is designated/invoked to C1,
1264    and usually these designations/invocations are omitted in encoded
1265    text.  In a 7-bit environment, only C0 can be used, and a control
1266    character for C1 is encoded by an appropriate escape sequence to
1267    fit into the environment.  All control characters for C1 are
1268    defined to have corresponding escape sequences.
1269
1270    A graphic character set is at first designated to one of four
1271    graphic registers (G0 through G3), then these graphic registers are
1272    invoked to GL or GR.  These designations and invocations can be
1273    done independently.  The most common case is that G0 is invoked to
1274    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1275    these invocations and designations are omitted in encoded text.
1276    In a 7-bit environment, only GL can be used.
1277
1278    When a graphic character set of CHARS94 is invoked to GL, codes
1279    0x20 and 0x7F of the GL area work as control characters SPACE and
1280    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1281    be used.
1282
1283    There are two ways of invocation: locking-shift and single-shift.
1284    With locking-shift, the invocation lasts until the next different
1285    invocation, whereas with single-shift, the invocation affects the
1286    following character only and doesn't affect the locking-shift
1287    state.  Invocations are done by the following control characters or
1288    escape sequences:
1289
1290    ----------------------------------------------------------------------
1291    abbrev  function                  cntrl escape seq   description
1292    ----------------------------------------------------------------------
1293    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1294    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1295    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1296    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1297    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1298    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1299    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1300    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1301    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1302    ----------------------------------------------------------------------
1303    (*) These are not used by any known coding system.
1304
1305    Control characters for these functions are defined by macros
1306    ISO_CODE_XXX in `coding.h'.
1307
1308    Designations are done by the following escape sequences:
1309    ----------------------------------------------------------------------
1310    escape sequence      description
1311    ----------------------------------------------------------------------
1312    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1313    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1314    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1315    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1316    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1317    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1318    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1319    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1320    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1321    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1322    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1323    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1324    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1325    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1326    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1327    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1328    ----------------------------------------------------------------------
1329
1330    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1331    of dimension 1, chars 94, and final character <F>, etc...
1332
1333    Note (*): Although these designations are not allowed in ISO2022,
1334    Emacs accepts them on decoding, and produces them on encoding
1335    CHARS96 character sets in a coding system which is characterized as
1336    7-bit environment, non-locking-shift, and non-single-shift.
1337
1338    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1339    '(' can be omitted.  We refer to this as "short-form" hereafter.
1340
1341    Now you may notice that there are a lot of ways of encoding the
1342    same multilingual text in ISO2022.  Actually, there exist many
1343    coding systems such as Compound Text (used in X11's inter-client
1344    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1345    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1346    localized platforms), and all of these are variants of ISO2022.
1347
1348    In addition to the above, Emacs handles two more kinds of escape
1349    sequences: ISO6429's direction specification and Emacs' private
1350    sequence for specifying character composition.
1351
1352    ISO6429's direction specification takes the following form:
1353         o CSI ']'      -- end of the current direction
1354         o CSI '0' ']'  -- end of the current direction
1355         o CSI '1' ']'  -- start of left-to-right text
1356         o CSI '2' ']'  -- start of right-to-left text
1357    The control character CSI (0x9B: control sequence introducer) is
1358    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1359
1360    Character composition specification takes the following form:
1361         o ESC '0' -- start relative composition
1362         o ESC '1' -- end composition
1363         o ESC '2' -- start rule-base composition (*)
1364         o ESC '3' -- start relative composition with alternate chars  (**)
1365         o ESC '4' -- start rule-base composition with alternate chars  (**)
1366   Since these are not standard escape sequences of any ISO standard,
1367   the use of them with these meanings is restricted to Emacs only.
1368
1369   (*) This form is used only in Emacs 20.5 and older versions,
1370   but the newer versions can safely decode it.
1371   (**) This form is used only in Emacs 21.1 and newer versions,
1372   and the older versions can't decode it.
1373
1374   Here's a list of example usages of these composition escape
1375   sequences (categorized by `enum composition_method').
1376
1377   COMPOSITION_RELATIVE:
1378         ESC 0 CHAR [ CHAR ] ESC 1
1379   COMPOSITION_WITH_RULE:
1380         ESC 2 CHAR [ RULE CHAR ] ESC 1
1381   COMPOSITION_WITH_ALTCHARS:
1382         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1383   COMPOSITION_WITH_RULE_ALTCHARS:
1384         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1385
1386 enum iso_code_class_type iso_code_class[256];  /* NOTE(review): 256 entries, presumably one class per byte value -- confirm.  */
1387 /* Nonzero if system IDX exists, C is safe in it (ASCII always is), and CHARSET has a requested designation.  */
1388 #define CHARSET_OK(idx, charset, c)                                     \
1389   (! coding_system_table[idx] ? 0                                       \
1390    : (! (charset == CHARSET_ASCII)                                      \
1391       && (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1392           ! (CODING_SAFE_CHAR_P (safe_chars, c)))) ? 0                  \
1393    : (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx],  \
1394                                              charset)                   \
1395       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1396 /* Nonzero if the initial designation of register 1 in system IDX is not negative.  */
1397 #define SHIFT_OUT_OK(idx) \
1398   (0 <= CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1))
1399 /* Nonzero unless composition is disabled in coding system IDX.  */
1400 #define COMPOSITION_OK(idx)     \
1401   (COMPOSITION_DISABLED != coding_system_table[idx]->composing)
1402
1403 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1404    Check if a text is encoded in ISO2022.  If it is, return an
1405    integer in which appropriate flag bits any of:
1406         CODING_CATEGORY_MASK_ISO_7
1407         CODING_CATEGORY_MASK_ISO_7_TIGHT
1408         CODING_CATEGORY_MASK_ISO_8_1
1409         CODING_CATEGORY_MASK_ISO_8_2
1410         CODING_CATEGORY_MASK_ISO_7_ELSE
1411         CODING_CATEGORY_MASK_ISO_8_ELSE
1412    are set.  If a code which should never appear in ISO2022 is found,
1413    returns 0.  */
1414
1415 static int
1416 detect_coding_iso2022 (src, src_end, multibytep)
1417      unsigned char *src, *src_end;
1418      int multibytep;
1419 {
1420   int mask = CODING_CATEGORY_MASK_ISO;
1421   int mask_found = 0;
1422   int reg[4], shift_out = 0, single_shifting = 0;
1423   int c, c1, charset;
1424   /* Dummy for ONE_MORE_BYTE.  */
1425   struct coding_system dummy_coding;
1426   struct coding_system *coding = &dummy_coding;
1427   Lisp_Object safe_chars;
1428
1429   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1430   while (mask)
1431     {
1432       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1433     retry:
1434       switch (c)
1435         {
1436         case ISO_CODE_ESC:
1437           if (inhibit_iso_escape_detection)
1438             break;
1439           single_shifting = 0;
1440           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1441           if (c >= '(' && c <= '/')
1442             {
1443               /* Designation sequence for a charset of dimension 1.  */
1444               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
1445               if (c1 < ' ' || c1 >= 0x80
1446                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1447                 /* Invalid designation sequence.  Just ignore.  */
1448                 break;
1449               reg[(c - '(') % 4] = charset;
1450             }
1451           else if (c == '$')
1452             {
1453               /* Designation sequence for a charset of dimension 2.  */
1454               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1455               if (c >= '@' && c <= 'B')
1456                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1457                 reg[0] = charset = iso_charset_table[1][0][c];
1458               else if (c >= '(' && c <= '/')
1459                 {
1460                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1461                                                  mask & mask_found);
1462                   if (c1 < ' ' || c1 >= 0x80
1463                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1464                     /* Invalid designation sequence.  Just ignore.  */
1465                     break;
1466                   reg[(c - '(') % 4] = charset;
1467                 }
1468               else
1469                 /* Invalid designation sequence.  Just ignore.  */
1470                 break;
1471             }
1472           else if (c == 'N' || c == 'O')
1473             {
1474               /* ESC <Fe> for SS2 or SS3.  */
1475               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1476               break;
1477             }
1478           else if (c >= '0' && c <= '4')
1479             {
1480               /* ESC <Fp> for start/end composition.  */
1481               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1482                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1483               else
1484                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1485               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1486                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1487               else
1488                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1489               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1490                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1491               else
1492                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1493               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1494                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1495               else
1496                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1497               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1498                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1499               else
1500                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1501               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1502                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1503               else
1504                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1505               break;
1506             }
1507           else
1508             /* Invalid escape sequence.  Just ignore.  */
1509             break;
1510
1511           /* We found a valid designation sequence for CHARSET.  */
1512           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1513           c = MAKE_CHAR (charset, 0, 0);
1514           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1515             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1516           else
1517             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1518           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1519             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1520           else
1521             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1522           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1523             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1524           else
1525             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1526           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1527             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1528           else
1529             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1530           break;
1531
1532         case ISO_CODE_SO:
1533           if (inhibit_iso_escape_detection)
1534             break;
1535           single_shifting = 0;
1536           if (shift_out == 0
1537               && (reg[1] >= 0
1538                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1539                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1540             {
1541               /* Locking shift out.  */
1542               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1543               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1544             }
1545           break;
1546
1547         case ISO_CODE_SI:
1548           if (inhibit_iso_escape_detection)
1549             break;
1550           single_shifting = 0;
1551           if (shift_out == 1)
1552             {
1553               /* Locking shift in.  */
1554               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1555               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1556             }
1557           break;
1558
1559         case ISO_CODE_CSI:
1560           single_shifting = 0;
1561         case ISO_CODE_SS2:
1562         case ISO_CODE_SS3:
1563           {
1564             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1565
1566             if (inhibit_iso_escape_detection)
1567               break;
1568             if (c != ISO_CODE_CSI)
1569               {
1570                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1571                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1572                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1573                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1574                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1575                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1576                 single_shifting = 1;
1577               }
1578             if (VECTORP (Vlatin_extra_code_table)
1579                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1580               {
1581                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1582                     & CODING_FLAG_ISO_LATIN_EXTRA)
1583                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1584                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1585                     & CODING_FLAG_ISO_LATIN_EXTRA)
1586                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1587               }
1588             mask &= newmask;
1589             mask_found |= newmask;
1590           }
1591           break;
1592
1593         default:
1594           if (c < 0x80)
1595             {
1596               single_shifting = 0;
1597               break;
1598             }
1599           else if (c < 0xA0)
1600             {
1601               single_shifting = 0;
1602               if (VECTORP (Vlatin_extra_code_table)
1603                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1604                 {
1605                   int newmask = 0;
1606
1607                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1608                       & CODING_FLAG_ISO_LATIN_EXTRA)
1609                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1610                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1611                       & CODING_FLAG_ISO_LATIN_EXTRA)
1612                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1613                   mask &= newmask;
1614                   mask_found |= newmask;
1615                 }
1616               else
1617                 return 0;
1618             }
1619           else
1620             {
1621               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1622                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1623               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1624               /* Check the length of succeeding codes of the range
1625                  0xA0..0FF.  If the byte length is odd, we exclude
1626                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1627                  when we are not single shifting.  */
1628               if (!single_shifting
1629                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1630                 {
1631                   int i = 1;
1632
1633                   c = -1;
1634                   while (src < src_end)
1635                     {
1636                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1637                                                      mask & mask_found);
1638                       if (c < 0xA0)
1639                         break;
1640                       i++;
1641                     }
1642
1643                   if (i & 1 && src < src_end)
1644                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1645                   else
1646                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1647                   if (c >= 0)
1648                     /* This means that we have read one extra byte.  */
1649                     goto retry;
1650                 }
1651             }
1652           break;
1653         }
1654     }
1655   return (mask & mask_found);
1656 }
1657
1658 /* Decode a character of which charset is CHARSET, the 1st position
1659    code is C1, the 2nd position code is C2, and return the decoded
1660    character code.  If the variable `translation_table' is nil, the
1661    result is MAKE_CHAR (CHARSET, C1, C2); otherwise return the code
        obtained by passing CHARSET, C1 and C2 through translate_char
        with that table.  The variable `translation_table' must be in
        scope at the point of use.  */
1662
1663 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1664   (NILP (translation_table)                     \
1665    ? MAKE_CHAR (charset, c1, c2)                \
1666    : translate_char (translation_table, -1, charset, c1, c2))
1667
1668 /* Set designation state into CODING.
        Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR
        through ISO_CHARSET_TABLE and designate it to graphic register REG
        of CODING.

        The macro jumps to `label_invalid_code' in these cases:
          - FINAL_CHAR is below '0' or not below 128;
          - the lookup yields a negative charset, or the charset is
            neither the one whose requested designation is REG nor one
            whose representative character (MAKE_CHAR (charset, 0, 0))
            is in `safe_chars'; REG is then recorded in
            last_invalid_designation_register;
          - ASCII is designated to register 0 while
            last_invalid_designation_register is 0; that member is then
            reset to -1 (see the comment inside the macro).

        Otherwise last_invalid_designation_register is reset to -1 and
        the charset is stored as the designation of REG.  When
        CODING_MODE_DIRECTION is set in CODING's mode and the charset has
        a reverse charset, the reverse charset is stored instead.

        The variables `coding' and `safe_chars' and the label
        `label_invalid_code' must be in scope at the point of use.  REG
        and FINAL_CHAR are evaluated more than once.  */
1669 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1670   do {                                                                     \
1671     int charset, c;                                                        \
1672                                                                            \
1673     if (final_char < '0' || final_char >= 128)                             \
1674       goto label_invalid_code;                                             \
1675     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1676                                  make_number (chars),                      \
1677                                  make_number (final_char));                \
1678     c = MAKE_CHAR (charset, 0, 0);                                         \
1679     if (charset >= 0                                                       \
1680         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1681             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1682       {                                                                    \
1683         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1684             && reg == 0                                                    \
1685             && charset == CHARSET_ASCII)                                   \
1686           {                                                                \
1687             /* We should insert this designation sequence as is so         \
1688                that it is surely written back to a file.  */               \
1689             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1690             goto label_invalid_code;                                       \
1691           }                                                                \
1692         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1693         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1694             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1695           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1696         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1697       }                                                                    \
1698     else                                                                   \
1699       {                                                                    \
1700         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1701         goto label_invalid_code;                                           \
1702       }                                                                    \
1703   } while (0)
1704
1705 /* Allocate one more block for recording composition information and
1706    append it to the chain of blocks that CODING already owns.
        CHAR_OFFSET is stored in the new block, which starts out unused
        and becomes CODING's current block.  CODING's cmp_data_start is
        reset to 0 and its composing state to COMPOSITION_NO.  */
1707
1708 void
1709 coding_allocate_composition_data (coding, char_offset)
1710      struct coding_system *coding;
1711      int char_offset;
1712 {
1713   struct composition_data *block;
1714
1715   block = (struct composition_data *) xmalloc (sizeof *block);
1716   block->prev = coding->cmp_data;
1717   block->next = NULL;
1718   block->used = 0;
1719   block->char_offset = char_offset;
       /* Link the previous tail, if any, forward to the new block.  */
1720   if (block->prev != NULL)
1721     block->prev->next = block;
1722   coding->cmp_data = block;
1723   coding->composing = COMPOSITION_NO;
1724   coding->cmp_data_start = 0;
1725 }
1726
1727 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1728    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1729    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1730    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1731    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

        C1 is the byte that followed ESC.  Three cases are distinguished:
          - composition is disabled for `coding': ESC and C1 (masked
            with 0x7f) are copied to `dst' and produced_char is advanced
            by 2;
          - no composition is in progress: unless coding->cmp_data has
            room for one more bunch, the result is set to
            CODING_FINISH_INSUFFICIENT_CMP and control jumps to
            `label_end_of_loop'; otherwise the composition method is
            chosen from C1, the start is recorded, and
            composition_rule_follows is cleared;
          - a composition is already in progress: if its method is one of
            the two ALTCHARS methods, it is switched to
            COMPOSITION_RELATIVE and composition_rule_follows is cleared;
            any other method is left alone.

        The variables `coding' and `dst' and the label
        `label_end_of_loop' must be in scope at the point of use.
1732   */
1733
1734 #define DECODE_COMPOSITION_START(c1)                                       \
1735   do {                                                                     \
1736     if (coding->composing == COMPOSITION_DISABLED)                         \
1737       {                                                                    \
1738         *dst++ = ISO_CODE_ESC;                                             \
1739         *dst++ = c1 & 0x7f;                                                \
1740         coding->produced_char += 2;                                        \
1741       }                                                                    \
1742     else if (!COMPOSING_P (coding))                                        \
1743       {                                                                    \
1744         /* This is surely the start of a composition.  We must be sure     \
1745            that coding->cmp_data has enough space to store the             \
1746            information about the composition.  If not, terminate the       \
1747            current decoding loop, allocate one more memory block for       \
1748            coding->cmp_data in the caller, then start the decoding         \
1749            loop again.  We can't allocate memory here directly because     \
1750            it may cause buffer/string relocation.  */                      \
1751         if (!coding->cmp_data                                              \
1752             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1753                 >= COMPOSITION_DATA_SIZE))                                 \
1754           {                                                                \
1755             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1756             goto label_end_of_loop;                                        \
1757           }                                                                \
1758         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1759                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1760                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1761                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1762         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1763                                       coding->composing);                  \
1764         coding->composition_rule_follows = 0;                              \
1765       }                                                                    \
1766     else                                                                   \
1767       {                                                                    \
1768         /* We are already handling a composition.  If the method is        \
1769            the following two, the codes following the current escape       \
1770            sequence are actual characters stored in a buffer.  */          \
1771         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1772             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1773           {                                                                \
1774             coding->composing = COMPOSITION_RELATIVE;                      \
1775             coding->composition_rule_follows = 0;                          \
1776           }                                                                \
1777       }                                                                    \
1778   } while (0)
1779
1780 /* Handle composition end sequence ESC 1.
        C1 is the byte that followed ESC.  If no composition is in
        progress, ESC and C1 are copied to `dst' and produced_char is
        advanced by 2.  Otherwise the end of the composition is recorded
        at the current produced_char and the composing state returns to
        COMPOSITION_NO.  The variables `coding' and `dst' must be in
        scope at the point of use.  */
1781
1782 #define DECODE_COMPOSITION_END(c1)                                      \
1783   do {                                                                  \
1784     if (! COMPOSING_P (coding))                                         \
1785       {                                                                 \
1786         *dst++ = ISO_CODE_ESC;                                          \
1787         *dst++ = c1;                                                    \
1788         coding->produced_char += 2;                                     \
1789       }                                                                 \
1790     else                                                                \
1791       {                                                                 \
1792         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1793         coding->composing = COMPOSITION_NO;                             \
1794       }                                                                 \
1795   } while (0)
1796
1797 /* Decode a composition rule from the byte C1 (and maybe one more byte
1798    from SRC) and store one encoded composition rule in
1799    coding->cmp_data.

        C1 must be a modifiable variable: 32 is subtracted from it in
        place.  With the adjusted value:
          - below 81 (old format), the global and new reference points
            are C1 / 9 and C1 % 9, where a value of 4 is replaced by 10;
          - from 81 to 92 (new format), one more byte is fetched into the
            variable `c2' with ONE_MORE_BYTE, and the rule is encoded
            from C1 - 81 and C2 - 32;
          - otherwise the rule stays 0.
        The rule is then added as a composition component and
        composition_rule_follows is cleared.  The variables `coding' and
        `c2' must be in scope at the point of use.

        NOTE(review): an adjusted C1 of 93 or more records rule 0 without
        any diagnostic -- confirm that this is intended.  */
1800
1801 #define DECODE_COMPOSITION_RULE(c1)                                     \
1802   do {                                                                  \
1803     int rule = 0;                                                       \
1804     (c1) -= 32;                                                         \
1805     if (c1 < 81)                /* old format (before ver.21) */        \
1806       {                                                                 \
1807         int gref = (c1) / 9;                                            \
1808         int nref = (c1) % 9;                                            \
1809         if (gref == 4) gref = 10;                                       \
1810         if (nref == 4) nref = 10;                                       \
1811         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1812       }                                                                 \
1813     else if (c1 < 93)           /* new format (after ver.21) */         \
1814       {                                                                 \
1815         ONE_MORE_BYTE (c2);                                             \
1816         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1817       }                                                                 \
1818     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1819     coding->composition_rule_follows = 0;                               \
1820   } while (0)
1821
1822
1823 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1824
1825 static void
1826 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1827      struct coding_system *coding;
1828      const unsigned char *source;
1829      unsigned char *destination;
1830      int src_bytes, dst_bytes;
1831 {
1832   const unsigned char *src = source;
1833   const unsigned char *src_end = source + src_bytes;
1834   unsigned char *dst = destination;
1835   unsigned char *dst_end = destination + dst_bytes;
1836   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1837   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1838   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1839   /* SRC_BASE remembers the start position in source in each loop.
1840      The loop will be exited when there's not enough source code
1841      (within macro ONE_MORE_BYTE), or when there's not enough
1842      destination area to produce a character (within macro
1843      EMIT_CHAR).  */
1844   const unsigned char *src_base;
1845   int c, charset;
1846   Lisp_Object translation_table;
1847   Lisp_Object safe_chars;
1848
1849   safe_chars = coding_safe_chars (coding->symbol);
1850
1851   if (NILP (Venable_character_translation))
1852     translation_table = Qnil;
1853   else
1854     {
1855       translation_table = coding->translation_table_for_decode;
1856       if (NILP (translation_table))
1857         translation_table = Vstandard_translation_table_for_decode;
1858     }
1859
1860   coding->result = CODING_FINISH_NORMAL;
1861
1862   while (1)
1863     {
1864       int c1, c2 = 0;
1865
1866       src_base = src;
1867       ONE_MORE_BYTE (c1);
1868
1869       /* We produce no character or one character.  */
1870       switch (iso_code_class [c1])
1871         {
1872         case ISO_0x20_or_0x7F:
1873           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1874             {
1875               DECODE_COMPOSITION_RULE (c1);
1876               continue;
1877             }
1878           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1879             {
1880               /* This is SPACE or DEL.  */
1881               charset = CHARSET_ASCII;
1882               break;
1883             }
1884           /* This is a graphic character, we fall down ...  */
1885
1886         case ISO_graphic_plane_0:
1887           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1888             {
1889               DECODE_COMPOSITION_RULE (c1);
1890               continue;
1891             }
1892           charset = charset0;
1893           break;
1894
1895         case ISO_0xA0_or_0xFF:
1896           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1897               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1898             goto label_invalid_code;
1899           /* This is a graphic character, we fall down ... */
1900
1901         case ISO_graphic_plane_1:
1902           if (charset1 < 0)
1903             goto label_invalid_code;
1904           charset = charset1;
1905           break;
1906
1907         case ISO_control_0:
1908           if (COMPOSING_P (coding))
1909             DECODE_COMPOSITION_END ('1');
1910
1911           /* All ISO2022 control characters in this class have the
1912              same representation in Emacs internal format.  */
1913           if (c1 == '\n'
1914               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1915               && (coding->eol_type == CODING_EOL_CR
1916                   || coding->eol_type == CODING_EOL_CRLF))
1917             {
1918               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1919               goto label_end_of_loop;
1920             }
1921           charset = CHARSET_ASCII;
1922           break;
1923
1924         case ISO_control_1:
1925           if (COMPOSING_P (coding))
1926             DECODE_COMPOSITION_END ('1');
1927           goto label_invalid_code;
1928
1929         case ISO_carriage_return:
1930           if (COMPOSING_P (coding))
1931             DECODE_COMPOSITION_END ('1');
1932
1933           if (coding->eol_type == CODING_EOL_CR)
1934             c1 = '\n';
1935           else if (coding->eol_type == CODING_EOL_CRLF)
1936             {
1937               ONE_MORE_BYTE (c1);
1938               if (c1 != ISO_CODE_LF)
1939                 {
1940                   src--;
1941                   c1 = '\r';
1942                 }
1943             }
1944           charset = CHARSET_ASCII;
1945           break;
1946
1947         case ISO_shift_out:
1948           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1949               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1950             goto label_invalid_code;
1951           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1952           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1953           continue;
1954
1955         case ISO_shift_in:
1956           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1957             goto label_invalid_code;
1958           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1959           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1960           continue;
1961
1962         case ISO_single_shift_2_7:
1963         case ISO_single_shift_2:
1964           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1965             goto label_invalid_code;
1966           /* SS2 is handled as an escape sequence of ESC 'N' */
1967           c1 = 'N';
1968           goto label_escape_sequence;
1969
1970         case ISO_single_shift_3:
1971           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1972             goto label_invalid_code;
1973           /* SS2 is handled as an escape sequence of ESC 'O' */
1974           c1 = 'O';
1975           goto label_escape_sequence;
1976
1977         case ISO_control_sequence_introducer:
1978           /* CSI is handled as an escape sequence of ESC '[' ...  */
1979           c1 = '[';
1980           goto label_escape_sequence;
1981
1982         case ISO_escape:
1983           ONE_MORE_BYTE (c1);
1984         label_escape_sequence:
1985           /* Escape sequences handled by Emacs are invocation,
1986              designation, direction specification, and character
1987              composition specification.  */
1988           switch (c1)
1989             {
1990             case '&':           /* revision of following character set */
1991               ONE_MORE_BYTE (c1);
1992               if (!(c1 >= '@' && c1 <= '~'))
1993                 goto label_invalid_code;
1994               ONE_MORE_BYTE (c1);
1995               if (c1 != ISO_CODE_ESC)
1996                 goto label_invalid_code;
1997               ONE_MORE_BYTE (c1);
1998               goto label_escape_sequence;
1999
2000             case '$':           /* designation of 2-byte character set */
2001               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2002                 goto label_invalid_code;
2003               ONE_MORE_BYTE (c1);
2004               if (c1 >= '@' && c1 <= 'B')
2005                 {       /* designation of JISX0208.1978, GB2312.1980,
2006                            or JISX0208.1980 */
2007                   DECODE_DESIGNATION (0, 2, 94, c1);
2008                 }
2009               else if (c1 >= 0x28 && c1 <= 0x2B)
2010                 {       /* designation of DIMENSION2_CHARS94 character set */
2011                   ONE_MORE_BYTE (c2);
2012                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2013                 }
2014               else if (c1 >= 0x2C && c1 <= 0x2F)
2015                 {       /* designation of DIMENSION2_CHARS96 character set */
2016                   ONE_MORE_BYTE (c2);
2017                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2018                 }
2019               else
2020                 goto label_invalid_code;
2021               /* We must update these variables now.  */
2022               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2023               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2024               continue;
2025
2026             case 'n':           /* invocation of locking-shift-2 */
2027               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2028                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2029                 goto label_invalid_code;
2030               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2031               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2032               continue;
2033
2034             case 'o':           /* invocation of locking-shift-3 */
2035               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2036                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2037                 goto label_invalid_code;
2038               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2039               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2040               continue;
2041
2042             case 'N':           /* invocation of single-shift-2 */
2043               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2044                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2045                 goto label_invalid_code;
2046               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2047               ONE_MORE_BYTE (c1);
2048               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2049                 goto label_invalid_code;
2050               break;
2051
2052             case 'O':           /* invocation of single-shift-3 */
2053               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2054                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2055                 goto label_invalid_code;
2056               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2057               ONE_MORE_BYTE (c1);
2058               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2059                 goto label_invalid_code;
2060               break;
2061
2062             case '0': case '2': case '3': case '4': /* start composition */
2063               DECODE_COMPOSITION_START (c1);
2064               continue;
2065
2066             case '1':           /* end composition */
2067               DECODE_COMPOSITION_END (c1);
2068               continue;
2069
2070             case '[':           /* specification of direction */
2071               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2072                 goto label_invalid_code;
2073               /* For the moment, nested direction is not supported.
2074                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2075                  left-to-right, and nonzero means right-to-left.  */
2076               ONE_MORE_BYTE (c1);
2077               switch (c1)
2078                 {
2079                 case ']':       /* end of the current direction */
2080                   coding->mode &= ~CODING_MODE_DIRECTION;
2081
2082                 case '0':       /* end of the current direction */
2083                 case '1':       /* start of left-to-right direction */
2084                   ONE_MORE_BYTE (c1);
2085                   if (c1 == ']')
2086                     coding->mode &= ~CODING_MODE_DIRECTION;
2087                   else
2088                     goto label_invalid_code;
2089                   break;
2090
2091                 case '2':       /* start of right-to-left direction */
2092                   ONE_MORE_BYTE (c1);
2093                   if (c1 == ']')
2094                     coding->mode |= CODING_MODE_DIRECTION;
2095                   else
2096                     goto label_invalid_code;
2097                   break;
2098
2099                 default:
2100                   goto label_invalid_code;
2101                 }
2102               continue;
2103
2104             case '%':
2105               if (COMPOSING_P (coding))
2106                 DECODE_COMPOSITION_END ('1');
2107               ONE_MORE_BYTE (c1);
2108               if (c1 == '/')
2109                 {
2110                   /* CTEXT extended segment:
2111                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2112                      We keep these bytes as is for the moment.
2113                      They may be decoded by post-read-conversion.  */
2114                   int dim, M, L;
2115                   int size, required;
2116                   int produced_chars;
2117
2118                   ONE_MORE_BYTE (dim);
2119                   ONE_MORE_BYTE (M);
2120                   ONE_MORE_BYTE (L);
2121                   size = ((M - 128) * 128) + (L - 128);
2122                   required = 8 + size * 2;
2123                   if (dst + required > (dst_bytes ? dst_end : src))
2124                     goto label_end_of_loop;
2125                   *dst++ = ISO_CODE_ESC;
2126                   *dst++ = '%';
2127                   *dst++ = '/';
2128                   *dst++ = dim;
2129                   produced_chars = 4;
2130                   dst += CHAR_STRING (M, dst), produced_chars++;
2131                   dst += CHAR_STRING (L, dst), produced_chars++;
2132                   while (size-- > 0)
2133                     {
2134                       ONE_MORE_BYTE (c1);
2135                       dst += CHAR_STRING (c1, dst), produced_chars++;
2136                     }
2137                   coding->produced_char += produced_chars;
2138                 }
2139               else if (c1 == 'G')
2140                 {
2141                   unsigned char *d = dst;
2142                   int produced_chars;
2143
2144                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2145                      ESC % G --UTF-8-BYTES-- ESC % @
2146                      We keep these bytes as is for the moment.
2147                      They may be decoded by post-read-conversion.  */
2148                   if (d + 6 > (dst_bytes ? dst_end : src))
2149                     goto label_end_of_loop;
2150                   *d++ = ISO_CODE_ESC;
2151                   *d++ = '%';
2152                   *d++ = 'G';
2153                   produced_chars = 3;
2154                   while (d + 1 < (dst_bytes ? dst_end : src))
2155                     {
2156                       ONE_MORE_BYTE (c1);
2157                       if (c1 == ISO_CODE_ESC
2158                           && src + 1 < src_end
2159                           && src[0] == '%'
2160                           && src[1] == '@')
2161                         {
2162                           src += 2;
2163                           break;
2164                         }
2165                       d += CHAR_STRING (c1, d), produced_chars++;
2166                     }
2167                   if (d + 3 > (dst_bytes ? dst_end : src))
2168                     goto label_end_of_loop;
2169                   *d++ = ISO_CODE_ESC;
2170                   *d++ = '%';
2171                   *d++ = '@';
2172                   dst = d;
2173                   coding->produced_char += produced_chars + 3;
2174                 }
2175               else
2176                 goto label_invalid_code;
2177               continue;
2178
2179             default:
2180               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2181                 goto label_invalid_code;
2182               if (c1 >= 0x28 && c1 <= 0x2B)
2183                 {       /* designation of DIMENSION1_CHARS94 character set */
2184                   ONE_MORE_BYTE (c2);
2185                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2186                 }
2187               else if (c1 >= 0x2C && c1 <= 0x2F)
2188                 {       /* designation of DIMENSION1_CHARS96 character set */
2189                   ONE_MORE_BYTE (c2);
2190                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2191                 }
2192               else
2193                 goto label_invalid_code;
2194               /* We must update these variables now.  */
2195               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2196               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2197               continue;
2198             }
2199         }
2200
2201       /* Now we know CHARSET and 1st position code C1 of a character.
2202          Produce a multibyte sequence for that character while getting
2203          2nd position code C2 if necessary.  */
2204       if (CHARSET_DIMENSION (charset) == 2)
2205         {
2206           ONE_MORE_BYTE (c2);
2207           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2208             /* C2 is not in a valid range.  */
2209             goto label_invalid_code;
2210         }
2211       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2212       EMIT_CHAR (c);
2213       continue;
2214
2215     label_invalid_code:
2216       coding->errors++;
2217       if (COMPOSING_P (coding))
2218         DECODE_COMPOSITION_END ('1');
2219       src = src_base;
2220       c = *src++;
2221       if (! NILP (translation_table))
2222         c = translate_char (translation_table, c, 0, 0, 0);
2223       EMIT_CHAR (c);
2224     }
2225
2226  label_end_of_loop:
2227   coding->consumed = coding->consumed_char = src_base - source;
2228   coding->produced = dst - destination;
2229   return;
2230 }
2231
2232
2233 /* ISO2022 encoding stuff.  */
2234
/*
   Just saying "ISO2022" is not enough when encoding; we have to
   specify more details.  In Emacs, each ISO2022 coding system
   variant has the following specifications:
        1. Initial designation to G0 through G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use single-shift?
   And the following two are only for Japanese:
        8. Use ASCII in place of JIS0201-1976-Roman?
        9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in `coding->flags' as flag bits
   defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
   details.
*/
2253
/* Write at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past the bytes written, and record the
   new designation in CODING.

   If the revision number CODING holds for CHARSET is below 255, the
   sequence `ESC & <'@' + revision>' is written first.  The
   intermediate byte selecting REG is left out (short form) only for
   a non-one-dimensional 94-character set designated to register 0
   whose <final-char> is '@', 'A', or 'B', and only when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);        \
    const char *inter_94 = "()*+";                                      \
    const char *inter_96 = ",-./";                                      \
    int rev = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);         \
    int multi_dim;                                                      \
                                                                        \
    if (rev < 255)                                                      \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + rev;                                             \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    multi_dim = CHARSET_DIMENSION (charset) != 1;                       \
    if (multi_dim)                                                      \
      *dst++ = '$';                                                     \
    if (CHARSET_CHARS (charset) != 94)                                  \
      *dst++ = (unsigned char) (inter_96[reg]);                         \
    else if (! (multi_dim                                               \
                && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)         \
                && reg == 0                                             \
                && final_byte >= '@' && final_byte <= 'B'))             \
      /* Not eligible for the short form: name the register.  */       \
      *dst++ = (unsigned char) (inter_94[reg]);                         \
    *dst++ = final_byte;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2296
/* The two macros below write the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3) at DST.  When `coding->flags'
   has CODING_FLAG_ISO_SEVEN_BITS set, the two-byte escape sequence is
   written; otherwise the one-byte control code is.  Either way the
   single-shifting state of CODING is set to 1.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2318
/* The four macros below write the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3) at DST
   and record in CODING which graphic register (0, 1, 2, or 3
   respectively) is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI;                       \
  } while (0)

#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO;                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'n';                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'o';                               \
  } while (0)
2346
/* Write at DST the byte for a DIMENSION1 character of character set
   CHARSET whose position-code is C1.  If CHARSET is not currently
   reachable, encode_invocation_designation is called to write the
   needed designation and invocation sequences first, and the loop
   runs again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                     \
      {                                                                 \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            break;                                                      \
          }                                                             \
        /* CHARSET is on neither graphic plane yet: invoke it, after   \
           designating it to some graphic register if necessary, then  \
           go round again to write the character itself.  */           \
        dst = encode_invocation_designation (charset, coding, dst);     \
        continue;                                                       \
      }                                                                 \
    /* A single-shift was just written; it covers this one character   \
       only, so clear the state afterwards.  */                        \
    *dst++ = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS                \
              ? c1 & 0x7F : c1 | 0x80);                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                       \
    break;                                                              \
  } while (1)
2379
/* Write at DST the two bytes for a DIMENSION2 character of character
   set CHARSET whose position-codes are C1 and C2.  If CHARSET is not
   currently reachable, encode_invocation_designation is called to
   write the needed designation and invocation sequences first, and
   the loop runs again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                     \
      {                                                                 \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
            break;                                                      \
          }                                                             \
        /* CHARSET is on neither graphic plane yet: invoke it, after   \
           designating it to some graphic register if necessary, then  \
           go round again to write the character itself.  */           \
        dst = encode_invocation_designation (charset, coding, dst);     \
        continue;                                                       \
      }                                                                 \
    /* A single-shift was just written; it covers this one character   \
       only, so clear the state afterwards.  */                        \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                     \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                       \
    break;                                                              \
  } while (1)
2412
/* Write at DST the ISO2022 codes for character C.  C is split into a
   character set and position codes with SPLIT_CHAR.  If that
   character set is not defined, the position codes are written as
   they are (the second one only when it is non-negative).  Otherwise
   the dimension-specific macro does the work, after two
   substitutions controlled by `coding->flags': ASCII is replaced by
   charset_latin_jisx0201 under CODING_FLAG_ISO_USE_ROMAN, and
   charset_jisx0208 by charset_jisx0208_1978 under
   CODING_FLAG_ISO_USE_OLDJIS.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int iso_cs, iso_c1, iso_c2;                                         \
                                                                        \
    SPLIT_CHAR (c, iso_cs, iso_c1, iso_c2);                             \
    if (! CHARSET_DEFINED_P (iso_cs))                                   \
      {                                                                 \
        *dst++ = iso_c1;                                                \
        if (iso_c2 >= 0)                                                \
          *dst++ = iso_c2;                                              \
      }                                                                 \
    else if (CHARSET_DIMENSION (iso_cs) != 1)                           \
      {                                                                 \
        if (iso_cs == charset_jisx0208                                  \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)              \
          iso_cs = charset_jisx0208_1978;                               \
        ENCODE_ISO_CHARACTER_DIMENSION2 (iso_cs, iso_c1, iso_c2);       \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (iso_cs == CHARSET_ASCII                                     \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)               \
          iso_cs = charset_latin_jisx0201;                              \
        ENCODE_ISO_CHARACTER_DIMENSION1 (iso_cs, iso_c1);               \
      }                                                                 \
  } while (0)
2442
2443
/* Do not encode character C itself.  Encode
   CODING_REPLACEMENT_CHARACTER in its place: twice when the width of
   C's character set is greater than 1, once otherwise.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int n_marks = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1;         \
                                                                        \
    while (n_marks-- > 0)                                               \
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);              \
  } while (0)
2452
2453
2454 /* Produce designation and invocation codes at a place pointed by DST
2455    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2456    Return new DST.  */
2457
2458 unsigned char *
2459 encode_invocation_designation (charset, coding, dst)
2460      int charset;
2461      struct coding_system *coding;
2462      unsigned char *dst;
2463 {
2464   int reg;                      /* graphic register number */
2465
2466   /* At first, check designations.  */
2467   for (reg = 0; reg < 4; reg++)
2468     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2469       break;
2470
2471   if (reg >= 4)
2472     {
2473       /* CHARSET is not yet designated to any graphic registers.  */
2474       /* At first check the requested designation.  */
2475       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2476       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2477         /* Since CHARSET requests no special designation, designate it
2478            to graphic register 0.  */
2479         reg = 0;
2480
2481       ENCODE_DESIGNATION (charset, reg, coding);
2482     }
2483
2484   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2485       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2486     {
2487       /* Since the graphic register REG is not invoked to any graphic
2488          planes, invoke it to graphic plane 0.  */
2489       switch (reg)
2490         {
2491         case 0:                 /* graphic register 0 */
2492           ENCODE_SHIFT_IN;
2493           break;
2494
2495         case 1:                 /* graphic register 1 */
2496           ENCODE_SHIFT_OUT;
2497           break;
2498
2499         case 2:                 /* graphic register 2 */
2500           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2501             ENCODE_SINGLE_SHIFT_2;
2502           else
2503             ENCODE_LOCKING_SHIFT_2;
2504           break;
2505
2506         case 3:                 /* graphic register 3 */
2507           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2508             ENCODE_SINGLE_SHIFT_3;
2509           else
2510             ENCODE_LOCKING_SHIFT_3;
2511           break;
2512         }
2513     }
2514
2515   return dst;
2516 }
2517
/* Write at DST the 2-byte code for encoded composition rule RULE.
   RULE is taken apart with COMPOSITION_DECODE_RULE; the first byte is
   32 + 81 plus the first value it yields, and the second byte is 32
   plus the second value.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int rule_gref, rule_nref;                           \
                                                        \
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref); \
    dst[0] = 32 + 81 + rule_gref;                       \
    dst[1] = 32 + rule_nref;                            \
    dst += 2;                                           \
  } while (0)
2527
/* Write at DST the code that marks the start of a composition
   sequence (ESC 0, ESC 3, or ESC 4).  DATA points to an array of
   integers describing the composition; see the comment in coding.h
   for its format.  DATA[3] becomes `coding->composing'.  For any
   method other than COMPOSITION_RELATIVE, the component index of
   CODING is also set to `cmp_data_start + 4' and
   `composition_rule_follows' is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing != COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '4';                                                 \
        else                                                            \
          *dst++ = '3';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2547
/* Write at DST the code that marks the end of the current composition
   (ESC 1), mark CODING as no longer composing, and advance
   `coding->cmp_data_start' by DATA[0].  If that reaches the used
   length of the current composition data block and a next block
   exists, switch CODING to the next block and restart at 0.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    *dst++ = ISO_CODE_ESC;                                      \
    *dst++ = '1';                                               \
    coding->composing = COMPOSITION_NO;                         \
    coding->cmp_data_start += data[0];                          \
    if (coding->cmp_data->next                                  \
        && coding->cmp_data_start == coding->cmp_data->used)    \
      {                                                         \
        coding->cmp_data_start = 0;                             \
        coding->cmp_data = coding->cmp_data->next;              \
      }                                                         \
  } while (0)
2563
/* Write the composition start sequence ESC 0 at DST and mark CODING
   as composing with COMPOSITION_RELATIVE.  Here the sequence does not
   open a new composition: it means that the components (alternate
   chars and composition rules) of the composition have just been
   produced and that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
2575
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes the control sequence
   introducer at DST: the two bytes `ESC [' when `coding->flags' has
   CODING_FLAG_ISO_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  The flag is tested with `&', like every other use of
   `coding->flags', because other flag bits may be set as well.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write the introducer
   followed by `2 ]' and `0 ]' respectively.  Each is a single
   statement, so it can be used as the body of an `if' or `else'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2591
/* Write at DST the invocation and designation codes that bring the
   graphic planes and registers back to their initial state: a
   shift-in if graphic plane 0 does not have register 0 invoked, then,
   for each register that has an initial designation (a value >= 0)
   different from its current one, a designation of the initial
   charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int gr = 0;                                                             \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (gr < 4)                                                          \
      {                                                                     \
        int init_cs = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, gr);     \
                                                                            \
        if (init_cs >= 0                                                    \
            && CODING_SPEC_ISO_DESIGNATION (coding, gr) != init_cs)         \
          ENCODE_DESIGNATION (init_cs, gr, coding);                         \
        gr++;                                                               \
      }                                                                     \
  } while (0)
2606
2607 /* Produce designation sequences of charsets in the line started from
2608    SRC to a place pointed by DST, and return updated DST.
2609
2610    If the current block ends before any end-of-line, we may fail to
2611    find all the necessary designations.  */
2612
2613 static unsigned char *
2614 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2615      struct coding_system *coding;
2616      Lisp_Object translation_table;
2617      const unsigned char *src, *src_end;
2618      unsigned char *dst;
2619 {
2620   int charset, c, found = 0, reg;
2621   /* Table of charsets to be designated to each graphic register.  */
2622   int r[4];
2623
2624   for (reg = 0; reg < 4; reg++)
2625     r[reg] = -1;
2626
2627   while (found < 4)
2628     {
2629       ONE_MORE_CHAR (c);
2630       if (c == '\n')
2631         break;
2632
2633       charset = CHAR_CHARSET (c);
2634       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2635       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2636         {
2637           found++;
2638           r[reg] = charset;
2639         }
2640     }
2641
2642  label_end_of_loop:
2643   if (found)
2644     {
2645       for (reg = 0; reg < 4; reg++)
2646         if (r[reg] >= 0
2647             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2648           ENCODE_DESIGNATION (r[reg], reg, coding);
2649     }
2650
2651   return dst;
2652 }
2653
2654 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2655
2656 static void
2657 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2658      struct coding_system *coding;
2659      const unsigned char *source;
2660      unsigned char *destination;
2661      int src_bytes, dst_bytes;
2662 {
2663   const unsigned char *src = source;
2664   const unsigned char *src_end = source + src_bytes;
2665   unsigned char *dst = destination;
2666   unsigned char *dst_end = destination + dst_bytes;
2667   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2668      from DST_END to assure overflow checking is necessary only at the
2669      head of loop.  */
2670   unsigned char *adjusted_dst_end = dst_end - 19;
2671   /* SRC_BASE remembers the start position in source in each loop.
2672      The loop will be exited when there's not enough source text to
2673      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2674      there's not enough destination area to produce encoded codes
2675      (within macro EMIT_BYTES).  */
2676   const unsigned char *src_base;
2677   int c;
2678   Lisp_Object translation_table;
2679   Lisp_Object safe_chars;
2680
2681   if (coding->flags & CODING_FLAG_ISO_SAFE)
2682     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2683
2684   safe_chars = coding_safe_chars (coding->symbol);
2685
2686   if (NILP (Venable_character_translation))
2687     translation_table = Qnil;
2688   else
2689     {
2690       translation_table = coding->translation_table_for_encode;
2691       if (NILP (translation_table))
2692         translation_table = Vstandard_translation_table_for_encode;
2693     }
2694
2695   coding->consumed_char = 0;
2696   coding->errors = 0;
2697   while (1)
2698     {
2699       src_base = src;
2700
2701       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2702         {
2703           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2704           break;
2705         }
2706
2707       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2708           && CODING_SPEC_ISO_BOL (coding))
2709         {
2710           /* We have to produce designation sequences if any now.  */
2711           dst = encode_designation_at_bol (coding, translation_table,
2712                                            src, src_end, dst);
2713           CODING_SPEC_ISO_BOL (coding) = 0;
2714         }
2715
2716       /* Check composition start and end.  */
2717       if (coding->composing != COMPOSITION_DISABLED
2718           && coding->cmp_data_start < coding->cmp_data->used)
2719         {
2720           struct composition_data *cmp_data = coding->cmp_data;
2721           int *data = cmp_data->data + coding->cmp_data_start;
2722           int this_pos = cmp_data->char_offset + coding->consumed_char;
2723
2724           if (coding->composing == COMPOSITION_RELATIVE)
2725             {
2726               if (this_pos == data[2])
2727                 {
2728                   ENCODE_COMPOSITION_END (coding, data);
2729                   cmp_data = coding->cmp_data;
2730                   data = cmp_data->data + coding->cmp_data_start;
2731                 }
2732             }
2733           else if (COMPOSING_P (coding))
2734             {
2735               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2736               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2737                 /* We have consumed components of the composition.
2738                    What follows in SRC is the composition's base
2739                    text.  */
2740                 ENCODE_COMPOSITION_FAKE_START (coding);
2741               else
2742                 {
2743                   int c = cmp_data->data[coding->cmp_data_index++];
2744                   if (coding->composition_rule_follows)
2745                     {
2746                       ENCODE_COMPOSITION_RULE (c);
2747                       coding->composition_rule_follows = 0;
2748                     }
2749                   else
2750                     {
2751                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2752                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2753                         ENCODE_UNSAFE_CHARACTER (c);
2754                       else
2755                         ENCODE_ISO_CHARACTER (c);
2756                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2757                         coding->composition_rule_follows = 1;
2758                     }
2759                   continue;
2760                 }
2761             }
2762           if (!COMPOSING_P (coding))
2763             {
2764               if (this_pos == data[1])
2765                 {
2766                   ENCODE_COMPOSITION_START (coding, data);
2767                   continue;
2768                 }
2769             }
2770         }
2771
2772       ONE_MORE_CHAR (c);
2773
2774       /* Now encode the character C.  */
2775       if (c < 0x20 || c == 0x7F)
2776         {
2777           if (c == '\r')
2778             {
2779               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2780                 {
2781                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2782                     ENCODE_RESET_PLANE_AND_REGISTER;
2783                   *dst++ = c;
2784                   continue;
2785                 }
2786               /* fall down to treat '\r' as '\n' ...  */
2787               c = '\n';
2788             }
2789           if (c == '\n')
2790             {
2791               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2792                 ENCODE_RESET_PLANE_AND_REGISTER;
2793               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2794                 bcopy (coding->spec.iso2022.initial_designation,
2795                        coding->spec.iso2022.current_designation,
2796                        sizeof coding->spec.iso2022.initial_designation);
2797               if (coding->eol_type == CODING_EOL_LF
2798                   || coding->eol_type == CODING_EOL_UNDECIDED)
2799                 *dst++ = ISO_CODE_LF;
2800               else if (coding->eol_type == CODING_EOL_CRLF)
2801                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2802               else
2803                 *dst++ = ISO_CODE_CR;
2804               CODING_SPEC_ISO_BOL (coding) = 1;
2805             }
2806           else
2807             {
2808               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2809                 ENCODE_RESET_PLANE_AND_REGISTER;
2810               *dst++ = c;
2811             }
2812         }
2813       else if (ASCII_BYTE_P (c))
2814         ENCODE_ISO_CHARACTER (c);
2815       else if (SINGLE_BYTE_CHAR_P (c))
2816         {
2817           *dst++ = c;
2818           coding->errors++;
2819         }
2820       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2821                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2822         ENCODE_UNSAFE_CHARACTER (c);
2823       else
2824         ENCODE_ISO_CHARACTER (c);
2825
2826       coding->consumed_char++;
2827     }
2828
2829  label_end_of_loop:
2830   coding->consumed = src_base - source;
2831   coding->produced = coding->produced_char = dst - destination;
2832 }
2833
2834 \f
2835 /*** 4. SJIS and BIG5 handlers ***/
2836
2837 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2838    quite widely.  So, for the moment, Emacs supports them in the bare
2839    C code.  But, in the future, they may be supported only by CCL.  */
2840
2841 /* SJIS is a coding system encoding three character sets: ASCII, right
2842    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2843    as is.  A character of charset katakana-jisx0201 is encoded by
2844    "position-code + 0x80".  A character of charset japanese-jisx0208
2845    is encoded in 2-byte but two position-codes are divided and shifted
2846    so that it fits in the range below.
2847
2848    --- CODE RANGE of SJIS ---
2849    (character set)      (range)
2850    ASCII                0x00 .. 0x7F
2851    KATAKANA-JISX0201    0xA1 .. 0xDF
2852    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2853             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2854    -------------------------------
2855
2856 */
2857
2858 /* BIG5 is a coding system encoding two character sets: ASCII and
2859    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2860    character set and is encoded in two bytes.
2861
2862    --- CODE RANGE of BIG5 ---
2863    (character set)      (range)
2864    ASCII                0x00 .. 0x7F
2865    Big5 (1st byte)      0xA1 .. 0xFE
2866         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2867    --------------------------
2868
2869    Since the number of characters in Big5 is larger than maximum
2870    characters in Emacs' charset (96x96), it can't be handled as one
2871    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2872    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2873    contains frequently used characters and the latter contains less
2874    frequently used characters.  */
2875
2876 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2877    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2878    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2879    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2880
2881 /* Number of Big5 characters which have the same code in 1st byte.  */
2882 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40) /* 94 + 63 = 157 */
2883
2884 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
2885   do { /* All reads of B1/B2 precede the stores to C1/C2.  */           \
2886     unsigned int temp /* linear index of (B1, B2): row * 157 + column */ \
2887       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
2888     if (b1 < 0xC9) /* rows below 0xC9 go to charset_big5_1 */           \
2889       charset = charset_big5_1;                                         \
2890     else                                                                \
2891       {                                                                 \
2892         charset = charset_big5_2;                                       \
2893         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; /* rebase on row 0xC9 */ \
2894       }                                                                 \
2895     c1 = temp / (0xFF - 0xA1) + 0x21; /* split by 94, offset 0x21 */    \
2896     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
2897   } while (0)
2898
2899 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
2900   do { /* C1/C2 and CHARSET are read before B1/B2 are stored.  */       \
2901     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
2902     if (charset == charset_big5_2) /* skip the rows 0xA1..0xC8 */       \
2903       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
2904     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
2905     b2 = temp % BIG5_SAME_ROW;                                          \
2906     b2 += b2 < 0x3F ? 0x40 : 0x62; /* 0..62 -> 0x40.., 63.. -> 0xA1.. */ \
2907   } while (0)
2908
2909 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2910    Check if a text is encoded in SJIS.  If it is, return
2911    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2912
2913 static int
2914 detect_coding_sjis (src, src_end, multibytep)
2915      unsigned char *src, *src_end;
2916      int multibytep;
2917 {
2918   int c;
2919   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE (macro defined elsewhere).  */
2920   struct coding_system dummy_coding;
2921   struct coding_system *coding = &dummy_coding;
2922
2923   while (1) /* NOTE(review): no end-of-input test is visible in this loop;  */
2924     { /* presumably the macro returns its 3rd argument then -- confirm.  */
2925       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2926       if (c < 0x80) /* Below 0x80: accepted as is.  */
2927         continue;
2928       if (c == 0x80 || c == 0xA0 || c > 0xEF) /* Never valid here.  */
2929         return 0;
2930       if (c <= 0x9F || c >= 0xE0) /* 0x81..0x9F or 0xE0..0xEF: 1st of two bytes.  */
2931         {
2932           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2933           if (c < 0x40 || c == 0x7F || c > 0xFC) /* Bad 2nd byte.  */
2934             return 0;
2935         } /* Otherwise C is 0xA1..0xDF and needs no 2nd byte.  */
2936     }
2937 }
2938
2939 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2940    Check if a text is encoded in BIG5.  If it is, return
2941    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2942
2943 static int
2944 detect_coding_big5 (src, src_end, multibytep)
2945      unsigned char *src, *src_end;
2946      int multibytep;
2947 {
2948   int c;
2949   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE (macro defined elsewhere).  */
2950   struct coding_system dummy_coding;
2951   struct coding_system *coding = &dummy_coding;
2952
2953   while (1) /* NOTE(review): no end-of-input test is visible in this loop;  */
2954     { /* presumably the macro returns its 3rd argument then -- confirm.  */
2955       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2956       if (c < 0x80) /* Below 0x80: accepted as is.  */
2957         continue;
2958       if (c < 0xA1 || c > 0xFE) /* 1st byte must be 0xA1..0xFE.  */
2959         return 0;
2960       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2961       if (c < 0x40 || (c > 0x7E && c < 0xA1) || c > 0xFE) /* 2nd: 0x40..0x7E or 0xA1..0xFE.  */
2962         return 0;
2963     }
2964 }
2965
2966 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2967    Check if a text is encoded in UTF-8.  If it is, return
2968    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2969
2970 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)             /* 0xxxxxxx */
2971 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)   /* 10xxxxxx */
2972 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)   /* 110xxxxx */
2973 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)   /* 1110xxxx */
2974 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)   /* 11110xxx */
2975 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)   /* 111110xx */
2976 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)   /* 1111110x */
2977
2978 static int
2979 detect_coding_utf_8 (src, src_end, multibytep)
2980      unsigned char *src, *src_end;
2981      int multibytep;
2982 {
2983   unsigned char c;
2984   int seq_maybe_bytes; /* Number of trailing octets still expected.  */
2985   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE (macro defined elsewhere).  */
2986   struct coding_system dummy_coding;
2987   struct coding_system *coding = &dummy_coding;
2988
2989   while (1) /* NOTE(review): no end-of-input test is visible in this loop;  */
2990     { /* presumably the macro returns its 3rd argument then -- confirm.  */
2991       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
2992       if (UTF_8_1_OCTET_P (c))
2993         continue;
2994       else if (UTF_8_2_OCTET_LEADING_P (c))
2995         seq_maybe_bytes = 1;
2996       else if (UTF_8_3_OCTET_LEADING_P (c))
2997         seq_maybe_bytes = 2;
2998       else if (UTF_8_4_OCTET_LEADING_P (c))
2999         seq_maybe_bytes = 3;
3000       else if (UTF_8_5_OCTET_LEADING_P (c))
3001         seq_maybe_bytes = 4;
3002       else if (UTF_8_6_OCTET_LEADING_P (c))
3003         seq_maybe_bytes = 5;
3004       else /* 10xxxxxx, 0xFE or 0xFF cannot start a sequence.  */
3005         return 0;
3006
3007       do /* Only the 10xxxxxx bit pattern of each trailing octet is tested.  */
3008         {
3009           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3010           if (!UTF_8_EXTRA_OCTET_P (c))
3011             return 0;
3012           seq_maybe_bytes--;
3013         }
3014       while (seq_maybe_bytes > 0);
3015     }
3016 }
3017
3018 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3019    Check if a text is encoded in UTF-16 by examining its first two
3020    bytes: if they are 0xFE 0xFF return CODING_CATEGORY_MASK_UTF_16_BE,
3021    if they are 0xFF 0xFE return CODING_CATEGORY_MASK_UTF_16_LE,
3022    else return 0.  */
3023
3024 #define UTF_16_INVALID_P(val)   \
3025   (((val) == 0xFFFE)            \
3026    || ((val) == 0xFFFF))       /* True for the two code units 0xFFFE and 0xFFFF.  */
3027
3028 #define UTF_16_HIGH_SURROGATE_P(val) \
3029   (((val) & 0xFC00) == 0xD800)  /* True exactly for 0xD800..0xDBFF.  */
3030
3031 #define UTF_16_LOW_SURROGATE_P(val) \
3032   (((val) & 0xFC00) == 0xDC00)  /* True exactly for 0xDC00..0xDFFF.  */
3033
3034 static int
3035 detect_coding_utf_16 (src, src_end, multibytep)
3036      unsigned char *src, *src_end;
3037      int multibytep;
3038 {
3039   unsigned char c1, c2; /* The first two bytes fetched from the text.  */
3040   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3041   struct coding_system dummy_coding;
3042   struct coding_system *coding = &dummy_coding;
3043
3044   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3045   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3046
3047   if ((c1 == 0xFF) && (c2 == 0xFE)) /* 0xFEFF stored low byte first.  */
3048     return CODING_CATEGORY_MASK_UTF_16_LE;
3049   else if ((c1 == 0xFE) && (c2 == 0xFF)) /* 0xFEFF stored high byte first.  */
3050     return CODING_CATEGORY_MASK_UTF_16_BE;
3051   return 0; /* Neither byte pair found: not reported as UTF-16.  */
3052 }
3053
3054 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3055    If SJIS_P is 1, decode SJIS text, else decode BIG5 text.  */
3056
3057 static void
3058 decode_coding_sjis_big5 (coding, source, destination,
3059                          src_bytes, dst_bytes, sjis_p)
3060      struct coding_system *coding;
3061      const unsigned char *source;
3062      unsigned char  *destination;
3063      int src_bytes, dst_bytes;
3064      int sjis_p;
3065 {
3066   const unsigned char *src = source;
3067   const unsigned char *src_end = source + src_bytes;
3068   unsigned char *dst = destination;
3069   unsigned char *dst_end = destination + dst_bytes;
3070   /* SRC_BASE remembers the start position in source in each loop.
3071      The loop will be exited when there's not enough source code
3072      (within macro ONE_MORE_BYTE), or when there's not enough
3073      destination area to produce a character (within macro
3074      EMIT_CHAR).  */
3075   const unsigned char *src_base;
3076   Lisp_Object translation_table;
3077
3078   if (NILP (Venable_character_translation))
3079     translation_table = Qnil;
3080   else
3081     {
3082       translation_table = coding->translation_table_for_decode;
3083       if (NILP (translation_table)) /* Fall back to the standard table.  */
3084         translation_table = Vstandard_translation_table_for_decode;
3085     }
3086
3087   coding->produced_char = 0;
3088   while (1)
3089     {
3090       int c, charset, c1, c2 = 0; /* C2 stays 0 for one-byte codes.  */
3091
3092       src_base = src;
3093       ONE_MORE_BYTE (c1);
3094
3095       if (c1 < 0x80)
3096         {
3097           charset = CHARSET_ASCII;
3098           if (c1 < 0x20)
3099             {
3100               if (c1 == '\r')
3101                 {
3102                   if (coding->eol_type == CODING_EOL_CRLF)
3103                     {
3104                       ONE_MORE_BYTE (c2);
3105                       if (c2 == '\n') /* "\r\n" is decoded as '\n'.  */
3106                         c1 = c2;
3107                       else
3108                         /* To process C2 again, SRC is subtracted by 1.  */
3109                         src--;
3110                     }
3111                   else if (coding->eol_type == CODING_EOL_CR)
3112                     c1 = '\n';
3113                 }
3114               else if (c1 == '\n'
3115                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3116                        && (coding->eol_type == CODING_EOL_CR
3117                            || coding->eol_type == CODING_EOL_CRLF))
3118                 { /* A bare '\n' contradicts the CR / CRLF convention.  */
3119                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3120                   goto label_end_of_loop;
3121                 }
3122             }
3123         }
3124       else
3125         {
3126           if (sjis_p)
3127             {
3128               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3129                 goto label_invalid_code;
3130               if (c1 <= 0x9F || c1 >= 0xE0)
3131                 {
3132                   /* SJIS -> JISX0208 */
3133                   ONE_MORE_BYTE (c2);
3134                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3135                     goto label_invalid_code;
3136                   DECODE_SJIS (c1, c2, c1, c2);
3137                   charset = charset_jisx0208;
3138                 }
3139               else
3140                 /* SJIS -> JISX0201-Kana */
3141                 charset = charset_katakana_jisx0201;
3142             }
3143           else
3144             {
3145               /* BIG5 -> Big5 */
3146               if (c1 < 0xA1 || c1 > 0xFE) /* 0xA0 would underflow in DECODE_BIG5.  */
3147                 goto label_invalid_code;
3148               ONE_MORE_BYTE (c2);
3149               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3150                 goto label_invalid_code;
3151               DECODE_BIG5 (c1, c2, charset, c1, c2);
3152             }
3153         }
3154
3155       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3156       EMIT_CHAR (c);
3157       continue;
3158
3159     label_invalid_code: /* Count an error and emit the 1st byte unchanged.  */
3160       coding->errors++;
3161       src = src_base; /* Resume right after that byte.  */
3162       c = *src++;
3163       EMIT_CHAR (c);
3164     }
3165
3166  label_end_of_loop:
3167   coding->consumed = coding->consumed_char = src_base - source;
3168   coding->produced = dst - destination;
3169   return;
3170 }
3171
3172 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3173    This function can encode charsets `ascii', `katakana-jisx0201',
3174    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3175    are sure that all these charsets are registered as official charset
3176    (i.e. do not have extended leading-codes).  Characters of other
3177    charsets are produced without any encoding.  If SJIS_P is 1, encode
3178    SJIS text, else encode BIG5 text.  */
3179
3180 static void
3181 encode_coding_sjis_big5 (coding, source, destination,
3182                          src_bytes, dst_bytes, sjis_p)
3183      struct coding_system *coding;
3184      unsigned char *source, *destination;
3185      int src_bytes, dst_bytes;
3186      int sjis_p;
3187 {
3188   unsigned char *src = source;
3189   unsigned char *src_end = source + src_bytes;
3190   unsigned char *dst = destination;
3191   unsigned char *dst_end = destination + dst_bytes;
3192   /* SRC_BASE remembers the start position in source in each loop.
3193      The loop will be exited when there's not enough source text to
3194      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3195      there's not enough destination area to produce encoded codes
3196      (within macro EMIT_BYTES).  */
3197   unsigned char *src_base;
3198   Lisp_Object translation_table;
3199
3200   if (NILP (Venable_character_translation))
3201     translation_table = Qnil;
3202   else
3203     {
3204       translation_table = coding->translation_table_for_encode;
3205       if (NILP (translation_table)) /* Fall back to the standard table.  */
3206         translation_table = Vstandard_translation_table_for_encode;
3207     }
3208
3209   while (1)
3210     {
3211       int c, charset, c1, c2;
3212
3213       src_base = src;
3214       ONE_MORE_CHAR (c);
3215
3216       /* Now encode the character C.  */
3217       if (SINGLE_BYTE_CHAR_P (c))
3218         {
3219           switch (c)
3220             {
3221             case '\r':
3222               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3223                 {
3224                   EMIT_ONE_BYTE (c);
3225                   break;
3226                 }
3227               c = '\n'; /* Selective display: handle '\r' as '\n'; fall through.  */
3228             case '\n':
3229               if (coding->eol_type == CODING_EOL_CRLF)
3230                 {
3231                   EMIT_TWO_BYTES ('\r', c);
3232                   break;
3233                 }
3234               else if (coding->eol_type == CODING_EOL_CR)
3235                 c = '\r';
3236             default: /* Also reached by falling through from case '\n'.  */
3237               EMIT_ONE_BYTE (c);
3238             }
3239         }
3240       else
3241         {
3242           SPLIT_CHAR (c, charset, c1, c2);
3243           if (sjis_p)
3244             {
3245               if (charset == charset_jisx0208
3246                   || charset == charset_jisx0208_1978)
3247                 {
3248                   ENCODE_SJIS (c1, c2, c1, c2);
3249                   EMIT_TWO_BYTES (c1, c2);
3250                 }
3251               else if (charset == charset_katakana_jisx0201)
3252                 EMIT_ONE_BYTE (c1 | 0x80); /* position-code + 0x80 */
3253               else if (charset == charset_latin_jisx0201)
3254                 EMIT_ONE_BYTE (c1);
3255               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3256                 { /* One replacement byte, two if CHARSET_WIDTH exceeds 1.  */
3257                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3258                   if (CHARSET_WIDTH (charset) > 1)
3259                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3260                 }
3261               else
3262                 /* There's no way other than producing the internal
3263                    codes as is.  */
3264                 EMIT_BYTES (src_base, src);
3265             }
3266           else
3267             {
3268               if (charset == charset_big5_1 || charset == charset_big5_2)
3269                 {
3270                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3271                   EMIT_TWO_BYTES (c1, c2);
3272                 }
3273               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3274                 { /* One replacement byte, two if CHARSET_WIDTH exceeds 1.  */
3275                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3276                   if (CHARSET_WIDTH (charset) > 1)
3277                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3278                 }
3279               else
3280                 /* There's no way other than producing the internal
3281                    codes as is.  */
3282                 EMIT_BYTES (src_base, src);
3283             }
3284         }
3285       coding->consumed_char++; /* NOTE(review): never reset in this function; presumably cleared by the caller -- confirm.  */
3286     }
3287
3288  label_end_of_loop:
3289   coding->consumed = src_base - source; /* SRC_BASE is the start of the last character attempted.  */
3290   coding->produced = coding->produced_char = dst - destination;
3291 }
3292
3293 \f
3294 /*** 5. CCL handlers ***/
3295
3296 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3297    Check if a text is encoded in a coding system of which
3298    encoder/decoder are written in CCL program.  If it is, return
3299    CODING_CATEGORY_MASK_CCL, else return 0.  */
3300
3301 static int
3302 detect_coding_ccl (src, src_end, multibytep)
3303      unsigned char *src, *src_end;
3304      int multibytep;
3305 {
3306   unsigned char *valid; /* Table indexed by C; a zero entry rejects the text.  */
3307   int c;
3308   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE (macro defined elsewhere).  */
3309   struct coding_system dummy_coding;
3310   struct coding_system *coding = &dummy_coding;
3311
3312   /* No coding system is assigned to coding-category-ccl.  */
3313   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3314     return 0;
3315
3316   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3317   while (1) /* NOTE(review): no end-of-input test is visible in this loop;  */
3318     { /* presumably the macro returns its 3rd argument then -- confirm.  */
3319       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3320       if (! valid[c])
3321         return 0;
3322     }
3323 }
3324
3325 \f
3326 /*** 6. End-of-line handlers ***/
3327
3328 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3329
3330 static void
3331 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3332      struct coding_system *coding;
3333      const unsigned char *source;
3334      unsigned char *destination;
3335      int src_bytes, dst_bytes;
3336 {
3337   const unsigned char *src = source;
3338   unsigned char *dst = destination;
3339   const unsigned char *src_end = src + src_bytes;
3340   unsigned char *dst_end = dst + dst_bytes;
3341   Lisp_Object translation_table;
3342   /* SRC_BASE remembers the start position in source in each loop.
3343      The loop will be exited when there's not enough source code
3344      (within macro ONE_MORE_BYTE), or when there's not enough
3345      destination area to produce a character (within macro
3346      EMIT_CHAR).  */
3347   const unsigned char *src_base;
3348   int c;
3349
3350   translation_table = Qnil; /* NOTE(review): not read below; presumably used inside EMIT_CHAR -- confirm.  */
3351   switch (coding->eol_type) /* Only the end-of-line bytes are converted here.  */
3352     {
3353     case CODING_EOL_CRLF: /* "\r\n" becomes '\n'; other '\r' are kept.  */
3354       while (1)
3355         {
3356           src_base = src;
3357           ONE_MORE_BYTE (c);
3358           if (c == '\r')
3359             {
3360               ONE_MORE_BYTE (c);
3361               if (c != '\n')
3362                 { /* Not a CRLF pair: re-read this byte next time, emit '\r'.  */
3363                   src--;
3364                   c = '\r';
3365                 }
3366             }
3367           else if (c == '\n'
3368                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3369             { /* A bare '\n' contradicts the CRLF convention.  */
3370               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3371               goto label_end_of_loop;
3372             }
3373           EMIT_CHAR (c);
3374         }
3375       break; /* Not reached: the loop above has no normal exit.  */
3376
3377     case CODING_EOL_CR: /* '\r' becomes '\n'.  */
3378       while (1)
3379         {
3380           src_base = src;
3381           ONE_MORE_BYTE (c);
3382           if (c == '\n')
3383             {
3384               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3385                 {
3386                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3387                   goto label_end_of_loop;
3388                 }
3389             }
3390           else if (c == '\r')
3391             c = '\n';
3392           EMIT_CHAR (c);
3393         }
3394       break; /* Not reached: the loop above has no normal exit.  */
3395
3396     default:                    /* no need for EOL handling */
3397       while (1)
3398         {
3399           src_base = src;
3400           ONE_MORE_BYTE (c);
3401           EMIT_CHAR (c);
3402         }
3403     }
3404
3405  label_end_of_loop:
3406   coding->consumed = coding->consumed_char = src_base - source; /* Up to the last unit attempted.  */
3407   coding->produced = dst - destination;
3408   return;
3409 }
3410
3411 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3412    format of end-of-line according to `coding->eol_type'.  It also
3413    converts multibyte form 8-bit characters to unibyte if
3414    CODING->src_multibyte is nonzero.  If `coding->mode &
3415    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3416    also means end-of-line.  */
3417
3418 static void
3419 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3420      struct coding_system *coding;
3421      const unsigned char *source;
3422      unsigned char *destination;
3423      int src_bytes, dst_bytes;
3424 {
3425   const unsigned char *src = source;
3426   unsigned char *dst = destination;
3427   const unsigned char *src_end = src + src_bytes;
3428   unsigned char *dst_end = dst + dst_bytes;
3429   Lisp_Object translation_table;
3430   /* SRC_BASE remembers the start position in source in each loop.
3431      The loop will be exited when there's not enough source text to
3432      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3433      there's not enough destination area to produce encoded codes
3434      (within macro EMIT_BYTES).  */
3435   const unsigned char *src_base;
3436   unsigned char *tmp;
3437   int c;
3438   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3439
3440   translation_table = Qnil;
3441   if (coding->src_multibyte && src_bytes > 0 /* Never read before SOURCE.  */
3442       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3443     { /* A leading code in the last position is left unconsumed.  */
3444       src_end--;
3445       src_bytes--;
3446       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3447     }
3448
3449   if (coding->eol_type == CODING_EOL_CRLF)
3450     {
3451       while (src < src_end)
3452         {
3453           src_base = src;
3454           c = *src++;
3455           if (c >= 0x20)
3456             EMIT_ONE_BYTE (c);
3457           else if (c == '\n' || (c == '\r' && selective_display))
3458             EMIT_TWO_BYTES ('\r', '\n');
3459           else
3460             EMIT_ONE_BYTE (c);
3461         }
3462       src_base = src; /* Whole source consumed.  */
3463     label_end_of_loop:
3464       ;
3465     }
3466   else
3467     {
3468       if (!dst_bytes || src_bytes <= dst_bytes) /* DST_BYTES of 0 is not a limit here.  */
3469         {
3470           safe_bcopy (src, dst, src_bytes);
3471           src_base = src_end;
3472           dst += src_bytes;
3473         }
3474       else
3475         {
3476           if (coding->src_multibyte
3477               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3478             dst_bytes--; /* Do not end the copy on a leading code.  */
3479           safe_bcopy (src, dst, dst_bytes);
3480           src_base = src + dst_bytes;
3481           dst = destination + dst_bytes;
3482           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3483         }
3484       if (coding->eol_type == CODING_EOL_CR)
3485         { /* Rewrite the copied bytes in place.  */
3486           for (tmp = destination; tmp < dst; tmp++)
3487             if (*tmp == '\n') *tmp = '\r';
3488         }
3489       else if (selective_display)
3490         { /* Here '\r' also means end-of-line, so it becomes '\n'.  */
3491           for (tmp = destination; tmp < dst; tmp++)
3492             if (*tmp == '\r') *tmp = '\n';
3493         }
3494     }
3495   if (coding->src_multibyte) /* Convert the produced bytes with str_as_unibyte.  */
3496     dst = destination + str_as_unibyte (destination, dst - destination);
3497
3498   coding->consumed = src_base - source;
3499   coding->produced = dst - destination;
3500   coding->produced_char = coding->produced;
3501 }
3502
3503 \f
3504 /*** 7. C library functions ***/
3505
3506 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3507    has a property `coding-system'.  The value of this property is a
3508    vector of length 5 (called the coding-vector).  Among elements of
3509    this vector, the first (element[0]) and the fifth (element[4])
3510    carry important information for decoding/encoding.  Before
3511    decoding/encoding, this information should be set in fields of a
3512    structure of type `coding_system'.
3513
3514    The value of the property `coding-system' can be a symbol of another
3515    subsidiary coding-system.  In that case, Emacs gets coding-vector
3516    from that symbol.
3517
3518    `element[0]' contains information to be set in `coding->type'.  The
3519    value and its meaning is as follows:
3520
3521    0 -- coding_type_emacs_mule
3522    1 -- coding_type_sjis
3523    2 -- coding_type_iso2022
3524    3 -- coding_type_big5
3525    4 -- coding_type_ccl encoder/decoder written in CCL
3526    nil -- coding_type_no_conversion
3527    t -- coding_type_undecided (automatic conversion on decoding,
3528                                no-conversion on encoding)
3529
3530    `element[4]' contains information to be set in `coding->flags' and
3531    `coding->spec'.  The meaning varies by `coding->type'.
3532
3533    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3534    of length 32 (of which the first 13 sub-elements are used now).
3535    Meanings of these sub-elements are:
3536
3537    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3538         If the value is an integer of valid charset, the charset is
3539         assumed to be designated to graphic register N initially.
3540
3541         If the value is minus, it is a minus value of charset which
3542         reserves graphic register N, which means that the charset is
3543         not designated initially but should be designated to graphic
3544         register N just before encoding a character in that charset.
3545
3546         If the value is nil, graphic register N is never used on
3547         encoding.
3548
3549    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3550         Each value takes t or nil.  See the section ISO2022 of
3551         `coding.h' for more information.
3552
3553    If `coding->type' is `coding_type_big5', element[4] is t to denote
3554    BIG5-ETen or nil to denote BIG5-HKU.
3555
3556    If `coding->type' takes the other value, element[4] is ignored.
3557
3558    Emacs Lisp's coding systems also carry information about format of
3559    end-of-line in a value of property `eol-type'.  If the value is
3560    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3561    means CODING_EOL_CR.  If it is not integer, it should be a vector
3562    of subsidiary coding systems of which property `eol-type' has one
3563    of the above values.
3564
3565 */
3566
3567 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3568    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3569    is setup so that no conversion is necessary and return -1, else
3570    return 0.  */
3571
3572 int
3573 setup_coding_system (coding_system, coding)
3574      Lisp_Object coding_system;
3575      struct coding_system *coding;
3576 {
3577   Lisp_Object coding_spec, coding_type, eol_type, plist;
3578   Lisp_Object val;
3579
3580   /* At first, zero clear all members.  */
3581   bzero (coding, sizeof (struct coding_system));
3582
3583   /* Initialize some fields required for all kinds of coding systems.  */
3584   coding->symbol = coding_system;
3585   coding->heading_ascii = -1;
3586   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3587   coding->composing = COMPOSITION_DISABLED;
3588   coding->cmp_data = NULL;
3589
3590   if (NILP (coding_system))
3591     goto label_invalid_coding_system;
3592
3593   coding_spec = Fget (coding_system, Qcoding_system);
3594
3595   if (!VECTORP (coding_spec)
3596       || XVECTOR (coding_spec)->size != 5
3597       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3598     goto label_invalid_coding_system;
3599
3600   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3601   if (VECTORP (eol_type))
3602     {
3603       coding->eol_type = CODING_EOL_UNDECIDED;
3604       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3605       if (system_eol_type != CODING_EOL_LF)
3606         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3607     }
3608   else if (XFASTINT (eol_type) == 1)
3609     {
3610       coding->eol_type = CODING_EOL_CRLF;
3611       coding->common_flags
3612         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3613     }
3614   else if (XFASTINT (eol_type) == 2)
3615     {
3616       coding->eol_type = CODING_EOL_CR;
3617       coding->common_flags
3618         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3619     }
3620   else
3621     {
3622       coding->common_flags = 0;
3623       coding->eol_type = CODING_EOL_LF;
3624     }
3625
3626   coding_type = XVECTOR (coding_spec)->contents[0];
3627   /* Try short cut.  */
3628   if (SYMBOLP (coding_type))
3629     {
3630       if (EQ (coding_type, Qt))
3631         {
3632           coding->type = coding_type_undecided;
3633           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3634         }
3635       else
3636         coding->type = coding_type_no_conversion;
3637       /* Initialize this member.  Any thing other than
3638          CODING_CATEGORY_IDX_UTF_16_BE and
3639          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3640          special treatment in detect_eol.  */
3641       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3642
3643       return 0;
3644     }
3645
3646   /* Get values of coding system properties:
3647      `post-read-conversion', `pre-write-conversion',
3648      `translation-table-for-decode', `translation-table-for-encode'.  */
3649   plist = XVECTOR (coding_spec)->contents[3];
3650   /* Pre & post conversion functions should be disabled if
3651      inhibit_eol_conversion is nonzero.  This is the case that a code
3652      conversion function is called while those functions are running.  */
3653   if (! inhibit_pre_post_conversion)
3654     {
3655       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3656       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3657     }
3658   val = Fplist_get (plist, Qtranslation_table_for_decode);
3659   if (SYMBOLP (val))
3660     val = Fget (val, Qtranslation_table_for_decode);
3661   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3662   val = Fplist_get (plist, Qtranslation_table_for_encode);
3663   if (SYMBOLP (val))
3664     val = Fget (val, Qtranslation_table_for_encode);
3665   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3666   val = Fplist_get (plist, Qcoding_category);
3667   if (!NILP (val))
3668     {
3669       val = Fget (val, Qcoding_category_index);
3670       if (INTEGERP (val))
3671         coding->category_idx = XINT (val);
3672       else
3673         goto label_invalid_coding_system;
3674     }
3675   else
3676     goto label_invalid_coding_system;
3677
3678   /* If the coding system has non-nil `composition' property, enable
3679      composition handling.  */
3680   val = Fplist_get (plist, Qcomposition);
3681   if (!NILP (val))
3682     coding->composing = COMPOSITION_NO;
3683
3684   /* If the coding system is ascii-incompatible, record it in
3685      common_flags.   */
3686   val = Fplist_get (plist, Qascii_incompatible);
3687   if (! NILP (val))
3688     coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3689
3690   switch (XFASTINT (coding_type))
3691     {
3692     case 0:
3693       coding->type = coding_type_emacs_mule;
3694       coding->common_flags
3695         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3696       if (!NILP (coding->post_read_conversion))
3697         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3698       if (!NILP (coding->pre_write_conversion))
3699         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3700       break;
3701
3702     case 1:
3703       coding->type = coding_type_sjis;
3704       coding->common_flags
3705         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3706       break;
3707
3708     case 2:
3709       coding->type = coding_type_iso2022;
3710       coding->common_flags
3711         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3712       {
3713         Lisp_Object val, temp;
3714         Lisp_Object *flags;
3715         int i, charset, reg_bits = 0;
3716
3717         val = XVECTOR (coding_spec)->contents[4];
3718
3719         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3720           goto label_invalid_coding_system;
3721
3722         flags = XVECTOR (val)->contents;
3723         coding->flags
3724           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3725              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3726              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3727              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3728              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3729              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3730              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3731              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3732              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3733              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3734              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3735              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3736              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3737              );
3738
3739         /* Invoke graphic register 0 to plane 0.  */
3740         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3741         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3742         CODING_SPEC_ISO_INVOCATION (coding, 1)
3743           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3744         /* Not single shifting at first.  */
3745         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3746         /* Beginning of buffer should also be regarded as bol. */
3747         CODING_SPEC_ISO_BOL (coding) = 1;
3748
3749         for (charset = 0; charset <= MAX_CHARSET; charset++)
3750           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3751         val = Vcharset_revision_alist;
3752         while (CONSP (val))
3753           {
3754             charset = get_charset_id (Fcar_safe (XCAR (val)));
3755             if (charset >= 0
3756                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3757                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3758               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3759             val = XCDR (val);
3760           }
3761
3762         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3763            FLAGS[REG] can be one of below:
3764                 integer CHARSET: CHARSET occupies register I,
3765                 t: designate nothing to REG initially, but can be used
3766                   by any charsets,
3767                 list of integer, nil, or t: designate the first
3768                   element (if integer) to REG initially, the remaining
3769                   elements (if integer) is designated to REG on request,
3770                   if an element is t, REG can be used by any charsets,
3771                 nil: REG is never used.  */
3772         for (charset = 0; charset <= MAX_CHARSET; charset++)
3773           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3774             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3775         for (i = 0; i < 4; i++)
3776           {
3777             if ((INTEGERP (flags[i])
3778                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3779                 || (charset = get_charset_id (flags[i])) >= 0)
3780               {
3781                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3782                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3783               }
3784             else if (EQ (flags[i], Qt))
3785               {
3786                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3787                 reg_bits |= 1 << i;
3788                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3789               }
3790             else if (CONSP (flags[i]))
3791               {
3792                 Lisp_Object tail;
3793                 tail = flags[i];
3794
3795                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3796                 if ((INTEGERP (XCAR (tail))
3797                      && (charset = XINT (XCAR (tail)),
3798                          CHARSET_VALID_P (charset)))
3799                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3800                   {
3801                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3802                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3803                   }
3804                 else
3805                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3806                 tail = XCDR (tail);
3807                 while (CONSP (tail))
3808                   {
3809                     if ((INTEGERP (XCAR (tail))
3810                          && (charset = XINT (XCAR (tail)),
3811                              CHARSET_VALID_P (charset)))
3812                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3813                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3814                         = i;
3815                     else if (EQ (XCAR (tail), Qt))
3816                       reg_bits |= 1 << i;
3817                     tail = XCDR (tail);
3818                   }
3819               }
3820             else
3821               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3822
3823             CODING_SPEC_ISO_DESIGNATION (coding, i)
3824               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3825           }
3826
3827         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3828           {
3829             /* REG 1 can be used only by locking shift in 7-bit env.  */
3830             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3831               reg_bits &= ~2;
3832             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3833               /* Without any shifting, only REG 0 and 1 can be used.  */
3834               reg_bits &= 3;
3835           }
3836
3837         if (reg_bits)
3838           for (charset = 0; charset <= MAX_CHARSET; charset++)
3839             {
3840               if (CHARSET_DEFINED_P (charset)
3841                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3842                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3843                 {
3844                   /* There exist some default graphic registers to be
3845                      used by CHARSET.  */
3846
3847                   /* We had better avoid designating a charset of
3848                      CHARS96 to REG 0 as far as possible.  */
3849                   if (CHARSET_CHARS (charset) == 96)
3850                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3851                       = (reg_bits & 2
3852                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3853                   else
3854                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3855                       = (reg_bits & 1
3856                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3857                 }
3858             }
3859       }
3860       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3861       coding->spec.iso2022.last_invalid_designation_register = -1;
3862       break;
3863
3864     case 3:
3865       coding->type = coding_type_big5;
3866       coding->common_flags
3867         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3868       coding->flags
3869         = (NILP (XVECTOR (coding_spec)->contents[4])
3870            ? CODING_FLAG_BIG5_HKU
3871            : CODING_FLAG_BIG5_ETEN);
3872       break;
3873
3874     case 4:
3875       coding->type = coding_type_ccl;
3876       coding->common_flags
3877         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3878       {
3879         val = XVECTOR (coding_spec)->contents[4];
3880         if (! CONSP (val)
3881             || setup_ccl_program (&(coding->spec.ccl.decoder),
3882                                   XCAR (val)) < 0
3883             || setup_ccl_program (&(coding->spec.ccl.encoder),
3884                                   XCDR (val)) < 0)
3885           goto label_invalid_coding_system;
3886
3887         bzero (coding->spec.ccl.valid_codes, 256);
3888         val = Fplist_get (plist, Qvalid_codes);
3889         if (CONSP (val))
3890           {
3891             Lisp_Object this;
3892
3893             for (; CONSP (val); val = XCDR (val))
3894               {
3895                 this = XCAR (val);
3896                 if (INTEGERP (this)
3897                     && XINT (this) >= 0 && XINT (this) < 256)
3898                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3899                 else if (CONSP (this)
3900                          && INTEGERP (XCAR (this))
3901                          && INTEGERP (XCDR (this)))
3902                   {
3903                     int start = XINT (XCAR (this));
3904                     int end = XINT (XCDR (this));
3905
3906                     if (start >= 0 && start <= end && end < 256)
3907                       while (start <= end)
3908                         coding->spec.ccl.valid_codes[start++] = 1;
3909                   }
3910               }
3911           }
3912       }
3913       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3914       coding->spec.ccl.cr_carryover = 0;
3915       coding->spec.ccl.eight_bit_carryover[0] = 0;
3916       break;
3917
3918     case 5:
3919       coding->type = coding_type_raw_text;
3920       break;
3921
3922     default:
3923       goto label_invalid_coding_system;
3924     }
3925   return 0;
3926
3927  label_invalid_coding_system:
3928   coding->type = coding_type_no_conversion;
3929   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3930   coding->common_flags = 0;
3931   coding->eol_type = CODING_EOL_UNDECIDED;
3932   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3933   return NILP (coding_system) ? 0 : -1;
3934 }
3935
3936 /* Free memory blocks allocated for storing composition information.  */
3937
3938 void
3939 coding_free_composition_data (coding)
3940      struct coding_system *coding;
3941 {
3942   struct composition_data *cmp_data = coding->cmp_data, *next;
3943
3944   if (!cmp_data)
3945     return;
3946   /* Memory blocks are chained.  At first, rewind to the first, then,
3947      free blocks one by one.  */
3948   while (cmp_data->prev)
3949     cmp_data = cmp_data->prev;
3950   while (cmp_data)
3951     {
3952       next = cmp_data->next;
3953       xfree (cmp_data);
3954       cmp_data = next;
3955     }
3956   coding->cmp_data = NULL;
3957 }
3958
3959 /* Set `char_offset' member of all memory blocks pointed by
3960    coding->cmp_data to POS.  */
3961
3962 void
3963 coding_adjust_composition_offset (coding, pos)
3964      struct coding_system *coding;
3965      int pos;
3966 {
3967   struct composition_data *cmp_data;
3968
3969   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3970     cmp_data->char_offset = pos;
3971 }
3972
3973 /* Setup raw-text or one of its subsidiaries in the structure
3974    coding_system CODING according to the already setup value eol_type
3975    in CODING.  CODING should be setup for some coding system in
3976    advance.  */
3977
3978 void
3979 setup_raw_text_coding_system (coding)
3980      struct coding_system *coding;
3981 {
3982   if (coding->type != coding_type_raw_text)
3983     {
3984       coding->symbol = Qraw_text;
3985       coding->type = coding_type_raw_text;
3986       if (coding->eol_type != CODING_EOL_UNDECIDED)
3987         {
3988           Lisp_Object subsidiaries;
3989           subsidiaries = Fget (Qraw_text, Qeol_type);
3990
3991           if (VECTORP (subsidiaries)
3992               && XVECTOR (subsidiaries)->size == 3)
3993             coding->symbol
3994               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3995         }
3996       setup_coding_system (coding->symbol, coding);
3997     }
3998   return;
3999 }
4000
4001 /* Emacs has a mechanism to automatically detect a coding system if it
4002    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4003    it's impossible to distinguish some coding systems accurately
4004    because they use the same range of codes.  So, at first, coding
4005    systems are categorized into the following categories:
4006
4007    o coding-category-emacs-mule
4008
4009         The category for a coding system which has the same code range
4010         as Emacs' internal format.  Assigned the coding-system (Lisp
4011         symbol) `emacs-mule' by default.
4012
4013    o coding-category-sjis
4014
4015         The category for a coding system which has the same code range
4016         as SJIS.  Assigned the coding-system (Lisp
4017         symbol) `japanese-shift-jis' by default.
4018
4019    o coding-category-iso-7
4020
4021         The category for a coding system which has the same code range
4022         as ISO2022 of 7-bit environment.  This doesn't use any locking
4023         shift and single shift functions.  This can encode/decode all
4024         charsets.  Assigned the coding-system (Lisp symbol)
4025         `iso-2022-7bit' by default.
4026
4027    o coding-category-iso-7-tight
4028
4029         Same as coding-category-iso-7 except that this can
4030         encode/decode only the specified charsets.
4031
4032    o coding-category-iso-8-1
4033
4034         The category for a coding system which has the same code range
4035         as ISO2022 of 8-bit environment and graphic plane 1 used only
4036         for DIMENSION1 charset.  This doesn't use any locking shift
4037         and single shift functions.  Assigned the coding-system (Lisp
4038         symbol) `iso-latin-1' by default.
4039
4040    o coding-category-iso-8-2
4041
4042         The category for a coding system which has the same code range
4043         as ISO2022 of 8-bit environment and graphic plane 1 used only
4044         for DIMENSION2 charset.  This doesn't use any locking shift
4045         and single shift functions.  Assigned the coding-system (Lisp
4046         symbol) `japanese-iso-8bit' by default.
4047
4048    o coding-category-iso-7-else
4049
4050         The category for a coding system which has the same code range
4051         as ISO2022 of 7-bit environment but uses locking shift or
4052         single shift functions.  Assigned the coding-system (Lisp
4053         symbol) `iso-2022-7bit-lock' by default.
4054
4055    o coding-category-iso-8-else
4056
4057         The category for a coding system which has the same code range
4058         as ISO2022 of 8-bit environment but uses locking shift or
4059         single shift functions.  Assigned the coding-system (Lisp
4060         symbol) `iso-2022-8bit-ss2' by default.
4061
4062    o coding-category-big5
4063
4064         The category for a coding system which has the same code range
4065         as BIG5.  Assigned the coding-system (Lisp symbol)
4066         `cn-big5' by default.
4067
4068    o coding-category-utf-8
4069
4070         The category for a coding system which has the same code range
4071         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4072         symbol) `utf-8' by default.
4073
4074    o coding-category-utf-16-be
4075
4076         The category for a coding system in which a text has a
4077         Unicode signature (cf. Unicode Standard) in the order of BIG
4078         endian at the head.  Assigned the coding-system (Lisp symbol)
4079         `utf-16-be' by default.
4080
4081    o coding-category-utf-16-le
4082
4083         The category for a coding system in which a text has a
4084         Unicode signature (cf. Unicode Standard) in the order of
4085         LITTLE endian at the head.  Assigned the coding-system (Lisp
4086         symbol) `utf-16-le' by default.
4087
4088    o coding-category-ccl
4089
4090         The category for a coding system of which encoder/decoder is
4091         written in CCL programs.  The default value is nil, i.e., no
4092         coding system is assigned.
4093
4094    o coding-category-binary
4095
4096         The category for a coding system not categorized in any of the
4097         above.  Assigned the coding-system (Lisp symbol)
4098         `no-conversion' by default.
4099
4100    Each of them is a Lisp symbol and the value is an actual
4101    `coding-system' (this is also a Lisp symbol) assigned by a user.
4102    What Emacs does actually is to detect a category of coding system.
4103    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4104    decide a single possible category, it selects a category of the
4105    highest priority.  Priorities of categories are also specified by a
4106    user in a Lisp variable `coding-category-list'.
4107
4108 */
4109
/* Table indexed by byte value.  detect_coding_mask skips leading
   bytes whose entry is nonzero, and rewrites the entries for
   ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC while it scans.  The
   other entries are presumably filled in elsewhere in this file
   (TODO confirm).  */
static int ascii_skip_code[256];
4112
4113 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4114    If it detects possible coding systems, return an integer in which
4115    appropriate flag bits are set.  Flag bits are defined by macros
4116    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4117    it should point the table `coding_priorities'.  In that case, only
4118    the flag bit for a coding system of the highest priority is set in
4119    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4120    range 0x80..0x9F are in multibyte form.
4121
4122    How many ASCII characters are at the head is returned as *SKIP.  */
4123
4124 static int
4125 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4126      unsigned char *source;
4127      int src_bytes, *priorities, *skip;
4128      int multibytep;
4129 {
4130   register unsigned char c;
4131   unsigned char *src = source, *src_end = source + src_bytes;
4132   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4133   int i;
4134
4135   /* At first, skip all ASCII characters and control characters except
4136      for three ISO2022 specific control characters.  */
4137   ascii_skip_code[ISO_CODE_SO] = 0;
4138   ascii_skip_code[ISO_CODE_SI] = 0;
4139   ascii_skip_code[ISO_CODE_ESC] = 0;
4140
4141  label_loop_detect_coding:
4142   while (src < src_end && ascii_skip_code[*src]) src++;
4143   *skip = src - source;
4144
4145   if (src >= src_end)
4146     /* We found nothing other than ASCII.  There's nothing to do.  */
4147     return 0;
4148
4149   c = *src;
4150   /* The text seems to be encoded in some multilingual coding system.
4151      Now, try to find in which coding system the text is encoded.  */
4152   if (c < 0x80)
4153     {
4154       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4155       /* C is an ISO2022 specific control code of C0.  */
4156       mask = detect_coding_iso2022 (src, src_end, multibytep);
4157       if (mask == 0)
4158         {
4159           /* No valid ISO2022 code follows C.  Try again.  */
4160           src++;
4161           if (c == ISO_CODE_ESC)
4162             ascii_skip_code[ISO_CODE_ESC] = 1;
4163           else
4164             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4165           goto label_loop_detect_coding;
4166         }
4167       if (priorities)
4168         {
4169           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4170             {
4171               if (mask & priorities[i])
4172                 return priorities[i];
4173             }
4174           return CODING_CATEGORY_MASK_RAW_TEXT;
4175         }
4176     }
4177   else
4178     {
4179       int try;
4180
4181       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4182         c = src[1] - 0x20;
4183
4184       if (c < 0xA0)
4185         {
4186           /* C is the first byte of SJIS character code,
4187              or a leading-code of Emacs' internal format (emacs-mule),
4188              or the first byte of UTF-16.  */
4189           try = (CODING_CATEGORY_MASK_SJIS
4190                   | CODING_CATEGORY_MASK_EMACS_MULE
4191                   | CODING_CATEGORY_MASK_UTF_16_BE
4192                   | CODING_CATEGORY_MASK_UTF_16_LE);
4193
4194           /* Or, if C is a special latin extra code,
4195              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4196              or is an ISO2022 control-sequence-introducer (CSI),
4197              we should also consider the possibility of ISO2022 codings.  */
4198           if ((VECTORP (Vlatin_extra_code_table)
4199                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4200               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4201               || (c == ISO_CODE_CSI
4202                   && (src < src_end
4203                       && (*src == ']'
4204                           || ((*src == '0' || *src == '1' || *src == '2')
4205                               && src + 1 < src_end
4206                               && src[1] == ']')))))
4207             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4208                      | CODING_CATEGORY_MASK_ISO_8BIT);
4209         }
4210       else
4211         /* C is a character of ISO2022 in graphic plane right,
4212            or a SJIS's 1-byte character code (i.e. JISX0201),
4213            or the first byte of BIG5's 2-byte code,
4214            or the first byte of UTF-8/16.  */
4215         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4216                 | CODING_CATEGORY_MASK_ISO_8BIT
4217                 | CODING_CATEGORY_MASK_SJIS
4218                 | CODING_CATEGORY_MASK_BIG5
4219                 | CODING_CATEGORY_MASK_UTF_8
4220                 | CODING_CATEGORY_MASK_UTF_16_BE
4221                 | CODING_CATEGORY_MASK_UTF_16_LE);
4222
4223       /* Or, we may have to consider the possibility of CCL.  */
4224       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4225           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4226               ->spec.ccl.valid_codes)[c])
4227         try |= CODING_CATEGORY_MASK_CCL;
4228
4229       mask = 0;
4230       utf16_examined_p = iso2022_examined_p = 0;
4231       if (priorities)
4232         {
4233           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4234             {
4235               if (!iso2022_examined_p
4236                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4237                 {
4238                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4239                   iso2022_examined_p = 1;
4240                 }
4241               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4242                 mask |= detect_coding_sjis (src, src_end, multibytep);
4243               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4244                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4245               else if (!utf16_examined_p
4246                        && (priorities[i] & try &
4247                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4248                 {
4249                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4250                   utf16_examined_p = 1;
4251                 }
4252               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4253                 mask |= detect_coding_big5 (src, src_end, multibytep);
4254               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4255                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4256               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4257                 mask |= detect_coding_ccl (src, src_end, multibytep);
4258               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4259                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4260               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4261                 mask |= CODING_CATEGORY_MASK_BINARY;
4262               if (mask & priorities[i])
4263                 return priorities[i];
4264             }
4265           return CODING_CATEGORY_MASK_RAW_TEXT;
4266         }
4267       if (try & CODING_CATEGORY_MASK_ISO)
4268         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4269       if (try & CODING_CATEGORY_MASK_SJIS)
4270         mask |= detect_coding_sjis (src, src_end, multibytep);
4271       if (try & CODING_CATEGORY_MASK_BIG5)
4272         mask |= detect_coding_big5 (src, src_end, multibytep);
4273       if (try & CODING_CATEGORY_MASK_UTF_8)
4274         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4275       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4276         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4277       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4278         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4279       if (try & CODING_CATEGORY_MASK_CCL)
4280         mask |= detect_coding_ccl (src, src_end, multibytep);
4281     }
4282   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4283 }
4284
4285 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4286    The information of the detected coding system is set in CODING.  */
4287
4288 void
4289 detect_coding (coding, src, src_bytes)
4290      struct coding_system *coding;
4291      const unsigned char *src;
4292      int src_bytes;
4293 {
4294   unsigned int idx;
4295   int skip, mask;
4296   Lisp_Object val;
4297
4298   val = Vcoding_category_list;
4299   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4300                              coding->src_multibyte);
4301   coding->heading_ascii = skip;
4302
4303   if (!mask) return;
4304
4305   /* We found a single coding system of the highest priority in MASK.  */
4306   idx = 0;
4307   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4308   if (! mask)
4309     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4310
4311   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4312
4313   if (coding->eol_type != CODING_EOL_UNDECIDED)
4314     {
4315       Lisp_Object tmp;
4316
4317       tmp = Fget (val, Qeol_type);
4318       if (VECTORP (tmp))
4319         val = XVECTOR (tmp)->contents[coding->eol_type];
4320     }
4321
4322   /* Setup this new coding system while preserving some slots.  */
4323   {
4324     int src_multibyte = coding->src_multibyte;
4325     int dst_multibyte = coding->dst_multibyte;
4326
4327     setup_coding_system (val, coding);
4328     coding->src_multibyte = src_multibyte;
4329     coding->dst_multibyte = dst_multibyte;
4330     coding->heading_ascii = skip;
4331   }
4332 }
4333
4334 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4335    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4336    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4337
4338    How many non-eol characters are at the head is returned as *SKIP.  */
4339
4340 #define MAX_EOL_CHECK_COUNT 3
4341
4342 static int
4343 detect_eol_type (source, src_bytes, skip)
4344      unsigned char *source;
4345      int src_bytes, *skip;
4346 {
4347   unsigned char *src = source, *src_end = src + src_bytes;
4348   unsigned char c;
4349   int total = 0;                /* How many end-of-lines are found so far.  */
4350   int eol_type = CODING_EOL_UNDECIDED;
4351   int this_eol_type;
4352
4353   *skip = 0;
4354
4355   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4356     {
4357       c = *src++;
4358       if (c == '\n' || c == '\r')
4359         {
4360           if (*skip == 0)
4361             *skip = src - 1 - source;
4362           total++;
4363           if (c == '\n')
4364             this_eol_type = CODING_EOL_LF;
4365           else if (src >= src_end || *src != '\n')
4366             this_eol_type = CODING_EOL_CR;
4367           else
4368             this_eol_type = CODING_EOL_CRLF, src++;
4369
4370           if (eol_type == CODING_EOL_UNDECIDED)
4371             /* This is the first end-of-line.  */
4372             eol_type = this_eol_type;
4373           else if (eol_type != this_eol_type)
4374             {
4375               /* The found type is different from what found before.  */
4376               eol_type = CODING_EOL_INCONSISTENT;
4377               break;
4378             }
4379         }
4380     }
4381
4382   if (*skip == 0)
4383     *skip = src_end - source;
4384   return eol_type;
4385 }
4386
4387 /* Like detect_eol_type, but detect EOL type in 2-octet
4388    big-endian/little-endian format for coding systems utf-16-be and
4389    utf-16-le.  */
4390
4391 static int
4392 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4393      unsigned char *source;
4394      int src_bytes, *skip, big_endian_p;
4395 {
4396   unsigned char *src = source, *src_end = src + src_bytes;
4397   unsigned int c1, c2;
4398   int total = 0;                /* How many end-of-lines are found so far.  */
4399   int eol_type = CODING_EOL_UNDECIDED;
4400   int this_eol_type;
4401   int msb, lsb;
4402
4403   if (big_endian_p)
4404     msb = 0, lsb = 1;
4405   else
4406     msb = 1, lsb = 0;
4407
4408   *skip = 0;
4409
4410   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4411     {
4412       c1 = (src[msb] << 8) | (src[lsb]);
4413       src += 2;
4414
4415       if (c1 == '\n' || c1 == '\r')
4416         {
4417           if (*skip == 0)
4418             *skip = src - 2 - source;
4419           total++;
4420           if (c1 == '\n')
4421             {
4422               this_eol_type = CODING_EOL_LF;
4423             }
4424           else
4425             {
4426               if ((src + 1) >= src_end)
4427                 {
4428                   this_eol_type = CODING_EOL_CR;
4429                 }
4430               else
4431                 {
4432                   c2 = (src[msb] << 8) | (src[lsb]);
4433                   if (c2 == '\n')
4434                     this_eol_type = CODING_EOL_CRLF, src += 2;
4435                   else
4436                     this_eol_type = CODING_EOL_CR;
4437                 }
4438             }
4439
4440           if (eol_type == CODING_EOL_UNDECIDED)
4441             /* This is the first end-of-line.  */
4442             eol_type = this_eol_type;
4443           else if (eol_type != this_eol_type)
4444             {
4445               /* The found type is different from what found before.  */
4446               eol_type = CODING_EOL_INCONSISTENT;
4447               break;
4448             }
4449         }
4450     }
4451
4452   if (*skip == 0)
4453     *skip = src_end - source;
4454   return eol_type;
4455 }
4456
4457 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4458    is encoded.  If it detects an appropriate format of end-of-line, it
4459    sets the information in *CODING.  */
4460
4461 void
4462 detect_eol (coding, src, src_bytes)
4463      struct coding_system *coding;
4464      const unsigned char *src;
4465      int src_bytes;
4466 {
4467   Lisp_Object val;
4468   int skip;
4469   int eol_type;
4470
4471   switch (coding->category_idx)
4472     {
4473     case CODING_CATEGORY_IDX_UTF_16_BE:
4474       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4475       break;
4476     case CODING_CATEGORY_IDX_UTF_16_LE:
4477       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4478       break;
4479     default:
4480       eol_type = detect_eol_type (src, src_bytes, &skip);
4481       break;
4482     }
4483
4484   if (coding->heading_ascii > skip)
4485     coding->heading_ascii = skip;
4486   else
4487     skip = coding->heading_ascii;
4488
4489   if (eol_type == CODING_EOL_UNDECIDED)
4490     return;
4491   if (eol_type == CODING_EOL_INCONSISTENT)
4492     {
4493 #if 0
4494       /* This code is suppressed until we find a better way to
4495          distinguish raw text file and binary file.  */
4496
4497       /* If we have already detected that the coding is raw-text, the
4498          coding should actually be no-conversion.  */
4499       if (coding->type == coding_type_raw_text)
4500         {
4501           setup_coding_system (Qno_conversion, coding);
4502           return;
4503         }
4504       /* Else, let's decode only text code anyway.  */
4505 #endif /* 0 */
4506       eol_type = CODING_EOL_LF;
4507     }
4508
4509   val = Fget (coding->symbol, Qeol_type);
4510   if (VECTORP (val) && XVECTOR (val)->size == 3)
4511     {
4512       int src_multibyte = coding->src_multibyte;
4513       int dst_multibyte = coding->dst_multibyte;
4514       struct composition_data *cmp_data = coding->cmp_data;
4515
4516       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4517       coding->src_multibyte = src_multibyte;
4518       coding->dst_multibyte = dst_multibyte;
4519       coding->heading_ascii = skip;
4520       coding->cmp_data = cmp_data;
4521     }
4522 }
4523
/* Number of bytes added on top of the scaled source size when a
   conversion buffer size is computed.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source size is multiplied to size a decoding
   buffer for CODING (a pointer to struct coding_system): 3 for
   ISO2022, the decoder's own buf_magnification for CCL, 2 for
   anything else.  CODING may be any pointer-valued expression; it
   is evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                     \
  ((coding)->type == coding_type_iso2022                \
   ? 3                                                  \
   : ((coding)->type == coding_type_ccl                 \
      ? (coding)->spec.ccl.decoder.buf_magnification    \
      : 2))
4532
4533 /* Return maximum size (bytes) of a buffer enough for decoding
4534    SRC_BYTES of text encoded in CODING.  */
4535
4536 int
4537 decoding_buffer_size (coding, src_bytes)
4538      struct coding_system *coding;
4539      int src_bytes;
4540 {
4541   return (src_bytes * DECODING_BUFFER_MAG (coding)
4542           + CONVERSION_BUFFER_EXTRA_ROOM);
4543 }
4544
4545 /* Return maximum size (bytes) of a buffer enough for encoding
4546    SRC_BYTES of text to CODING.  */
4547
4548 int
4549 encoding_buffer_size (coding, src_bytes)
4550      struct coding_system *coding;
4551      int src_bytes;
4552 {
4553   int magnification;
4554
4555   if (coding->type == coding_type_ccl)
4556     {
4557       magnification = coding->spec.ccl.encoder.buf_magnification;
4558       if (coding->eol_type == CODING_EOL_CRLF)
4559         magnification *= 2;
4560     }
4561   else if (CODING_REQUIRE_ENCODING (coding))
4562     magnification = 3;
4563   else
4564     magnification = 1;
4565
4566   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4567 }
4568
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* size of data.  */
  int on_stack;                 /* 1 if allocated by alloca.  */
  unsigned char *data;          /* the memory itself.  */
};

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer,
   given as an lvalue, not as a pointer).

   Requests smaller than MAX_ALLOCA are served by alloca, anything
   else by xmalloc; BUF.on_stack records which.  This has to be a
   macro: memory obtained from alloca belongs to the stack frame of
   the function in which the macro is expanded.

   Both arguments may be arbitrary expressions, but they are
   evaluated more than once, so they must have no side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4592
4593 /* Double the allocated memory for *BUF.  */
4594 static void
4595 extend_conversion_buffer (buf)
4596      struct conversion_buffer *buf;
4597 {
4598   if (buf->on_stack)
4599     {
4600       unsigned char *save = buf->data;
4601       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4602       bcopy (save, buf->data, buf->size);
4603       buf->on_stack = 0;
4604     }
4605   else
4606     {
4607       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4608     }
4609   buf->size *= 2;
4610 }
4611
4612 /* Free the allocated memory for BUF if it is not on stack.  */
4613 static void
4614 free_conversion_buffer (buf)
4615      struct conversion_buffer *buf;
4616 {
4617   if (!buf->on_stack)
4618     xfree (buf->data);
4619 }
4620
4621 int
4622 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4623      struct coding_system *coding;
4624      unsigned char *source, *destination;
4625      int src_bytes, dst_bytes, encodep;
4626 {
4627   struct ccl_program *ccl
4628     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4629   unsigned char *dst = destination;
4630
4631   ccl->suppress_error = coding->suppress_error;
4632   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4633   if (encodep)
4634     {
4635       /* On encoding, EOL format is converted within ccl_driver.  For
4636          that, setup proper information in the structure CCL.  */
4637       ccl->eol_type = coding->eol_type;
4638       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4639         ccl->eol_type = CODING_EOL_LF;
4640       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4641       ccl->eight_bit_control = coding->dst_multibyte;
4642     }
4643   else
4644     ccl->eight_bit_control = 1;
4645   ccl->multibyte = coding->src_multibyte;
4646   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4647     {
4648       /* Move carryover bytes to DESTINATION.  */
4649       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4650       while (*p)
4651         *dst++ = *p++;
4652       coding->spec.ccl.eight_bit_carryover[0] = 0;
4653       if (dst_bytes)
4654         dst_bytes -= dst - destination;
4655     }
4656
4657   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4658                                   &(coding->consumed))
4659                       + dst - destination);
4660
4661   if (encodep)
4662     {
4663       coding->produced_char = coding->produced;
4664       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4665     }
4666   else if (!ccl->eight_bit_control)
4667     {
4668       /* The produced bytes forms a valid multibyte sequence. */
4669       coding->produced_char
4670         = multibyte_chars_in_text (destination, coding->produced);
4671       coding->spec.ccl.eight_bit_carryover[0] = 0;
4672     }
4673   else
4674     {
4675       /* On decoding, the destination should always multibyte.  But,
4676          CCL program might have been generated an invalid multibyte
4677          sequence.  Here we make such a sequence valid as
4678          multibyte.  */
4679       int bytes
4680         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4681
4682       if ((coding->consumed < src_bytes
4683            || !ccl->last_block)
4684           && coding->produced >= 1
4685           && destination[coding->produced - 1] >= 0x80)
4686         {
4687           /* We should not convert the tailing 8-bit codes to
4688              multibyte form even if they doesn't form a valid
4689              multibyte sequence.  They may form a valid sequence in
4690              the next call.  */
4691           int carryover = 0;
4692
4693           if (destination[coding->produced - 1] < 0xA0)
4694             carryover = 1;
4695           else if (coding->produced >= 2)
4696             {
4697               if (destination[coding->produced - 2] >= 0x80)
4698                 {
4699                   if (destination[coding->produced - 2] < 0xA0)
4700                     carryover = 2;
4701                   else if (coding->produced >= 3
4702                            && destination[coding->produced - 3] >= 0x80
4703                            && destination[coding->produced - 3] < 0xA0)
4704                     carryover = 3;
4705                 }
4706             }
4707           if (carryover > 0)
4708             {
4709               BCOPY_SHORT (destination + coding->produced - carryover,
4710                            coding->spec.ccl.eight_bit_carryover,
4711                            carryover);
4712               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4713               coding->produced -= carryover;
4714             }
4715         }
4716       coding->produced = str_as_multibyte (destination, bytes,
4717                                            coding->produced,
4718                                            &(coding->produced_char));
4719     }
4720
4721   switch (ccl->status)
4722     {
4723     case CCL_STAT_SUSPEND_BY_SRC:
4724       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4725       break;
4726     case CCL_STAT_SUSPEND_BY_DST:
4727       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4728       break;
4729     case CCL_STAT_QUIT:
4730     case CCL_STAT_INVALID_CMD:
4731       coding->result = CODING_FINISH_INTERRUPT;
4732       break;
4733     default:
4734       coding->result = CODING_FINISH_NORMAL;
4735       break;
4736     }
4737   return coding->result;
4738 }
4739
4740 /* Decode EOL format of the text at PTR of BYTES length destructively
4741    according to CODING->eol_type.  This is called after the CCL
4742    program produced a decoded text at PTR.  If we do CRLF->LF
4743    conversion, update CODING->produced and CODING->produced_char.  */
4744
4745 static void
4746 decode_eol_post_ccl (coding, ptr, bytes)
4747      struct coding_system *coding;
4748      unsigned char *ptr;
4749      int bytes;
4750 {
4751   Lisp_Object val, saved_coding_symbol;
4752   unsigned char *pend = ptr + bytes;
4753   int dummy;
4754
4755   /* Remember the current coding system symbol.  We set it back when
4756      an inconsistent EOL is found so that `last-coding-system-used' is
4757      set to the coding system that doesn't specify EOL conversion.  */
4758   saved_coding_symbol = coding->symbol;
4759
4760   coding->spec.ccl.cr_carryover = 0;
4761   if (coding->eol_type == CODING_EOL_UNDECIDED)
4762     {
4763       /* Here, to avoid the call of setup_coding_system, we directly
4764          call detect_eol_type.  */
4765       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4766       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4767         coding->eol_type = CODING_EOL_LF;
4768       if (coding->eol_type != CODING_EOL_UNDECIDED)
4769         {
4770           val = Fget (coding->symbol, Qeol_type);
4771           if (VECTORP (val) && XVECTOR (val)->size == 3)
4772             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4773         }
4774       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4775     }
4776
4777   if (coding->eol_type == CODING_EOL_LF
4778       || coding->eol_type == CODING_EOL_UNDECIDED)
4779     {
4780       /* We have nothing to do.  */
4781       ptr = pend;
4782     }
4783   else if (coding->eol_type == CODING_EOL_CRLF)
4784     {
4785       unsigned char *pstart = ptr, *p = ptr;
4786
4787       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4788           && *(pend - 1) == '\r')
4789         {
4790           /* If the last character is CR, we can't handle it here
4791              because LF will be in the not-yet-decoded source text.
4792              Record that the CR is not yet processed.  */
4793           coding->spec.ccl.cr_carryover = 1;
4794           coding->produced--;
4795           coding->produced_char--;
4796           pend--;
4797         }
4798       while (ptr < pend)
4799         {
4800           if (*ptr == '\r')
4801             {
4802               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4803                 {
4804                   *p++ = '\n';
4805                   ptr += 2;
4806                 }
4807               else
4808                 {
4809                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4810                     goto undo_eol_conversion;
4811                   *p++ = *ptr++;
4812                 }
4813             }
4814           else if (*ptr == '\n'
4815                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4816             goto undo_eol_conversion;
4817           else
4818             *p++ = *ptr++;
4819           continue;
4820
4821         undo_eol_conversion:
4822           /* We have faced with inconsistent EOL format at PTR.
4823              Convert all LFs before PTR back to CRLFs.  */
4824           for (p--, ptr--; p >= pstart; p--)
4825             {
4826               if (*p == '\n')
4827                 *ptr-- = '\n', *ptr-- = '\r';
4828               else
4829                 *ptr-- = *p;
4830             }
4831           /*  If carryover is recorded, cancel it because we don't
4832               convert CRLF anymore.  */
4833           if (coding->spec.ccl.cr_carryover)
4834             {
4835               coding->spec.ccl.cr_carryover = 0;
4836               coding->produced++;
4837               coding->produced_char++;
4838               pend++;
4839             }
4840           p = ptr = pend;
4841           coding->eol_type = CODING_EOL_LF;
4842           coding->symbol = saved_coding_symbol;
4843         }
4844       if (p < pend)
4845         {
4846           /* As each two-byte sequence CRLF was converted to LF, (PEND
4847              - P) is the number of deleted characters.  */
4848           coding->produced -= pend - p;
4849           coding->produced_char -= pend - p;
4850         }
4851     }
4852   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4853     {
4854       unsigned char *p = ptr;
4855
4856       for (; ptr < pend; ptr++)
4857         {
4858           if (*ptr == '\r')
4859             *ptr = '\n';
4860           else if (*ptr == '\n'
4861                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4862             {
4863               for (; p < ptr; p++)
4864                 {
4865                   if (*p == '\n')
4866                     *p = '\r';
4867                 }
4868               ptr = pend;
4869               coding->eol_type = CODING_EOL_LF;
4870               coding->symbol = saved_coding_symbol;
4871             }
4872         }
4873     }
4874 }
4875
4876 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4877    decoding, it may detect coding system and format of end-of-line if
4878    those are not yet decided.  The source should be unibyte, the
4879    result is multibyte if CODING->dst_multibyte is nonzero, else
4880    unibyte.  */
4881
4882 int
4883 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4884      struct coding_system *coding;
4885      const unsigned char *source;
4886      unsigned char *destination;
4887      int src_bytes, dst_bytes;
4888 {
4889   int extra = 0;
4890
4891   if (coding->type == coding_type_undecided)
4892     detect_coding (coding, source, src_bytes);
4893
4894   if (coding->eol_type == CODING_EOL_UNDECIDED
4895       && coding->type != coding_type_ccl)
4896     {
4897       detect_eol (coding, source, src_bytes);
4898       /* We had better recover the original eol format if we
4899          encounter an inconsistent eol format while decoding.  */
4900       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4901     }
4902
4903   coding->produced = coding->produced_char = 0;
4904   coding->consumed = coding->consumed_char = 0;
4905   coding->errors = 0;
4906   coding->result = CODING_FINISH_NORMAL;
4907
4908   switch (coding->type)
4909     {
4910     case coding_type_sjis:
4911       decode_coding_sjis_big5 (coding, source, destination,
4912                                src_bytes, dst_bytes, 1);
4913       break;
4914
4915     case coding_type_iso2022:
4916       decode_coding_iso2022 (coding, source, destination,
4917                              src_bytes, dst_bytes);
4918       break;
4919
4920     case coding_type_big5:
4921       decode_coding_sjis_big5 (coding, source, destination,
4922                                src_bytes, dst_bytes, 0);
4923       break;
4924
4925     case coding_type_emacs_mule:
4926       decode_coding_emacs_mule (coding, source, destination,
4927                                 src_bytes, dst_bytes);
4928       break;
4929
4930     case coding_type_ccl:
4931       if (coding->spec.ccl.cr_carryover)
4932         {
4933           /* Put the CR which was not processed by the previous call
4934              of decode_eol_post_ccl in DESTINATION.  It will be
4935              decoded together with the following LF by the call to
4936              decode_eol_post_ccl below.  */
4937           *destination = '\r';
4938           coding->produced++;
4939           coding->produced_char++;
4940           dst_bytes--;
4941           extra = coding->spec.ccl.cr_carryover;
4942         }
4943       ccl_coding_driver (coding, source, destination + extra,
4944                          src_bytes, dst_bytes, 0);
4945       if (coding->eol_type != CODING_EOL_LF)
4946         {
4947           coding->produced += extra;
4948           coding->produced_char += extra;
4949           decode_eol_post_ccl (coding, destination, coding->produced);
4950         }
4951       break;
4952
4953     default:
4954       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4955     }
4956
4957   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4958       && coding->mode & CODING_MODE_LAST_BLOCK
4959       && coding->consumed == src_bytes)
4960     coding->result = CODING_FINISH_NORMAL;
4961
4962   if (coding->mode & CODING_MODE_LAST_BLOCK
4963       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4964     {
4965       const unsigned char *src = source + coding->consumed;
4966       unsigned char *dst = destination + coding->produced;
4967
4968       src_bytes -= coding->consumed;
4969       coding->errors++;
4970       if (COMPOSING_P (coding))
4971         DECODE_COMPOSITION_END ('1');
4972       while (src_bytes--)
4973         {
4974           int c = *src++;
4975           dst += CHAR_STRING (c, dst);
4976           coding->produced_char++;
4977         }
4978       coding->consumed = coding->consumed_char = src - source;
4979       coding->produced = dst - destination;
4980       coding->result = CODING_FINISH_NORMAL;
4981     }
4982
4983   if (!coding->dst_multibyte)
4984     {
4985       coding->produced = str_as_unibyte (destination, coding->produced);
4986       coding->produced_char = coding->produced;
4987     }
4988
4989   return coding->result;
4990 }
4991
4992 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4993    multibyteness of the source is CODING->src_multibyte, the
4994    multibyteness of the result is always unibyte.  */
4995
4996 int
4997 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4998      struct coding_system *coding;
4999      const unsigned char *source;
5000      unsigned char *destination;
5001      int src_bytes, dst_bytes;
5002 {
5003   coding->produced = coding->produced_char = 0;
5004   coding->consumed = coding->consumed_char = 0;
5005   coding->errors = 0;
5006   coding->result = CODING_FINISH_NORMAL;
5007   if (coding->eol_type == CODING_EOL_UNDECIDED)
5008     coding->eol_type = CODING_EOL_LF;
5009
5010   switch (coding->type)
5011     {
5012     case coding_type_sjis:
5013       encode_coding_sjis_big5 (coding, source, destination,
5014                                src_bytes, dst_bytes, 1);
5015       break;
5016
5017     case coding_type_iso2022:
5018       encode_coding_iso2022 (coding, source, destination,
5019                              src_bytes, dst_bytes);
5020       break;
5021
5022     case coding_type_big5:
5023       encode_coding_sjis_big5 (coding, source, destination,
5024                                src_bytes, dst_bytes, 0);
5025       break;
5026
5027     case coding_type_emacs_mule:
5028       encode_coding_emacs_mule (coding, source, destination,
5029                                 src_bytes, dst_bytes);
5030       break;
5031
5032     case coding_type_ccl:
5033       ccl_coding_driver (coding, source, destination,
5034                          src_bytes, dst_bytes, 1);
5035       break;
5036
5037     default:
5038       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5039     }
5040
5041   if (coding->mode & CODING_MODE_LAST_BLOCK
5042       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5043     {
5044       const unsigned char *src = source + coding->consumed;
5045       unsigned char *dst = destination + coding->produced;
5046
5047       if (coding->type == coding_type_iso2022)
5048         ENCODE_RESET_PLANE_AND_REGISTER;
5049       if (COMPOSING_P (coding))
5050         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5051       if (coding->consumed < src_bytes)
5052         {
5053           int len = src_bytes - coding->consumed;
5054
5055           BCOPY_SHORT (src, dst, len);
5056           if (coding->src_multibyte)
5057             len = str_as_unibyte (dst, len);
5058           dst += len;
5059           coding->consumed = src_bytes;
5060         }
5061       coding->produced = coding->produced_char = dst - destination;
5062       coding->result = CODING_FINISH_NORMAL;
5063     }
5064
5065   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5066       && coding->consumed == src_bytes)
5067     coding->result = CODING_FINISH_NORMAL;
5068
5069   return coding->result;
5070 }
5071
5072 /* Scan text in the region between *BEG and *END (byte positions),
5073    skip characters which we don't have to decode by coding system
5074    CODING at the head and tail, then set *BEG and *END to the region
5075    of the text we actually have to convert.  The caller should move
5076    the gap out of the region in advance if the region is from a
5077    buffer.
5078
5079    If STR is not NULL, *BEG and *END are indices into STR.  */
5080
5081 static void
5082 shrink_decoding_region (beg, end, coding, str)
5083      int *beg, *end;
5084      struct coding_system *coding;
5085      unsigned char *str;
5086 {
5087   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5088   int eol_conversion;
5089   Lisp_Object translation_table;
5090
5091   if (coding->type == coding_type_ccl
5092       || coding->type == coding_type_undecided
5093       || coding->eol_type != CODING_EOL_LF
5094       || !NILP (coding->post_read_conversion)
5095       || coding->composing != COMPOSITION_DISABLED)
5096     {
5097       /* We can't skip any data.  */
5098       return;
5099     }
5100   if (coding->type == coding_type_no_conversion
5101       || coding->type == coding_type_raw_text
5102       || coding->type == coding_type_emacs_mule)
5103     {
5104       /* We need no conversion, but don't have to skip any data here.
5105          Decoding routine handles them effectively anyway.  */
5106       return;
5107     }
5108
5109   translation_table = coding->translation_table_for_decode;
5110   if (NILP (translation_table) && !NILP (Venable_character_translation))
5111     translation_table = Vstandard_translation_table_for_decode;
5112   if (CHAR_TABLE_P (translation_table))
5113     {
5114       int i;
5115       for (i = 0; i < 128; i++)
5116         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5117           break;
5118       if (i < 128)
5119         /* Some ASCII character should be translated.  We give up
5120            shrinking.  */
5121         return;
5122     }
5123
5124   if (coding->heading_ascii >= 0)
5125     /* Detection routine has already found how much we can skip at the
5126        head.  */
5127     *beg += coding->heading_ascii;
5128
5129   if (str)
5130     {
5131       begp_orig = begp = str + *beg;
5132       endp_orig = endp = str + *end;
5133     }
5134   else
5135     {
5136       begp_orig = begp = BYTE_POS_ADDR (*beg);
5137       endp_orig = endp = begp + *end - *beg;
5138     }
5139
5140   eol_conversion = (coding->eol_type == CODING_EOL_CR
5141                     || coding->eol_type == CODING_EOL_CRLF);
5142
5143   switch (coding->type)
5144     {
5145     case coding_type_sjis:
5146     case coding_type_big5:
5147       /* We can skip all ASCII characters at the head.  */
5148       if (coding->heading_ascii < 0)
5149         {
5150           if (eol_conversion)
5151             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5152           else
5153             while (begp < endp && *begp < 0x80) begp++;
5154         }
5155       /* We can skip all ASCII characters at the tail except for the
5156          second byte of SJIS or BIG5 code.  */
5157       if (eol_conversion)
5158         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5159       else
5160         while (begp < endp && endp[-1] < 0x80) endp--;
5161       /* Do not consider LF as ascii if preceded by CR, since that
5162          confuses eol decoding. */
5163       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5164         endp++;
5165       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5166         endp++;
5167       break;
5168
5169     case coding_type_iso2022:
5170       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5171         /* We can't skip any data.  */
5172         break;
5173       if (coding->heading_ascii < 0)
5174         {
5175           /* We can skip all ASCII characters at the head except for a
5176              few control codes.  */
5177           while (begp < endp && (c = *begp) < 0x80
5178                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5179                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5180                  && (!eol_conversion || c != ISO_CODE_LF))
5181             begp++;
5182         }
5183       switch (coding->category_idx)
5184         {
5185         case CODING_CATEGORY_IDX_ISO_8_1:
5186         case CODING_CATEGORY_IDX_ISO_8_2:
5187           /* We can skip all ASCII characters at the tail.  */
5188           if (eol_conversion)
5189             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5190           else
5191             while (begp < endp && endp[-1] < 0x80) endp--;
5192           /* Do not consider LF as ascii if preceded by CR, since that
5193              confuses eol decoding. */
5194           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5195             endp++;
5196           break;
5197
5198         case CODING_CATEGORY_IDX_ISO_7:
5199         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5200           {
5201             /* We can skip all characters at the tail except for 8-bit
5202                codes and ESC and the following 2-byte at the tail.  */
5203             unsigned char *eight_bit = NULL;
5204
5205             if (eol_conversion)
5206               while (begp < endp
5207                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5208                 {
5209                   if (!eight_bit && c & 0x80) eight_bit = endp;
5210                   endp--;
5211                 }
5212             else
5213               while (begp < endp
5214                      && (c = endp[-1]) != ISO_CODE_ESC)
5215                 {
5216                   if (!eight_bit && c & 0x80) eight_bit = endp;
5217                   endp--;
5218                 }
5219             /* Do not consider LF as ascii if preceded by CR, since that
5220                confuses eol decoding. */
5221             if (begp < endp && endp < endp_orig
5222                 && endp[-1] == '\r' && endp[0] == '\n')
5223               endp++;
5224             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5225               {
5226                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5227                   /* This is an ASCII designation sequence.  We can
5228                      surely skip the tail.  But, if we have
5229                      encountered an 8-bit code, skip only the codes
5230                      after that.  */
5231                   endp = eight_bit ? eight_bit : endp + 2;
5232                 else
5233                   /* Hmmm, we can't skip the tail.  */
5234                   endp = endp_orig;
5235               }
5236             else if (eight_bit)
5237               endp = eight_bit;
5238           }
5239         }
5240       break;
5241
5242     default:
5243       abort ();
5244     }
5245   *beg += begp - begp_orig;
5246   *end += endp - endp_orig;
5247   return;
5248 }
5249
5250 /* Like shrink_decoding_region but for encoding.  */
5251
5252 static void
5253 shrink_encoding_region (beg, end, coding, str)
5254      int *beg, *end;
5255      struct coding_system *coding;
5256      unsigned char *str;
5257 {
5258   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5259   int eol_conversion;
5260   Lisp_Object translation_table;
5261
5262   if (coding->type == coding_type_ccl
5263       || coding->eol_type == CODING_EOL_CRLF
5264       || coding->eol_type == CODING_EOL_CR
5265       || (coding->cmp_data && coding->cmp_data->used > 0))
5266     {
5267       /* We can't skip any data.  */
5268       return;
5269     }
5270   if (coding->type == coding_type_no_conversion
5271       || coding->type == coding_type_raw_text
5272       || coding->type == coding_type_emacs_mule
5273       || coding->type == coding_type_undecided)
5274     {
5275       /* We need no conversion, but don't have to skip any data here.
5276          Encoding routine handles them effectively anyway.  */
5277       return;
5278     }
5279
5280   translation_table = coding->translation_table_for_encode;
5281   if (NILP (translation_table) && !NILP (Venable_character_translation))
5282     translation_table = Vstandard_translation_table_for_encode;
5283   if (CHAR_TABLE_P (translation_table))
5284     {
5285       int i;
5286       for (i = 0; i < 128; i++)
5287         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5288           break;
5289       if (i < 128)
5290         /* Some ASCII character should be translated.  We give up
5291            shrinking.  */
5292         return;
5293     }
5294
5295   if (str)
5296     {
5297       begp_orig = begp = str + *beg;
5298       endp_orig = endp = str + *end;
5299     }
5300   else
5301     {
5302       begp_orig = begp = BYTE_POS_ADDR (*beg);
5303       endp_orig = endp = begp + *end - *beg;
5304     }
5305
5306   eol_conversion = (coding->eol_type == CODING_EOL_CR
5307                     || coding->eol_type == CODING_EOL_CRLF);
5308
5309   /* Here, we don't have to check coding->pre_write_conversion because
5310      the caller is expected to have handled it already.  */
5311   switch (coding->type)
5312     {
5313     case coding_type_iso2022:
5314       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5315         /* We can't skip any data.  */
5316         break;
5317       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5318         {
5319           unsigned char *bol = begp;
5320           while (begp < endp && *begp < 0x80)
5321             {
5322               begp++;
5323               if (begp[-1] == '\n')
5324                 bol = begp;
5325             }
5326           begp = bol;
5327           goto label_skip_tail;
5328         }
5329       /* fall down ... */
5330
5331     case coding_type_sjis:
5332     case coding_type_big5:
5333       /* We can skip all ASCII characters at the head and tail.  */
5334       if (eol_conversion)
5335         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5336       else
5337         while (begp < endp && *begp < 0x80) begp++;
5338     label_skip_tail:
5339       if (eol_conversion)
5340         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5341       else
5342         while (begp < endp && *(endp - 1) < 0x80) endp--;
5343       break;
5344
5345     default:
5346       abort ();
5347     }
5348
5349   *beg += begp - begp_orig;
5350   *end += endp - endp_orig;
5351   return;
5352 }
5353
/* Shrinking a conversion region has a cost of its own, so regions
   whose byte length does not exceed this value are left as they
   are.  */
static int shrink_conversion_region_threshhold = 1024;

/* If the byte range *BEG..*END is longer than the threshold above,
   shrink it with shrink_encoding_region (ENCODEP nonzero) or
   shrink_decoding_region (ENCODEP zero).  BEG, END, CODING and STR
   are handed to the chosen function unchanged.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
5367
5368 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5369    Vlast_coding_system_used and the remaining elements are buffers to
5370    kill.  */
5371 static Lisp_Object
5372 code_convert_region_unwind (arg)
5373      Lisp_Object arg;
5374 {
5375   struct gcpro gcpro1;
5376   GCPRO1 (arg);
5377
5378   inhibit_pre_post_conversion = 0;
5379   Vlast_coding_system_used = XCAR (arg);
5380   for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5381     Fkill_buffer (XCAR (arg));
5382
5383   UNGCPRO;
5384   return Qnil;
5385 }
5386
5387 /* Store information about all compositions in the range FROM and TO
5388    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5389    buffer or a string, defaults to the current buffer.  */
5390
5391 void
5392 coding_save_composition (coding, from, to, obj)
5393      struct coding_system *coding;
5394      int from, to;
5395      Lisp_Object obj;
5396 {
5397   Lisp_Object prop;
5398   int start, end;
5399
5400   if (coding->composing == COMPOSITION_DISABLED)
5401     return;
5402   if (!coding->cmp_data)
5403     coding_allocate_composition_data (coding, from);
5404   if (!find_composition (from, to, &start, &end, &prop, obj)
5405       || end > to)
5406     return;
5407   if (start < from
5408       && (!find_composition (end, to, &start, &end, &prop, obj)
5409           || end > to))
5410     return;
5411   coding->composing = COMPOSITION_NO;
5412   do
5413     {
5414       if (COMPOSITION_VALID_P (start, end, prop))
5415         {
5416           enum composition_method method = COMPOSITION_METHOD (prop);
5417           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5418               >= COMPOSITION_DATA_SIZE)
5419             coding_allocate_composition_data (coding, from);
5420           /* For relative composition, we remember start and end
5421              positions, for the other compositions, we also remember
5422              components.  */
5423           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5424           if (method != COMPOSITION_RELATIVE)
5425             {
5426               /* We must store a*/
5427               Lisp_Object val, ch;
5428
5429               val = COMPOSITION_COMPONENTS (prop);
5430               if (CONSP (val))
5431                 while (CONSP (val))
5432                   {
5433                     ch = XCAR (val), val = XCDR (val);
5434                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5435                   }
5436               else if (VECTORP (val) || STRINGP (val))
5437                 {
5438                   int len = (VECTORP (val)
5439                              ? XVECTOR (val)->size : SCHARS (val));
5440                   int i;
5441                   for (i = 0; i < len; i++)
5442                     {
5443                       ch = (STRINGP (val)
5444                             ? Faref (val, make_number (i))
5445                             : XVECTOR (val)->contents[i]);
5446                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5447                     }
5448                 }
5449               else              /* INTEGERP (val) */
5450                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5451             }
5452           CODING_ADD_COMPOSITION_END (coding, end - from);
5453         }
5454       start = end;
5455     }
5456   while (start < to
5457          && find_composition (start, to, &start, &end, &prop, obj)
5458          && end <= to);
5459
5460   /* Make coding->cmp_data point to the first memory block.  */
5461   while (coding->cmp_data->prev)
5462     coding->cmp_data = coding->cmp_data->prev;
5463   coding->cmp_data_start = 0;
5464 }
5465
5466 /* Reflect the saved information about compositions to OBJ.
5467    CODING->cmp_data points to a memory block for the information.  OBJ
5468    is a buffer or a string, defaults to the current buffer.  */
5469
5470 void
5471 coding_restore_composition (coding, obj)
5472      struct coding_system *coding;
5473      Lisp_Object obj;
5474 {
5475   struct composition_data *cmp_data = coding->cmp_data;
5476
5477   if (!cmp_data)
5478     return;
5479
5480   while (cmp_data->prev)
5481     cmp_data = cmp_data->prev;
5482
5483   while (cmp_data)
5484     {
5485       int i;
5486
5487       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5488            i += cmp_data->data[i])
5489         {
5490           int *data = cmp_data->data + i;
5491           enum composition_method method = (enum composition_method) data[3];
5492           Lisp_Object components;
5493
5494           if (data[0] < 0 || i + data[0] > cmp_data->used)
5495             /* Invalid composition data.  */
5496             break;
5497
5498           if (method == COMPOSITION_RELATIVE)
5499             components = Qnil;
5500           else
5501             {
5502               int len = data[0] - 4, j;
5503               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5504
5505               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5506                   && len % 2 == 0)
5507                 len --;
5508               if (len < 1)
5509                 /* Invalid composition data.  */
5510                 break;
5511               for (j = 0; j < len; j++)
5512                 args[j] = make_number (data[4 + j]);
5513               components = (method == COMPOSITION_WITH_ALTCHARS
5514                             ? Fstring (len, args)
5515                             : Fvector (len, args));
5516             }
5517           compose_text (data[1], data[2], components, Qnil, obj);
5518         }
5519       cmp_data = cmp_data->next;
5520     }
5521 }
5522
5523 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5524    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5525    coding system CODING, and return the status code of code conversion
5526    (currently, this value has no meaning; every return below yields 0).
5527
5528    How many characters (and bytes) are converted to how many
5529    characters (and bytes) are recorded in members of the structure
5530    CODING (consumed, consumed_char, produced and produced_char on the
        normal exit; only produced and produced_char on the early exit
        taken when shrinking leaves nothing to convert).
5531
        The conversion is done in place in the current buffer: the source
        text is temporarily accounted as part of the gap and the converted
        text is written at the head of the gap.

5532    If REPLACE is nonzero, we do various things as if the original text
5533    is deleted and a new text is inserted.  See the comments in
5534    replace_range (insdel.c) to know what we are doing.
5535
5536    If REPLACE is zero, it is assumed that the source text is unibyte.
5537    Otherwise, it is assumed that the source text is multibyte.  */
5538
5539 int
5540 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5541      int from, from_byte, to, to_byte, encodep, replace;
5542      struct coding_system *coding;
5543 {
5544   int len = to - from, len_byte = to_byte - from_byte;
5545   int nchars_del = 0, nbytes_del = 0;
5546   int require, inserted, inserted_byte;
5547   int head_skip, tail_skip, total_skip = 0;
5548   Lisp_Object saved_coding_symbol;
5549   int first = 1;
5550   unsigned char *src, *dst;
5551   Lisp_Object deletion;
5552   int orig_point = PT, orig_len = len;
5553   int prev_Z;
5554   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5555
5556   deletion = Qnil;
5557   saved_coding_symbol = coding->symbol;
5558
       /* If point is strictly inside the region, move it to FROM while
          we work.  */
5559   if (from < PT && PT < to)
5560     {
5561       TEMP_SET_PT_BOTH (from, from_byte);
5562       orig_point = from;
5563     }
5564
5565   if (replace)
5566     {
5567       int saved_from = from;
5568       int saved_inhibit_modification_hooks;
5569
5570       prepare_to_modify_buffer (from, to, &from);
           /* prepare_to_modify_buffer may have updated FROM; if so,
              recompute the other positions from it.  */
5571       if (saved_from != from)
5572         {
5573           to = from + len;
5574           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5575           len_byte = to_byte - from_byte;
5576         }
5577
5578       /* The code conversion routine can not preserve text properties
5579          for now.  So, we must remove all text properties in the
5580          region.  Here, we must suppress all modification hooks.  */
5581       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5582       inhibit_modification_hooks = 1;
5583       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5584       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5585     }
5586
5587   coding->heading_ascii = 0;
5588
5589   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5590     {
5591       /* We must detect encoding of text and eol format.  */
5592
5593       if (from < GPT && to > GPT)
5594         move_gap_both (from, from_byte);
5595       if (coding->type == coding_type_undecided)
5596         {
5597           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5598           if (coding->type == coding_type_undecided)
5599             {
5600               /* It seems that the text contains only ASCII, but we
5601                  should not leave it undecided because the deeper
5602                  decoding routine (decode_coding) tries to detect the
5603                  encodings again in vain.  */
5604               coding->type = coding_type_emacs_mule;
5605               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5606               /* As emacs-mule decoder will handle composition, we
5607                  need this setting to allocate coding->cmp_data
5608                  later.  */
5609               coding->composing = COMPOSITION_NO;
5610             }
5611         }
5612       if (coding->eol_type == CODING_EOL_UNDECIDED
5613           && coding->type != coding_type_ccl)
5614         {
5615           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5616           if (coding->eol_type == CODING_EOL_UNDECIDED)
5617             coding->eol_type = CODING_EOL_LF;
5618           /* We had better recover the original eol format if we
5619              encounter an inconsistent eol format while decoding.  */
5620           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5621         }
5622     }
5623
5624   /* Now we convert the text.  */
5625
5626   /* For encoding, we must process pre-write-conversion in advance.  */
5627   if (! inhibit_pre_post_conversion
5628       && encodep
5629       && SYMBOLP (coding->pre_write_conversion)
5630       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5631     {
5632       /* The function in pre-write-conversion may put a new text in a
5633          new buffer.  */
5634       struct buffer *prev = current_buffer;
5635       Lisp_Object new;
5636
5637       record_unwind_protect (code_convert_region_unwind,
5638                              Fcons (Vlast_coding_system_used, Qnil));
5639       /* We should not call any more pre-write/post-read-conversion
5640          functions while this pre-write-conversion is running.  */
5641       inhibit_pre_post_conversion = 1;
5642       call2 (coding->pre_write_conversion,
5643              make_number (from), make_number (to));
5644       inhibit_pre_post_conversion = 0;
5645       /* Discard the unwind protect.  */
5646       specpdl_ptr--;
5647
5648       if (current_buffer != prev)
5649         {
               /* The function left us in another buffer.  Replace the
                  region in the original buffer with the accessible
                  text of that buffer, kill it, and recompute all the
                  positions.  */
5650           len = ZV - BEGV;
5651           new = Fcurrent_buffer ();
5652           set_buffer_internal_1 (prev);
5653           del_range_2 (from, from_byte, to, to_byte, 0);
5654           TEMP_SET_PT_BOTH (from, from_byte);
5655           insert_from_buffer (XBUFFER (new), 1, len, 0);
5656           Fkill_buffer (new);
5657           if (orig_point >= to)
5658             orig_point += len - orig_len;
5659           else if (orig_point > from)
5660             orig_point = from;
5661           orig_len = len;
5662           to = from + len;
5663           from_byte = CHAR_TO_BYTE (from);
5664           to_byte = CHAR_TO_BYTE (to);
5665           len_byte = to_byte - from_byte;
5666           TEMP_SET_PT_BOTH (from, from_byte);
5667         }
5668     }
5669
5670   if (replace)
5671     {
           /* Remember what is being replaced: the text itself when undo
              is recorded (undo_list is not t), otherwise only its size.  */
5672       if (! EQ (current_buffer->undo_list, Qt))
5673         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5674       else
5675         {
5676           nchars_del = to - from;
5677           nbytes_del = to_byte - from_byte;
5678         }
5679     }
5680
5681   if (coding->composing != COMPOSITION_DISABLED)
5682     {
5683       if (encodep)
5684         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5685       else
5686         coding_allocate_composition_data (coding, from);
5687     }
5688
5689   /* Try to skip the leading and trailing ASCII bytes.  We can't skip
5690      them if we must run CCL program or there are compositions to
5691      encode.  */
5692   if (coding->type != coding_type_ccl
5693       && (! coding->cmp_data || coding->cmp_data->used == 0))
5694     {
5695       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5696
5697       if (from < GPT && GPT < to)
5698         move_gap_both (from, from_byte);
5699       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
           /* If shrinking left nothing to convert, and neither a
              post-read-conversion nor flushing is pending, the text is
              already in its final form.  */
5700       if (from_byte == to_byte
5701           && (encodep || NILP (coding->post_read_conversion))
5702           && ! CODING_REQUIRE_FLUSHING (coding))
5703         {
5704           coding->produced = len_byte;
5705           coding->produced_char = len;
5706           if (!replace)
5707             /* We must record and adjust for this new text now.  */
5708             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5709           coding_free_composition_data (coding);
5710           return 0;
5711         }
5712
           /* Narrow the conversion to the shrunken region.  The skipped
              head and tail are added back after the conversion loop.
              NOTE(review): the byte counts are applied to the character
              positions FROM, TO and LEN as well; presumably valid
              because only ASCII bytes are skipped -- confirm against
              the shrink functions.  */
5713       head_skip = from_byte - from_byte_orig;
5714       tail_skip = to_byte_orig - to_byte;
5715       total_skip = head_skip + tail_skip;
5716       from += head_skip;
5717       to -= tail_skip;
5718       len -= total_skip; len_byte -= total_skip;
5719     }
5720
5721   /* For conversion, we must put the gap before the text in addition to
5722      making the gap larger for efficient decoding.  The required gap
5723      size starts from 2000 which is the magic number used in make_gap.
5724      But, after one batch of conversion, it will be incremented if we
5725      find that it is not enough.  */
5726   require = 2000;
5727
5728   if (GAP_SIZE  < require)
5729     make_gap (require - GAP_SIZE);
5730   move_gap_both (from, from_byte);
5731
5732   inserted = inserted_byte = 0;
5733
       /* Account the LEN_BYTE source bytes that follow the gap as part
          of the gap, so that the buffer text temporarily excludes the
          region being converted.  */
5734   GAP_SIZE += len_byte;
5735   ZV -= len;
5736   Z -= len;
5737   ZV_BYTE -= len_byte;
5738   Z_BYTE -= len_byte;
5739
5740   if (GPT - BEG < BEG_UNCHANGED)
5741     BEG_UNCHANGED = GPT - BEG;
5742   if (Z - GPT < END_UNCHANGED)
5743     END_UNCHANGED = Z - GPT;
5744
5745   if (!encodep && coding->src_multibyte)
5746     {
5747       /* Decoding routines expects that the source text is unibyte.
5748          We must convert 8-bit characters of multibyte form to
5749          unibyte.  */
5750       int len_byte_orig = len_byte;
5751       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
           /* If the text got shorter, move it so that it still ends at
              the end of the gap.  */
5752       if (len_byte < len_byte_orig)
5753         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5754                     len_byte);
5755       coding->src_multibyte = 0;
5756     }
5757
5758   for (;;)
5759     {
5760       int result;
5761
5762       /* The buffer memory is now:
5763          +--------+converted-text+---------+-------original-text-------+---+
5764          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5765                   |<---------------------- GAP ----------------------->|  */
5766       src = GAP_END_ADDR - len_byte;
5767       dst = GPT_ADDR + inserted_byte;
5768
5769       if (encodep)
5770         result = encode_coding (coding, src, dst, len_byte, 0);
5771       else
5772         {
5773           if (coding->composing != COMPOSITION_DISABLED)
5774             coding->cmp_data->char_offset = from + inserted;
5775           result = decode_coding (coding, src, dst, len_byte, 0);
5776         }
5777
5778       /* The buffer memory is now:
5779          +--------+-------converted-text----+--+------original-text----+---+
5780          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5781                   |<---------------------- GAP ----------------------->|  */
5782
5783       inserted += coding->produced_char;
5784       inserted_byte += coding->produced;
5785       len_byte -= coding->consumed;
5786
5787       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5788         {
               /* Allocate more composition data and convert the rest.  */
5789           coding_allocate_composition_data (coding, from + inserted);
5790           continue;
5791         }
5792
5793       src += coding->consumed;
5794       dst += coding->produced;
5795
5796       if (result == CODING_FINISH_NORMAL)
5797         {
5798           src += len_byte;
5799           break;
5800         }
5801       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5802         {
5803           unsigned char *pend = dst, *p = pend - inserted_byte;
5804           Lisp_Object eol_type;
5805
5806           /* Encode LFs back to the original eol format (CR or CRLF).  */
5807           if (coding->eol_type == CODING_EOL_CR)
5808             {
5809               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5810             }
5811           else
5812             {
5813               int count = 0;
5814
                   /* Each LF grows by one byte (the CR put before it).  */
5815               while (p < pend) if (*p++ == '\n') count++;
5816               if (src - dst < count)
5817                 {
5818                   /* We don't have sufficient room for encoding LFs
5819                      back to CRLF.  We must record converted and
5820                      not-yet-converted text back to the buffer
5821                      content, enlarge the gap, then record them out of
5822                      the buffer contents again.  */
5823                   int add = len_byte + inserted_byte;
5824
5825                   GAP_SIZE -= add;
5826                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5827                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5828                   make_gap (count - GAP_SIZE);
5829                   GAP_SIZE += add;
5830                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5831                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5832                   /* Don't forget to update SRC, DST, and PEND.  */
5833                   src = GAP_END_ADDR - len_byte;
5834                   dst = GPT_ADDR + inserted_byte;
5835                   pend = dst;
5836                 }
5837               inserted += count;
5838               inserted_byte += count;
5839               coding->produced += count;
5840               p = dst = pend + count;
                   /* Copy backward from the end, putting a CR before
                      each LF, until all COUNT CRs have been placed.  */
5841               while (count)
5842                 {
5843                   *--p = *--pend;
5844                   if (*p == '\n') count--, *--p = '\r';
5845                 }
5846             }
5847
5848           /* Suppress eol-format conversion in the further conversion.  */
5849           coding->eol_type = CODING_EOL_LF;
5850
5851           /* Set the coding system symbol to that for Unix-like EOL.  */
5852           eol_type = Fget (saved_coding_symbol, Qeol_type);
5853           if (VECTORP (eol_type)
5854               && XVECTOR (eol_type)->size == 3
5855               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5856             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5857           else
5858             coding->symbol = saved_coding_symbol;
5859
5860           continue;
5861         }
5862       if (len_byte <= 0)
5863         {
               /* All source bytes are consumed.  Unless CODING is CCL
                  based and the last-block flag is not yet set, we are
                  done; otherwise set the flag and run once more.  */
5864           if (coding->type != coding_type_ccl
5865               || coding->mode & CODING_MODE_LAST_BLOCK)
5866             break;
5867           coding->mode |= CODING_MODE_LAST_BLOCK;
5868           continue;
5869         }
5870       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5871         {
5872           /* The source text ends in invalid codes.  Let's just
5873              make them valid buffer contents, and finish conversion.  */
5874           if (multibyte_p)
5875             {
5876               unsigned char *start = dst;
5877
5878               inserted += len_byte;
5879               while (len_byte--)
5880                 {
5881                   int c = *src++;
5882                   dst += CHAR_STRING (c, dst);
5883                 }
5884
5885               inserted_byte += dst - start;
5886             }
5887           else
5888             {
5889               inserted += len_byte;
5890               inserted_byte += len_byte;
5891               while (len_byte--)
5892                 *dst++ = *src++;
5893             }
5894           break;
5895         }
5896       if (result == CODING_FINISH_INTERRUPT)
5897         {
5898           /* The conversion procedure was interrupted by a user.  */
5899           break;
5900         }
5901       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5902       if (coding->consumed < 1)
5903         {
5904           /* It's quite strange to require more memory without
5905              consuming any bytes.  Perhaps CCL program bug.  */
5906           break;
5907         }
5908       if (first)
5909         {
5910           /* We have just done the first batch of conversion which was
5911              stopped because of insufficient gap.  Let's reconsider the
5912              required gap size (i.e. SRC - DST) now.
5913
5914              We have converted ORIG bytes (== coding->consumed) into
5915              NEW bytes (coding->produced).  To convert the remaining
5916              LEN bytes, we may need REQUIRE bytes of gap, where:
5917                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5918                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5919              Here, we are sure that NEW >= ORIG.  */
5920
5921           if (coding->produced <= coding->consumed)
5922             {
5923               /* This happens because of CCL-based coding system with
5924                  eol-type CRLF.  */
5925               require = 0;
5926             }
5927           else
5928             {
5929               float ratio = coding->produced - coding->consumed;
5930               ratio /= coding->consumed;
5931               require = len_byte * ratio;
5932             }
5933           first = 0;
5934         }
5935       if ((src - dst) < (require + 2000))
5936         {
5937           /* See the comment above the previous call of make_gap.  */
5938           int add = len_byte + inserted_byte;
5939
5940           GAP_SIZE -= add;
5941           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5942           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5943           make_gap (require + 2000);
5944           GAP_SIZE += add;
5945           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5946           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5947         }
5948     }
5949   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5950
5951   if (encodep && coding->dst_multibyte)
5952     {
5953       /* The output is unibyte.  We must convert 8-bit characters to
5954          multibyte form.  */
5955       if (inserted_byte * 2 > GAP_SIZE)
5956         {
               /* Enlarge the gap first, using the same bookkeeping as
                  for the make_gap calls in the loop above.  */
5957           GAP_SIZE -= inserted_byte;
5958           ZV += inserted_byte; Z += inserted_byte;
5959           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5960           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5961           make_gap (inserted_byte - GAP_SIZE);
5962           GAP_SIZE += inserted_byte;
5963           ZV -= inserted_byte; Z -= inserted_byte;
5964           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5965           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5966         }
5967       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5968     }
5969
5970   /* If we shrank the conversion area, adjust it now.  */
5971   if (total_skip > 0)
5972     {
           /* Bring the skipped tail next to the converted text, then
              widen every count and position by the skipped amounts.  */
5973       if (tail_skip > 0)
5974         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5975       inserted += total_skip; inserted_byte += total_skip;
5976       GAP_SIZE += total_skip;
5977       GPT -= head_skip; GPT_BYTE -= head_skip;
5978       ZV -= total_skip; ZV_BYTE -= total_skip;
5979       Z -= total_skip; Z_BYTE -= total_skip;
5980       from -= head_skip; from_byte -= head_skip;
5981       to += tail_skip; to_byte += tail_skip;
5982     }
5983
       /* Let the buffer account for the converted text, then take the
          inserted character count from the change in Z across that
          call.  */
5984   prev_Z = Z;
5985   if (! EQ (current_buffer->undo_list, Qt))
5986     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5987   else
5988     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5989                                  inserted, inserted_byte);
5990   inserted = Z - prev_Z;
5991
5992   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5993     coding_restore_composition (coding, Fcurrent_buffer ());
5994   coding_free_composition_data (coding);
5995
5996   if (! inhibit_pre_post_conversion
5997       && ! encodep && ! NILP (coding->post_read_conversion))
5998     {
5999       Lisp_Object val;
6000       Lisp_Object saved_coding_system;
6001
6002       if (from != PT)
6003         TEMP_SET_PT_BOTH (from, from_byte);
6004       prev_Z = Z;
6005       record_unwind_protect (code_convert_region_unwind,
6006                              Fcons (Vlast_coding_system_used, Qnil));
6007       saved_coding_system = Vlast_coding_system_used;
6008       Vlast_coding_system_used = coding->symbol;
6009       /* We should not call any more pre-write/post-read-conversion
6010          functions while this post-read-conversion is running.  */
6011       inhibit_pre_post_conversion = 1;
6012       val = call1 (coding->post_read_conversion, make_number (inserted));
6013       inhibit_pre_post_conversion = 0;
6014       coding->symbol = Vlast_coding_system_used;
6015       Vlast_coding_system_used = saved_coding_system;
6016       /* Discard the unwind protect.  */
6017       specpdl_ptr--;
6018       CHECK_NUMBER (val);
           /* Add whatever the function inserted or deleted, as seen in
              the change of Z.  */
6019       inserted += Z - prev_Z;
6020     }
6021
       /* Restore point.  If it was at or after the end of the original
          region, shift it by the change in length; if it was inside the
          region, leave it at FROM.  */
6022   if (orig_point >= from)
6023     {
6024       if (orig_point >= from + orig_len)
6025         orig_point += inserted - orig_len;
6026       else
6027         orig_point = from;
6028       TEMP_SET_PT (orig_point);
6029     }
6030
6031   if (replace)
6032     {
6033       signal_after_change (from, to - from, inserted);
6034       update_compositions (from, from + inserted, CHECK_BORDER);
6035     }
6036
       /* Record the totals for the whole region in CODING.  */
6037   {
6038     coding->consumed = to_byte - from_byte;
6039     coding->consumed_char = to - from;
6040     coding->produced = inserted_byte;
6041     coding->produced_char = inserted;
6042   }
6043
6044   return 0;
6045 }
6046
6047 /* Name (or base name) of work buffer for code conversion.
        set_conversion_work_buffer passes it to Fget_buffer_create as is,
        and to Fgenerate_new_buffer_name as the base when a second work
        buffer is needed.  */
6048 static Lisp_Object Vcode_conversion_workbuf_name;
6049
6050 /* Set the current buffer to the working buffer prepared for
6051    code-conversion.  MULTIBYTE specifies the multibyteness of the
6052    buffer.  Return the buffer we set if it must be killed after use.
6053    Otherwise return Qnil.

        The work buffer is left empty and widened, with its overlays
        deleted, its read-only flag and file name cleared, and its
        directory copied from the buffer that was current on entry.
        The previously current buffer is not restored here; that is up
        to the caller.  */
6054
6055 static Lisp_Object
6056 set_conversion_work_buffer (multibyte)
6057      int multibyte;
6058 {
6059   Lisp_Object buffer, buffer_to_kill;
6060   struct buffer *buf;
6061
6062   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6063   buf = XBUFFER (buffer);
6064   if (buf == current_buffer)
6065     {
6066       /* As we are already in the work buffer, we must generate a new
6067          buffer for the work.  */
6068       Lisp_Object name;
6069
6070       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6071       buffer = buffer_to_kill = Fget_buffer_create (name);
6072       buf = XBUFFER (buffer);
6073     }
6074   else
6075     buffer_to_kill = Qnil;
6076
       /* Reset per-buffer state before BUF is (re)used.  The directory
          is taken from the buffer that is still current at this point.  */
6077   delete_all_overlays (buf);
6078   buf->directory = current_buffer->directory;
6079   buf->read_only = Qnil;
6080   buf->filename = Qnil;
       /* NOTE(review): presumably Qt disables undo recording in BUF --
          confirm.  */
6081   buf->undo_list = Qt;
6082   eassert (buf->overlays_before == NULL);
6083   eassert (buf->overlays_after == NULL);
6084   set_buffer_internal (buf);
       /* Empty the whole buffer, undoing any narrowing first.  */
6085   if (BEG != BEGV || Z != ZV)
6086     Fwiden ();
6087   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
       /* The multibyteness is set only now that the buffer is empty.  */
6088   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6089   return buffer_to_kill;
6090 }
6091
     /* Run the pre-write-conversion function (if ENCODEP is nonzero) or
        the post-read-conversion function (otherwise) of CODING on the
        text of STR, and return the resulting text as a new string.

        STR is inserted as is into a work buffer obtained from
        set_conversion_work_buffer, and the function is called there:
        pre-write-conversion with the buffer bounds BEG and Z,
        post-read-conversion with point at BEG and the number of
        characters in the buffer as its argument.  For
        post-read-conversion, Vlast_coding_system_used is set to
        CODING->symbol beforehand and copied back into CODING->symbol
        afterwards.

        The buffer that was current on entry is made current again when
        unbind_to runs the Fset_buffer unwind entry recorded here.  */
6092 Lisp_Object
6093 run_pre_post_conversion_on_str (str, coding, encodep)
6094      Lisp_Object str;
6095      struct coding_system *coding;
6096      int encodep;
6097 {
6098   int count = SPECPDL_INDEX ();
6099   struct gcpro gcpro1, gcpro2;
6100   int multibyte = STRING_MULTIBYTE (str);
6101   Lisp_Object old_deactivate_mark;
6102   Lisp_Object buffer_to_kill;
6103   Lisp_Object unwind_arg;
6104
6105   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6106   /* It is not crucial to specbind this.  */
6107   old_deactivate_mark = Vdeactivate_mark;
6108   GCPRO2 (str, old_deactivate_mark);
6109
6110   /* We must insert the contents of STR as is without
6111      unibyte<->multibyte conversion.  For that, we adjust the
6112      multibyteness of the working buffer to that of STR.  */
6113   buffer_to_kill = set_conversion_work_buffer (multibyte);
       /* UNWIND_ARG is a list whose car is the value of
          Vlast_coding_system_used on entry and whose remaining elements
          are the buffers to be killed; it is handed to
          code_convert_region_unwind.  */
6114   if (NILP (buffer_to_kill))
6115     unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6116   else
6117     unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6118   record_unwind_protect (code_convert_region_unwind, unwind_arg);
6119
6120   insert_from_string (str, 0, 0,
6121                       SCHARS (str), SBYTES (str), 0);
6122   UNGCPRO;
       /* Keep the conversion function from triggering further
          pre-write/post-read conversions while it runs.  */
6123   inhibit_pre_post_conversion = 1;
6124   if (encodep)
6125     {
6126       struct buffer *prev = current_buffer;
6127
6128       call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6129       if (prev != current_buffer)
6130         /* We must kill the current buffer too.  */
6131         Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6132     }
6133   else
6134     {
6135       Vlast_coding_system_used = coding->symbol;
6136       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6137       call1 (coding->post_read_conversion, make_number (Z - BEG));
6138       coding->symbol = Vlast_coding_system_used;
6139     }
6140   inhibit_pre_post_conversion = 0;
6141   Vdeactivate_mark = old_deactivate_mark;
       /* Take the result from whatever buffer is current now, before
          unbind_to switches back to the original buffer.  */
6142   str = make_buffer_string (BEG, Z, 1);
6143   return unbind_to (count, str);
6144 }
6145
6146
6147 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6148    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6149    is intended that this function is called from encode_terminal_code,
6150    the pre-write-conversion function is run by safe_call and thus
6151    "Error during redisplay: ..." is logged when an error occurs.
6152
6153    Store the resulting text in *STR and set CODING->produced_char and
6154    CODING->produced to the number of characters and bytes
6155    respectively.  If the size of *STR is too small, enlarge it by
6156    xrealloc and update *STR and *SIZE.

        CODING->src_multibyte is also updated to the multibyteness of
        the buffer the result was taken from.  The buffer that was
        current on entry is current again on return.  */
6157
6158 void
6159 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6160      unsigned char **str;
6161      int *size, nchars, nbytes;
6162      struct coding_system *coding;
6163 {
6164   struct gcpro gcpro1, gcpro2;
6165   struct buffer *cur = current_buffer;
6166   struct buffer *prev;
6167   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6168   Lisp_Object args[3];
6169   Lisp_Object buffer_to_kill;
6170
6171   /* It is not crucial to specbind this.  */
6172   old_deactivate_mark = Vdeactivate_mark;
6173   old_last_coding_system_used = Vlast_coding_system_used;
6174   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6175
6176   /* We must insert the contents of STR as is without
6177      unibyte<->multibyte conversion.  For that, we adjust the
6178      multibyteness of the working buffer to that of STR.  */
6179   buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6180   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6181   UNGCPRO;
6182   inhibit_pre_post_conversion = 1;
       /* Remember the work buffer so that a buffer switch made by the
          conversion function can be detected below.  */
6183   prev = current_buffer;
6184   args[0] = coding->pre_write_conversion;
6185   args[1] = make_number (BEG);
6186   args[2] = make_number (Z);
6187   safe_call (3, args);
6188   inhibit_pre_post_conversion = 0;
6189   Vdeactivate_mark = old_deactivate_mark;
6190   Vlast_coding_system_used = old_last_coding_system_used;
       /* The sizes are those of whatever buffer is current now.  */
6191   coding->produced_char = Z - BEG;
6192   coding->produced = Z_BYTE - BEG_BYTE;
6193   if (coding->produced > *size)
6194     {
6195       *size = coding->produced;
6196       *str = xrealloc (*str, *size);
6197     }
       /* If the gap lies inside the text, move it to the beginning so
          the text can be copied as one run starting at BEG_ADDR.  */
6198   if (BEG < GPT && GPT < Z)
6199     move_gap (BEG);
6200   bcopy (BEG_ADDR, *str, coding->produced);
6201   coding->src_multibyte
6202     = ! NILP (current_buffer->enable_multibyte_characters);
       /* The conversion function left another buffer current; its text
          has been copied out above, so kill it.  */
6203   if (prev != current_buffer)
6204     Fkill_buffer (Fcurrent_buffer ());
6205   set_buffer_internal (cur);
6206   if (! NILP (buffer_to_kill))
6207     Fkill_buffer (buffer_to_kill);
6208 }
6209
6210
     /* Decode the text of the string STR according to CODING and return
        the decoded string.

        If CODING requires detection, the text type and/or eol format
        are detected from STR first.  Heading and tailing ASCII bytes
        are skipped (except for CCL) and copied to the result
        unchanged.  If nothing needs decoding and CODING has no usable
        post-read-conversion function, STR itself is returned when
        NOCOPY is nonzero, otherwise a copy of it.  If CODING has a
        post-read-conversion symbol that is fbound, it is run on the
        decoded string by run_pre_post_conversion_on_str.

        On return, CODING->consumed, CODING->consumed_char,
        CODING->produced and CODING->produced_char describe the whole
        string, including the skipped ASCII bytes.  */
6211 Lisp_Object
6212 decode_coding_string (str, coding, nocopy)
6213      Lisp_Object str;
6214      struct coding_system *coding;
6215      int nocopy;
6216 {
6217   int len;
6218   struct conversion_buffer buf;
6219   int from, to_byte;
6220   Lisp_Object saved_coding_symbol;
6221   int result;
6222   int require_decoding;
6223   int shrinked_bytes = 0;
6224   Lisp_Object newstr;
6225   int consumed, consumed_char, produced, produced_char;
6226
6227   from = 0;
6228   to_byte = SBYTES (str);
6229
6230   saved_coding_symbol = coding->symbol;
6231   coding->src_multibyte = STRING_MULTIBYTE (str);
6232   coding->dst_multibyte = 1;
6233   coding->heading_ascii = 0;
6234
6235   if (CODING_REQUIRE_DETECTION (coding))
6236     {
6237       /* See the comments in code_convert_region.  */
6238       if (coding->type == coding_type_undecided)
6239         {
6240           detect_coding (coding, SDATA (str), to_byte);
6241           if (coding->type == coding_type_undecided)
6242             {
6243               coding->type = coding_type_emacs_mule;
6244               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6245               /* As emacs-mule decoder will handle composition, we
6246                  need this setting to allocate coding->cmp_data
6247                  later.  */
6248               coding->composing = COMPOSITION_NO;
6249             }
6250         }
6251       if (coding->eol_type == CODING_EOL_UNDECIDED
6252           && coding->type != coding_type_ccl)
6253         {
6254           saved_coding_symbol = coding->symbol;
6255           detect_eol (coding, SDATA (str), to_byte);
6256           if (coding->eol_type == CODING_EOL_UNDECIDED)
6257             coding->eol_type = CODING_EOL_LF;
6258           /* We had better recover the original eol format if we
6259              encounter an inconsistent eol format while decoding.  */
6260           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6261         }
6262     }
6263
       /* For these two types the result is a unibyte string.  */
6264   if (coding->type == coding_type_no_conversion
6265       || coding->type == coding_type_raw_text)
6266     coding->dst_multibyte = 0;
6267
6268   require_decoding = CODING_REQUIRE_DECODING (coding);
6269
6270   if (STRING_MULTIBYTE (str))
6271     {
6272       /* Decoding routines expect the source text to be unibyte.  */
6273       str = Fstring_as_unibyte (str);
6274       to_byte = SBYTES (str);
6275       nocopy = 1;
6276       coding->src_multibyte = 0;
6277     }
6278
6279   /* Try to skip the heading and tailing ASCIIs.  */
6280   if (require_decoding && coding->type != coding_type_ccl)
6281     {
6282       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6283                                 0);
6284       if (from == to_byte)
6285         require_decoding = 0;
6286       shrinked_bytes = from + (SBYTES (str) - to_byte);
6287     }
6288
       /* Nothing to decode and no usable post-read-conversion function:
          hand back STR (or a copy), made multibyte if required.  */
6289   if (!require_decoding
6290       && !(SYMBOLP (coding->post_read_conversion)
6291            && !NILP (Ffboundp (coding->post_read_conversion))))
6292     {
6293       coding->consumed = SBYTES (str);
6294       coding->consumed_char = SCHARS (str);
6295       if (coding->dst_multibyte)
6296         {
6297           str = Fstring_as_multibyte (str);
6298           nocopy = 1;
6299         }
6300       coding->produced = SBYTES (str);
6301       coding->produced_char = SCHARS (str);
6302       return (nocopy ? str : Fcopy_sequence (str));
6303     }
6304
6305   if (coding->composing != COMPOSITION_DISABLED)
6306     coding_allocate_composition_data (coding, from);
6307   len = decoding_buffer_size (coding, to_byte - from);
6308   allocate_conversion_buffer (buf, len);
6309
       /* Decode in rounds, accumulating the counts.  After each round
          that stops short, the reported shortage is fixed up and
          decoding resumes after the text consumed so far.  */
6310   consumed = consumed_char = produced = produced_char = 0;
6311   while (1)
6312     {
6313       result = decode_coding (coding, SDATA (str) + from + consumed,
6314                               buf.data + produced, to_byte - from - consumed,
6315                               buf.size - produced);
6316       consumed += coding->consumed;
6317       consumed_char += coding->consumed_char;
6318       produced += coding->produced;
6319       produced_char += coding->produced_char;
6320       if (result == CODING_FINISH_NORMAL
6321           || result == CODING_FINISH_INTERRUPT
6322           || (result == CODING_FINISH_INSUFFICIENT_SRC
6323               && coding->consumed == 0))
6324         break;
6325       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6326         coding_allocate_composition_data (coding, from + produced_char);
6327       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6328         extend_conversion_buffer (&buf);
6329       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6330         {
6331           Lisp_Object eol_type;
6332
6333           /* Recover the original EOL format.  */
6334           if (coding->eol_type == CODING_EOL_CR)
6335             {
6336               unsigned char *p;
6337               for (p = buf.data; p < buf.data + produced; p++)
6338                 if (*p == '\n') *p = '\r';
6339             }
6340           else if (coding->eol_type == CODING_EOL_CRLF)
6341             {
6342               int num_eol = 0;
6343               unsigned char *p0, *p1;
6344               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6345                 if (*p0 == '\n') num_eol++;
6346               if (produced + num_eol >= buf.size)
6347                 extend_conversion_buffer (&buf);
                   /* Expand every LF to CR LF in place, copying backwards
                      from the end so that no unread byte is overwritten.  */
6348               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6349                 {
6350                   *--p1 = *--p0;
6351                   if (*p0 == '\n') *--p1 = '\r';
6352                 }
6353               produced += num_eol;
6354               produced_char += num_eol;
6355             }
6356           /* Suppress eol-format conversion in the further conversion.  */
6357           coding->eol_type = CODING_EOL_LF;
6358
6359           /* Set the coding system symbol to that for Unix-like EOL.  */
6360           eol_type = Fget (saved_coding_symbol, Qeol_type);
6361           if (VECTORP (eol_type)
6362               && XVECTOR (eol_type)->size == 3
6363               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6364             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6365           else
6366             coding->symbol = saved_coding_symbol;
6367
6368
6369         }
6370     }
6371
6372   coding->consumed = consumed;
6373   coding->consumed_char = consumed_char;
6374   coding->produced = produced;
6375   coding->produced_char = produced_char;
6376
       /* Assemble the result: skipped ASCII head, decoded text, skipped
          ASCII tail.  */
6377   if (coding->dst_multibyte)
6378     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6379                                            produced + shrinked_bytes);
6380   else
6381     newstr = make_uninit_string (produced + shrinked_bytes);
6382   if (from > 0)
6383     STRING_COPYIN (newstr, 0, SDATA (str), from);
6384   STRING_COPYIN (newstr, from, buf.data, produced);
6385   if (shrinked_bytes > from)
6386     STRING_COPYIN (newstr, from + produced,
6387                    SDATA (str) + to_byte,
6388                    shrinked_bytes - from);
6389   free_conversion_buffer (&buf);
6390
       /* Account for the skipped ASCII bytes, one character each.  */
6391   coding->consumed += shrinked_bytes;
6392   coding->consumed_char += shrinked_bytes;
6393   coding->produced += shrinked_bytes;
6394   coding->produced_char += shrinked_bytes;
6395
6396   if (coding->cmp_data && coding->cmp_data->used)
6397     coding_restore_composition (coding, newstr);
6398   coding_free_composition_data (coding);
6399
6400   if (SYMBOLP (coding->post_read_conversion)
6401       && !NILP (Ffboundp (coding->post_read_conversion)))
6402     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6403
6404   return newstr;
6405 }
6406
     /* Encode the text of the string STR according to CODING and return
        the result as a unibyte string.

        If CODING has a pre-write-conversion symbol that is fbound, it
        is run on STR first by run_pre_post_conversion_on_str.  Heading
        and tailing ASCII bytes are skipped and copied unchanged unless
        a CCL program must run or there are compositions to encode.
        When no encoding is needed, STR itself is returned if NOCOPY is
        nonzero (its multibyte flag being cleared in place if it was
        set), otherwise a unibyte copy or equivalent of it.

        CODING->consumed, CODING->consumed_char, CODING->produced and
        CODING->produced_char are set on return.  */
6407 Lisp_Object
6408 encode_coding_string (str, coding, nocopy)
6409      Lisp_Object str;
6410      struct coding_system *coding;
6411      int nocopy;
6412 {
6413   int len;
6414   struct conversion_buffer buf;
6415   int from, to, to_byte;
6416   int result;
6417   int shrinked_bytes = 0;
6418   Lisp_Object newstr;
6419   int consumed, consumed_char, produced, produced_char;
6420
6421   if (SYMBOLP (coding->pre_write_conversion)
6422       && !NILP (Ffboundp (coding->pre_write_conversion)))
6423     {
6424       str = run_pre_post_conversion_on_str (str, coding, 1);
6425       /* As STR is just newly generated, we don't have to copy it
6426          anymore.  */
6427       nocopy = 1;
6428     }
6429
6430   from = 0;
6431   to = SCHARS (str);
6432   to_byte = SBYTES (str);
6433
6434   /* Encoding routines determine the multibyteness of the source text
6435      by coding->src_multibyte.  */
6436   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6437   coding->dst_multibyte = 0;
6438   if (! CODING_REQUIRE_ENCODING (coding))
6439     goto no_need_of_encoding;
6440
6441   if (coding->composing != COMPOSITION_DISABLED)
6442     coding_save_composition (coding, from, to, str);
6443
6444   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6445      if we must run CCL program or there are compositions to
6446      encode.  */
6447   coding->heading_ascii = 0;
6448   if (coding->type != coding_type_ccl
6449       && (! coding->cmp_data || coding->cmp_data->used == 0))
6450     {
6451       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6452                                 1);
6453       if (from == to_byte)
6454         {
6455           coding_free_composition_data (coding);
6456           goto no_need_of_encoding;
6457         }
6458       shrinked_bytes = from + (SBYTES (str) - to_byte);
6459     }
6460
6461   len = encoding_buffer_size (coding, to_byte - from);
6462   allocate_conversion_buffer (buf, len);
6463
       /* Encode in rounds, accumulating the counts and enlarging the
          conversion buffer whenever a round stops short.  */
6464   consumed = consumed_char = produced = produced_char = 0;
6465   while (1)
6466     {
6467       result = encode_coding (coding, SDATA (str) + from + consumed,
6468                               buf.data + produced, to_byte - from - consumed,
6469                               buf.size - produced);
6470       consumed += coding->consumed;
6471       consumed_char += coding->consumed_char;
6472       produced += coding->produced;
6473       produced_char += coding->produced_char;
6474       if (result == CODING_FINISH_NORMAL
6475           || result == CODING_FINISH_INTERRUPT
6476           || (result == CODING_FINISH_INSUFFICIENT_SRC
6477               && coding->consumed == 0))
6478         break;
6479       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6480       extend_conversion_buffer (&buf);
6481     }
6482
       /* NOTE(review): unlike decode_coding_string, the counts stored
          here do not include SHRINKED_BYTES -- confirm that callers
          expect this.  */
6483   coding->consumed = consumed;
6484   coding->consumed_char = consumed_char;
6485   coding->produced = produced;
6486   coding->produced_char = produced_char;
6487
       /* Assemble the result: skipped ASCII head, encoded text, skipped
          ASCII tail.  */
6488   newstr = make_uninit_string (produced + shrinked_bytes);
6489   if (from > 0)
6490     STRING_COPYIN (newstr, 0, SDATA (str), from);
6491   STRING_COPYIN (newstr, from, buf.data, produced);
6492   if (shrinked_bytes > from)
6493     STRING_COPYIN (newstr, from + produced,
6494                    SDATA (str) + to_byte,
6495                    shrinked_bytes - from);
6496
6497   free_conversion_buffer (&buf);
6498   coding_free_composition_data (coding);
6499
6500   return newstr;
6501
       /* No conversion is needed: return STR itself or a copy of it, as
          a unibyte string.  */
6502  no_need_of_encoding:
6503   coding->consumed = SBYTES (str);
6504   coding->consumed_char = SCHARS (str);
6505   if (STRING_MULTIBYTE (str))
6506     {
6507       if (nocopy)
6508         /* We are sure that STR doesn't contain a multibyte
6509            character.  */
6510         STRING_SET_UNIBYTE (str);
6511       else
6512         {
6513           str = Fstring_as_unibyte (str);
6514           nocopy = 1;
6515         }
6516     }
6517   coding->produced = SBYTES (str);
6518   coding->produced_char = SCHARS (str);
6519   return (nocopy ? str : Fcopy_sequence (str));
6520 }
6521
6522 \f
6523 #ifdef emacs
6524 /*** 8. Emacs Lisp library functions ***/
6525
6526 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6527        doc: /* Return t if OBJECT is nil or a coding-system.
6528 See the documentation of `make-coding-system' for information
6529 about coding-system objects.  */)
6530      (obj)
6531      Lisp_Object obj;
6532 {
6533   Lisp_Object spec;
6534
6535   if (! SYMBOLP (obj))
6536     return Qnil;
6537   /* nil, and any symbol still carrying a define form, is accepted
6538      without looking at a coding-spec.  */
6539   if (NILP (obj) || ! NILP (Fget (obj, Qcoding_system_define_form)))
6540     return Qt;
6541   spec = Fget (obj, Qcoding_system);
6542   return (VECTORP (spec) && XVECTOR (spec)->size == 5) ? Qt : Qnil;
6543 }
6544
6545 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6546        Sread_non_nil_coding_system, 1, 1, 0,
6547        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6548      (prompt)
6549      Lisp_Object prompt;
6550 {
6551   Lisp_Object name;
6552   /* Keep prompting until the user enters a non-empty name.  */
6553   for (;;)
6554     {
6555       name = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt,
6556                                Qnil, Qcoding_system_history, Qnil, Qnil);
6557       if (SCHARS (name) != 0) return Fintern (name, Qnil);
6558     }
6559 }
6560
6561 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6562        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6563 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
6564 Ignores case when completing coding systems (all Emacs coding systems
6565 are lower-case).  */)
6566      (prompt, default_coding_system)
6567      Lisp_Object prompt, default_coding_system;
6568 {
6569   int mark = SPECPDL_INDEX ();
6570   Lisp_Object name;
6571
6572   /* Hand a symbol default to the reader as its name string.  */
6573   if (SYMBOLP (default_coding_system))
6574     default_coding_system = SYMBOL_NAME (default_coding_system);
6575   specbind (Qcompletion_ignore_case, Qt);
6576   name = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
6577                            Qcoding_system_history, default_coding_system, Qnil);
6578   unbind_to (mark, Qnil);
6579   return SCHARS (name) != 0 ? Fintern (name, Qnil) : Qnil;
6580 }
6581
6582 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6583        1, 1, 0,
6584        doc: /* Check validity of CODING-SYSTEM.
6585 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6586 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6587 The value of this property should be a vector of length 5.  */)
6588      (coding_system)
6589      Lisp_Object coding_system;
6590 {
6591   Lisp_Object pending = Fget (coding_system, Qcoding_system_define_form);
6592
6593   /* Evaluate a deferred definition, clearing the property beforehand.  */
6594   if (! NILP (pending))
6595     {
6596       Fput (coding_system, Qcoding_system_define_form, Qnil);
6597       safe_eval (pending);
6598     }
6599   if (NILP (Fcoding_system_p (coding_system)))
6600     xsignal1 (Qcoding_system_error, coding_system);
6601   return coding_system;
6602 }
6603 \f
6604 Lisp_Object
6605 detect_coding_system (src, src_bytes, highest, multibytep)
6606      const unsigned char *src;
6607      int src_bytes, highest;
6608      int multibytep;
6609 {
       /* Return the coding systems with which the SRC_BYTES bytes at SRC
          may be decoded.  MULTIBYTEP is passed on to detect_coding_mask.

          If HIGHEST is nonzero, return only the first candidate found in
          Vcoding_category_list order; otherwise return a list of all
          candidates in that order.  When detect_coding_mask reports no
          category at all, the result is `undecided' (as a symbol or a
          one-element list).  When a consistent eol format is detected,
          each candidate is replaced by the corresponding element of its
          `eol-type' vector.

          If HIGHEST is nonzero but no category selected by the mask has
          a non-nil value, return Qnil.  */
6610   int coding_mask, eol_type;
6611   Lisp_Object val, tmp;
6612   int dummy;
6613
6614   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6615   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6616   if (eol_type == CODING_EOL_INCONSISTENT)
6617     eol_type = CODING_EOL_UNDECIDED;
6618
6619   if (!coding_mask)
6620     {
6621       val = Qundecided;
6622       if (eol_type != CODING_EOL_UNDECIDED)
6623         {
6624           Lisp_Object val2;
6625           val2 = Fget (Qundecided, Qeol_type);
6626           if (VECTORP (val2))
6627             val = XVECTOR (val2)->contents[eol_type];
6628         }
6629       return (highest ? val : Fcons (val, Qnil));
6630     }
6631
6632   /* At first, gather possible coding systems in VAL.  */
6633   val = Qnil;
6634   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6635     {
6636       Lisp_Object category_val, category_index;
6637
6638       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6639       category_val = Fsymbol_value (XCAR (tmp));
6640       if (!NILP (category_val)
6641           && NATNUMP (category_index)
6642           && (coding_mask & (1 << XFASTINT (category_index))))
6643         {
6644           val = Fcons (category_val, val);
6645           if (highest)
6646             break;
6647         }
6648     }
       /* Candidates were pushed on the front, so restore list order.  */
6649   if (!highest)
6650     val = Fnreverse (val);
6651
6652   /* Then, replace the elements with subsidiary coding systems.  */
6653   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6654     {
6655       if (eol_type != CODING_EOL_UNDECIDED
6656           && eol_type != CODING_EOL_INCONSISTENT)
6657         {
6658           Lisp_Object eol;
6659           eol = Fget (XCAR (tmp), Qeol_type);
6660           if (VECTORP (eol))
6661             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6662         }
6663     }
       /* VAL is still nil when no category selected by CODING_MASK has a
          non-nil value; XCAR must not be applied to nil in that case.  */
6664   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
6665 }
6666
6667 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6668        2, 3, 0,
6669        doc: /* Detect how the byte sequence in the region is encoded.
6670 Return a list of possible coding systems used on decoding a byte
6671 sequence containing the bytes in the region between START and END when
6672 the coding system `undecided' is specified.  The list is ordered by
6673 priority decided in the current language environment.
6674
6675 If only ASCII characters are found (except for such ISO-2022 control
6676 characters ISO-2022 as ESC), it returns a list of single element
6677 `undecided' or its subsidiary coding system according to a detected
6678 end-of-line format.
6679
6680 If optional argument HIGHEST is non-nil, return the coding system of
6681 highest priority.  */)
6682      (start, end, highest)
6683      Lisp_Object start, end, highest;
6684 {
6685   int from, to;
6686   int from_byte, to_byte;
6687   int include_anchor_byte = 0;
6688
6689   CHECK_NUMBER_COERCE_MARKER (start);
6690   CHECK_NUMBER_COERCE_MARKER (end);
6691
6692   validate_region (&start, &end);
6693   from = XINT (start), to = XINT (end);
6694   from_byte = CHAR_TO_BYTE (from);
6695   to_byte = CHAR_TO_BYTE (to);
6696
       /* If the gap lies within the region, move it to the end of the
          region so that the region no longer straddles it.  */
6697   if (from < GPT && to >= GPT)
6698     move_gap_both (to, to_byte);
6699   /* If an anchor byte `\0' follows the region, we include it in
6700      the detecting source.  Then code detectors can handle the tailing
6701      byte sequence more accurately.
6702
6703      Fix me: This is not a perfect solution.  It is better that we
6704      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6705   */
6706   if (to == Z || (to == GPT && GAP_SIZE > 0))
6707     include_anchor_byte = 1;
6708   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6709                                to_byte - from_byte + include_anchor_byte,
6710                                !NILP (highest),
6711                                !NILP (current_buffer
6712                                       ->enable_multibyte_characters));
6713 }
6714
6715 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6716        1, 2, 0,
6717        doc: /* Detect how the byte sequence in STRING is encoded.
6718 Return a list of possible coding systems used on decoding a byte
6719 sequence containing the bytes in STRING when the coding system
6720 `undecided' is specified.  The list is ordered by priority decided in
6721 the current language environment.
6722
6723 If only ASCII characters are found (except for such ISO-2022 control
6724 characters ISO-2022 as ESC), it returns a list of single element
6725 `undecided' or its subsidiary coding system according to a detected
6726 end-of-line format.
6727
6728 If optional argument HIGHEST is non-nil, return the coding system of
6729 highest priority.  */)
6730      (string, highest)
6731      Lisp_Object string, highest;
6732 {
6733   int nbytes;
6734
6735   CHECK_STRING (string);
6736
6737   /* Count the anchor byte `\0' as part of the source as well; with
6738      it, the code detectors can handle the trailing bytes more
6739      accurately.  */
6740   nbytes = SBYTES (string) + 1;
6741   return detect_coding_system (SDATA (string), nbytes, ! NILP (highest),
6742                                STRING_MULTIBYTE (string));
6743 }
6744
6745 /*  Subroutine for Ffind_coding_systems_region_internal.
6746
6747     Return a list of coding systems that safely encode the multibyte
6748     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6749     possible coding systems.  If it is nil, it means that we have not
6750     yet found any coding systems.
6751
6752     WORK_TABLE a char-table of which element is set to t once the
6753     element is looked up.
6754
6755     If a non-ASCII single byte char is found, set
6756     *single_byte_char_found to 1.  */
6757
6758 static Lisp_Object
6759 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6760      unsigned char *p, *pend;
6761      Lisp_Object safe_codings, work_table;
6762      int *single_byte_char_found;
6763 {
6764   int c, len;
6765   Lisp_Object val, ch;
6766   Lisp_Object prev, tail;
6767
6768   if (NILP (safe_codings))
6769     goto done_safe_codings;
6770   while (p < pend)
6771     {
6772       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6773       p += len;
6774       if (ASCII_BYTE_P (c))
6775         /* We can ignore ASCII characters here.  */
6776         continue;
6777       if (SINGLE_BYTE_CHAR_P (c))
6778         *single_byte_char_found = 1;
6779       /* Check the safe coding systems for C.  */
6780       ch = make_number (c);
6781       val = Faref (work_table, ch);
6782       if (EQ (val, Qt))
6783         /* This element was already checked.  Ignore it.  */
6784         continue;
6785       /* Remember that we checked this element.  */
6786       Faset (work_table, ch, Qt);
6787
6788       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6789         {
6790           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6791           int encodable;
6792
6793           elt = XCAR (tail);
6794           if (CONSP (XCDR (elt)))
6795             {
6796               /* This entry has this format now:
6797                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6798                           ACCEPT-LATIN-EXTRA ) */
6799               val = XCDR (elt);
6800               encodable = ! NILP (Faref (XCAR (val), ch));
6801               if (! encodable)
6802                 {
6803                   val = XCDR (val);
6804                   translation_table = XCAR (val);
6805                   hash_table = XCAR (XCDR (val));
6806                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6807                 }
6808             }
6809           else
6810             {
6811               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6812               encodable = ! NILP (Faref (XCDR (elt), ch));
6813               if (! encodable)
6814                 {
6815                   /* Transform the format to:
6816                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6817                        ACCEPT-LATIN-EXTRA )  */
6818                   val = Fget (XCAR (elt), Qcoding_system);
6819                   translation_table
6820                     = Fplist_get (AREF (val, 3),
6821                                   Qtranslation_table_for_encode);
6822                   if (SYMBOLP (translation_table))
6823                     translation_table = Fget (translation_table,
6824                                               Qtranslation_table);
6825                   hash_table
6826                     = (CHAR_TABLE_P (translation_table)
6827                        ? XCHAR_TABLE (translation_table)->extras[1]
6828                        : Qnil);
6829                   accept_latin_extra
6830                     = ((EQ (AREF (val, 0), make_number (2))
6831                         && VECTORP (AREF (val, 4)))
6832                        ? AREF (AREF (val, 4), 16)
6833                        : Qnil);
6834                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6835                                         translation_table, hash_table,
6836                                         accept_latin_extra));
6837                 }
6838             }
6839
6840           if (! encodable
6841               && ((CHAR_TABLE_P (translation_table)
6842                    && ! NILP (Faref (translation_table, ch)))
6843                   || (HASH_TABLE_P (hash_table)
6844                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6845                   || (SINGLE_BYTE_CHAR_P (c)
6846                       && ! NILP (accept_latin_extra)
6847                       && VECTORP (Vlatin_extra_code_table)
6848                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6849             encodable = 1;
6850           if (encodable)
6851             prev = tail;
6852           else
6853             {
6854               /* Exclude this coding system from SAFE_CODINGS.  */
6855               if (EQ (tail, safe_codings))
6856                 {
6857                   safe_codings = XCDR (safe_codings);
6858                   if (NILP (safe_codings))
6859                     goto done_safe_codings;
6860                 }
6861               else
6862                 XSETCDR (prev, XCDR (tail));
6863             }
6864         }
6865     }
6866
6867  done_safe_codings:
6868   /* If the above loop was terminated before P reaches PEND, it means
6869      SAFE_CODINGS was set to nil.  If we have not yet found an
6870      non-ASCII single-byte char, check it now.  */
6871   if (! *single_byte_char_found)
6872     while (p < pend)
6873       {
6874         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6875         p += len;
6876         if (! ASCII_BYTE_P (c)
6877             && SINGLE_BYTE_CHAR_P (c))
6878           {
6879             *single_byte_char_found = 1;
6880             break;
6881           }
6882       }
6883   return safe_codings;
6884 }
6885
6886 DEFUN ("find-coding-systems-region-internal",
6887        Ffind_coding_systems_region_internal,
6888        Sfind_coding_systems_region_internal, 2, 2, 0,
6889        doc: /* Internal use only.  */)
6890      (start, end)
6891      Lisp_Object start, end;
6892 {
6893   Lisp_Object work_table, safe_codings;
6894   int non_ascii_p = 0;
6895   int single_byte_char_found = 0;
6896   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6897
6898   if (STRINGP (start))
6899     {
6900       if (!STRING_MULTIBYTE (start))
6901         return Qt;
6902       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6903       p2 = p2end = p1end;
6904       if (SCHARS (start) != SBYTES (start))
6905         non_ascii_p = 1;
6906     }
6907   else
6908     {
6909       int from, to, stop;
6910
6911       CHECK_NUMBER_COERCE_MARKER (start);
6912       CHECK_NUMBER_COERCE_MARKER (end);
6913       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6914         args_out_of_range (start, end);
6915       if (NILP (current_buffer->enable_multibyte_characters))
6916         return Qt;
6917       from = CHAR_TO_BYTE (XINT (start));
6918       to = CHAR_TO_BYTE (XINT (end));
6919       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6920       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6921       if (stop == to)
6922         p2 = p2end = p1end;
6923       else
6924         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6925       if (XINT (end) - XINT (start) != to - from)
6926         non_ascii_p = 1;
6927     }
6928
6929   if (!non_ascii_p)
6930     {
6931       /* We are sure that the text contains no multibyte character.
6932          Check if it contains eight-bit-graphic.  */
6933       p = p1;
6934       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6935       if (p == p1end)
6936         {
6937           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6938           if (p == p2end)
6939             return Qt;
6940         }
6941     }
6942
6943   /* The text contains non-ASCII characters.  */
6944
6945   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6946   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6947
6948   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6949                                     &single_byte_char_found);
6950   if (p2 < p2end)
6951     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6952                                       &single_byte_char_found);
6953   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6954     safe_codings = Qt;
6955   else
6956     {
6957       /* Turn safe_codings to a list of coding systems... */
6958       Lisp_Object val;
6959
6960       if (single_byte_char_found)
6961         /* ... and append these for eight-bit chars.  */
6962         val = Fcons (Qraw_text,
6963                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6964       else
6965         /* ... and append generic coding systems.  */
6966         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6967
6968       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6969         val = Fcons (XCAR (XCAR (safe_codings)), val);
6970       safe_codings = val;
6971     }
6972
6973   return safe_codings;
6974 }
6975
6976
6977 /* Search from position POS for such characters that are unencodable
6978    accoding to SAFE_CHARS, and return a list of their positions.  P
6979    points where in the memory the character at POS exists.  Limit the
6980    search at PEND or when Nth unencodable characters are found.
6981
6982    If SAFE_CHARS is a char table, an element for an unencodable
6983    character is nil.
6984
6985    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6986
6987    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6988    eight-bit-graphic characters are unencodable.  */
6989
6990 static Lisp_Object
6991 unencodable_char_position (safe_chars, pos, p, pend, n)
6992      Lisp_Object safe_chars;
6993      int pos;
6994      unsigned char *p, *pend;
6995      int n;
6996 {
6997   Lisp_Object pos_list;
6998
6999   pos_list = Qnil;
7000   while (p < pend)
7001     {
7002       int len;
7003       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7004
7005       if (c >= 128
7006           && (CHAR_TABLE_P (safe_chars)
7007               ? NILP (CHAR_TABLE_REF (safe_chars, c))
7008               : (NILP (safe_chars) || c < 256)))
7009         {
7010           pos_list = Fcons (make_number (pos), pos_list);
7011           if (--n <= 0)
7012             break;
7013         }
7014       pos++;
7015       p += len;
7016     }
7017   return Fnreverse (pos_list);
7018 }
7019
7020
7021 DEFUN ("unencodable-char-position", Funencodable_char_position,
7022        Sunencodable_char_position, 3, 5, 0,
7023        doc: /*
7024 Return position of first un-encodable character in a region.
7025 START and END specfiy the region and CODING-SYSTEM specifies the
7026 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7027
7028 If optional 4th argument COUNT is non-nil, it specifies at most how
7029 many un-encodable characters to search.  In this case, the value is a
7030 list of positions.
7031
7032 If optional 5th argument STRING is non-nil, it is a string to search
7033 for un-encodable characters.  In that case, START and END are indexes
7034 to the string.  */)
7035      (start, end, coding_system, count, string)
7036      Lisp_Object start, end, coding_system, count, string;
7037 {
7038   int n;
7039   Lisp_Object safe_chars;
7040   struct coding_system coding;
7041   Lisp_Object positions;
7042   int from, to;
7043   unsigned char *p, *pend;
7044
7045   if (NILP (string))
7046     {
7047       validate_region (&start, &end);
7048       from = XINT (start);
7049       to = XINT (end);
7050       if (NILP (current_buffer->enable_multibyte_characters))
7051         return Qnil;
7052       p = CHAR_POS_ADDR (from);
7053       if (to == GPT)
7054         pend = GPT_ADDR;
7055       else
7056         pend = CHAR_POS_ADDR (to);
7057     }
7058   else
7059     {
7060       CHECK_STRING (string);
7061       CHECK_NATNUM (start);
7062       CHECK_NATNUM (end);
7063       from = XINT (start);
7064       to = XINT (end);
7065       if (from > to
7066           || to > SCHARS (string))
7067         args_out_of_range_3 (string, start, end);
7068       if (! STRING_MULTIBYTE (string))
7069         return Qnil;
7070       p = SDATA (string) + string_char_to_byte (string, from);
7071       pend = SDATA (string) + string_char_to_byte (string, to);
7072     }
7073
7074   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7075
7076   if (NILP (count))
7077     n = 1;
7078   else
7079     {
7080       CHECK_NATNUM (count);
7081       n = XINT (count);
7082     }
7083
7084   if (coding.type == coding_type_no_conversion
7085       || coding.type == coding_type_raw_text)
7086     return Qnil;
7087
7088   if (coding.type == coding_type_undecided)
7089     safe_chars = Qnil;
7090   else
7091     safe_chars = coding_safe_chars (coding_system);
7092
7093   if (STRINGP (string)
7094       || from >= GPT || to <= GPT)
7095     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7096   else
7097     {
7098       Lisp_Object args[2];
7099
7100       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7101       n -= XINT (Flength (args[0]));
7102       if (n <= 0)
7103         positions = args[0];
7104       else
7105         {
7106           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7107                                                pend, n);
7108           positions = Fappend (2, args);
7109         }
7110     }
7111
7112   return  (NILP (count) ? Fcar (positions) : positions);
7113 }
7114
7115
7116 Lisp_Object
7117 code_convert_region1 (start, end, coding_system, encodep)
7118      Lisp_Object start, end, coding_system;
7119      int encodep;
7120 {
7121   struct coding_system coding;
7122   int from, to;
7123
7124   CHECK_NUMBER_COERCE_MARKER (start);
7125   CHECK_NUMBER_COERCE_MARKER (end);
7126   CHECK_SYMBOL (coding_system);
7127
7128   validate_region (&start, &end);
7129   from = XFASTINT (start);
7130   to = XFASTINT (end);
7131
7132   if (NILP (coding_system))
7133     return make_number (to - from);
7134
7135   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7136     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7137
7138   coding.mode |= CODING_MODE_LAST_BLOCK;
7139   coding.src_multibyte = coding.dst_multibyte
7140     = !NILP (current_buffer->enable_multibyte_characters);
7141   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7142                        &coding, encodep, 1);
7143   Vlast_coding_system_used = coding.symbol;
7144   return make_number (coding.produced_char);
7145 }
7146
7147 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7148        3, 3, "r\nzCoding system: ",
7149        doc: /* Decode the current region from the specified coding system.
7150 When called from a program, takes three arguments:
7151 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7152 This function sets `last-coding-system-used' to the precise coding system
7153 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7154 not fully specified.)
7155 It returns the length of the decoded text.  */)
7156      (start, end, coding_system)
7157      Lisp_Object start, end, coding_system;
7158 {
7159   return code_convert_region1 (start, end, coding_system, 0);
7160 }
7161
7162 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7163        3, 3, "r\nzCoding system: ",
7164        doc: /* Encode the current region into the specified coding system.
7165 When called from a program, takes three arguments:
7166 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7167 This function sets `last-coding-system-used' to the precise coding system
7168 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7169 not fully specified.)
7170 It returns the length of the encoded text.  */)
7171      (start, end, coding_system)
7172      Lisp_Object start, end, coding_system;
7173 {
7174   return code_convert_region1 (start, end, coding_system, 1);
7175 }
7176
7177 Lisp_Object
7178 code_convert_string1 (string, coding_system, nocopy, encodep)
7179      Lisp_Object string, coding_system, nocopy;
7180      int encodep;
7181 {
7182   struct coding_system coding;
7183
7184   CHECK_STRING (string);
7185   CHECK_SYMBOL (coding_system);
7186
7187   if (NILP (coding_system))
7188     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7189
7190   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7191     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7192
7193   coding.mode |= CODING_MODE_LAST_BLOCK;
7194   string = (encodep
7195             ? encode_coding_string (string, &coding, !NILP (nocopy))
7196             : decode_coding_string (string, &coding, !NILP (nocopy)));
7197   Vlast_coding_system_used = coding.symbol;
7198
7199   return string;
7200 }
7201
7202 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7203        2, 3, 0,
7204        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7205 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7206 if the decoding operation is trivial.
7207 This function sets `last-coding-system-used' to the precise coding system
7208 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7209 not fully specified.)  */)
7210      (string, coding_system, nocopy)
7211      Lisp_Object string, coding_system, nocopy;
7212 {
7213   return code_convert_string1 (string, coding_system, nocopy, 0);
7214 }
7215
7216 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7217        2, 3, 0,
7218        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7219 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7220 if the encoding operation is trivial.
7221 This function sets `last-coding-system-used' to the precise coding system
7222 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7223 not fully specified.)  */)
7224      (string, coding_system, nocopy)
7225      Lisp_Object string, coding_system, nocopy;
7226 {
7227   return code_convert_string1 (string, coding_system, nocopy, 1);
7228 }
7229
7230 /* Encode or decode STRING according to CODING_SYSTEM.
7231    Do not set Vlast_coding_system_used.
7232
7233    This function is called only from macros DECODE_FILE and
7234    ENCODE_FILE, thus we ignore character composition.  */
7235
7236 Lisp_Object
7237 code_convert_string_norecord (string, coding_system, encodep)
7238      Lisp_Object string, coding_system;
7239      int encodep;
7240 {
7241   struct coding_system coding;
7242
7243   CHECK_STRING (string);
7244   CHECK_SYMBOL (coding_system);
7245
7246   if (NILP (coding_system))
7247     return string;
7248
7249   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7250     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7251
7252   coding.composing = COMPOSITION_DISABLED;
7253   coding.mode |= CODING_MODE_LAST_BLOCK;
7254   return (encodep
7255           ? encode_coding_string (string, &coding, 1)
7256           : decode_coding_string (string, &coding, 1));
7257 }
7258 \f
7259 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7260        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7261 Return the corresponding character.  */)
7262      (code)
7263      Lisp_Object code;
7264 {
7265   unsigned char c1, c2, s1, s2;
7266   Lisp_Object val;
7267
7268   CHECK_NUMBER (code);
7269   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7270   if (s1 == 0)
7271     {
7272       if (s2 < 0x80)
7273         XSETFASTINT (val, s2);
7274       else if (s2 >= 0xA0 || s2 <= 0xDF)
7275         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7276       else
7277         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7278     }
7279   else
7280     {
7281       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7282           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7283         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7284       DECODE_SJIS (s1, s2, c1, c2);
7285       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7286     }
7287   return val;
7288 }
7289
7290 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7291        doc: /* Encode a Japanese character CH to shift_jis encoding.
7292 Return the corresponding code in SJIS.  */)
7293      (ch)
7294      Lisp_Object ch;
7295 {
7296   int charset, c1, c2, s1, s2;
7297   Lisp_Object val;
7298
7299   CHECK_NUMBER (ch);
7300   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7301   if (charset == CHARSET_ASCII)
7302     {
7303       val = ch;
7304     }
7305   else if (charset == charset_jisx0208
7306            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7307     {
7308       ENCODE_SJIS (c1, c2, s1, s2);
7309       XSETFASTINT (val, (s1 << 8) | s2);
7310     }
7311   else if (charset == charset_katakana_jisx0201
7312            && c1 > 0x20 && c2 < 0xE0)
7313     {
7314       XSETFASTINT (val, c1 | 0x80);
7315     }
7316   else
7317     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7318   return val;
7319 }
7320
7321 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7322        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7323 Return the corresponding character.  */)
7324      (code)
7325      Lisp_Object code;
7326 {
7327   int charset;
7328   unsigned char b1, b2, c1, c2;
7329   Lisp_Object val;
7330
7331   CHECK_NUMBER (code);
7332   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7333   if (b1 == 0)
7334     {
7335       if (b2 >= 0x80)
7336         error ("Invalid BIG5 code: %x", XFASTINT (code));
7337       val = code;
7338     }
7339   else
7340     {
7341       if ((b1 < 0xA1 || b1 > 0xFE)
7342           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7343         error ("Invalid BIG5 code: %x", XFASTINT (code));
7344       DECODE_BIG5 (b1, b2, charset, c1, c2);
7345       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7346     }
7347   return val;
7348 }
7349
7350 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7351        doc: /* Encode the Big5 character CH to BIG5 coding system.
7352 Return the corresponding character code in Big5.  */)
7353      (ch)
7354      Lisp_Object ch;
7355 {
7356   int charset, c1, c2, b1, b2;
7357   Lisp_Object val;
7358
7359   CHECK_NUMBER (ch);
7360   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7361   if (charset == CHARSET_ASCII)
7362     {
7363       val = ch;
7364     }
7365   else if ((charset == charset_big5_1
7366             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7367            || (charset == charset_big5_2
7368                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7369     {
7370       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7371       XSETFASTINT (val, (b1 << 8) | b2);
7372     }
7373   else
7374     error ("Can't encode to Big5: %d", XFASTINT (ch));
7375   return val;
7376 }
7377 \f
7378 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7379        Sset_terminal_coding_system_internal, 1, 1, 0,
7380        doc: /* Internal use only.  */)
7381      (coding_system)
7382      Lisp_Object coding_system;
7383 {
7384   CHECK_SYMBOL (coding_system);
7385   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7386   /* We had better not send unsafe characters to terminal.  */
7387   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7388   /* Character composition should be disabled.  */
7389   terminal_coding.composing = COMPOSITION_DISABLED;
7390   /* Error notification should be suppressed.  */
7391   terminal_coding.suppress_error = 1;
7392   terminal_coding.src_multibyte = 1;
7393   terminal_coding.dst_multibyte = 0;
7394   return Qnil;
7395 }
7396
7397 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7398        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7399        doc: /* Internal use only.  */)
7400      (coding_system)
7401      Lisp_Object coding_system;
7402 {
7403   CHECK_SYMBOL (coding_system);
7404   setup_coding_system (Fcheck_coding_system (coding_system),
7405                        &safe_terminal_coding);
7406   /* Character composition should be disabled.  */
7407   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7408   /* Error notification should be suppressed.  */
7409   safe_terminal_coding.suppress_error = 1;
7410   safe_terminal_coding.src_multibyte = 1;
7411   safe_terminal_coding.dst_multibyte = 0;
7412   return Qnil;
7413 }
7414
7415 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7416        Sterminal_coding_system, 0, 0, 0,
7417        doc: /* Return coding system specified for terminal output.  */)
7418      ()
7419 {
7420   return terminal_coding.symbol;
7421 }
7422
7423 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7424        Sset_keyboard_coding_system_internal, 1, 1, 0,
7425        doc: /* Internal use only.  */)
7426      (coding_system)
7427      Lisp_Object coding_system;
7428 {
7429   CHECK_SYMBOL (coding_system);
7430   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7431   /* Character composition should be disabled.  */
7432   keyboard_coding.composing = COMPOSITION_DISABLED;
7433   return Qnil;
7434 }
7435
7436 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7437        Skeyboard_coding_system, 0, 0, 0,
7438        doc: /* Return coding system specified for decoding keyboard input.  */)
7439      ()
7440 {
7441   return keyboard_coding.symbol;
7442 }
7443
7444 \f
7445 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7446        Sfind_operation_coding_system,  1, MANY, 0,
7447        doc: /* Choose a coding system for an operation based on the target name.
7448 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7449 DECODING-SYSTEM is the coding system to use for decoding
7450 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7451 for encoding (in case OPERATION does encoding).
7452
7453 The first argument OPERATION specifies an I/O primitive:
7454   For file I/O, `insert-file-contents' or `write-region'.
7455   For process I/O, `call-process', `call-process-region', or `start-process'.
7456   For network I/O, `open-network-stream'.
7457
7458 The remaining arguments should be the same arguments that were passed
7459 to the primitive.  Depending on which primitive, one of those arguments
7460 is selected as the TARGET.  For example, if OPERATION does file I/O,
7461 whichever argument specifies the file name is TARGET.
7462
7463 TARGET has a meaning which depends on OPERATION:
7464   For file I/O, TARGET is a file name (except for the special case below).
7465   For process I/O, TARGET is a process name.
7466   For network I/O, TARGET is a service name or a port number
7467
7468 This function looks up what specified for TARGET in,
7469 `file-coding-system-alist', `process-coding-system-alist',
7470 or `network-coding-system-alist' depending on OPERATION.
7471 They may specify a coding system, a cons of coding systems,
7472 or a function symbol to call.
7473 In the last case, we call the function with one argument,
7474 which is a list of all the arguments given to this function.
7475
7476 If OPERATION is `insert-file-contents', the argument corresponding to
7477 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
7478 file name to look up, and BUFFER is a buffer that contains the file's
7479 contents (not yet decoded).  If `file-coding-system-alist' specifies a
7480 function to call for FILENAME, that function should examine the
7481 contents of BUFFER instead of reading the file.
7482
7483 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
7484      (nargs, args)
7485      int nargs;
7486      Lisp_Object *args;
7487 {
7488   Lisp_Object operation, target_idx, target, val;
7489   register Lisp_Object chain;
7490
7491   if (nargs < 2)
7492     error ("Too few arguments");
7493   operation = args[0];
7494   if (!SYMBOLP (operation)
7495       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7496     error ("Invalid first argument");
7497   if (nargs < 1 + XINT (target_idx))
7498     error ("Too few arguments for operation: %s",
7499            SDATA (SYMBOL_NAME (operation)));
7500   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7501      argument to write-region) is string, it must be treated as a
7502      target file name.  */
7503   if (EQ (operation, Qwrite_region)
7504       && nargs > 5
7505       && STRINGP (args[5]))
7506     target_idx = make_number (4);
7507   target = args[XINT (target_idx) + 1];
7508   if (!(STRINGP (target)
7509         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7510             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7511         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7512     error ("Invalid argument %d", XINT (target_idx) + 1);
7513   if (CONSP (target))
7514     target = XCAR (target);
7515
7516   chain = ((EQ (operation, Qinsert_file_contents)
7517             || EQ (operation, Qwrite_region))
7518            ? Vfile_coding_system_alist
7519            : (EQ (operation, Qopen_network_stream)
7520               ? Vnetwork_coding_system_alist
7521               : Vprocess_coding_system_alist));
7522   if (NILP (chain))
7523     return Qnil;
7524
7525   for (; CONSP (chain); chain = XCDR (chain))
7526     {
7527       Lisp_Object elt;
7528       elt = XCAR (chain);
7529
7530       if (CONSP (elt)
7531           && ((STRINGP (target)
7532                && STRINGP (XCAR (elt))
7533                && fast_string_match (XCAR (elt), target) >= 0)
7534               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7535         {
7536           val = XCDR (elt);
7537           /* Here, if VAL is both a valid coding system and a valid
7538              function symbol, we return VAL as a coding system.  */
7539           if (CONSP (val))
7540             return val;
7541           if (! SYMBOLP (val))
7542             return Qnil;
7543           if (! NILP (Fcoding_system_p (val)))
7544             return Fcons (val, val);
7545           if (! NILP (Ffboundp (val)))
7546             {
7547               /* We use call1 rather than safe_call1
7548                  so as to get bug reports about functions called here
7549                  which don't handle the current interface.  */
7550               val = call1 (val, Flist (nargs, args));
7551               if (CONSP (val))
7552                 return val;
7553               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7554                 return Fcons (val, val);
7555             }
7556           return Qnil;
7557         }
7558     }
7559   return Qnil;
7560 }
7561
7562 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7563        Supdate_coding_systems_internal, 0, 0, 0,
7564        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7565 When values of any coding categories are changed, you must
7566 call this function.  */)
7567      ()
7568 {
7569   int i;
7570
7571   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7572     {
7573       Lisp_Object val;
7574
7575       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7576       if (!NILP (val))
7577         {
7578           if (! coding_system_table[i])
7579             coding_system_table[i] = ((struct coding_system *)
7580                                       xmalloc (sizeof (struct coding_system)));
7581           setup_coding_system (val, coding_system_table[i]);
7582         }
7583       else if (coding_system_table[i])
7584         {
7585           xfree (coding_system_table[i]);
7586           coding_system_table[i] = NULL;
7587         }
7588     }
7589
7590   return Qnil;
7591 }
7592
7593 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7594        Sset_coding_priority_internal, 0, 0, 0,
7595        doc: /* Update internal database for the current value of `coding-category-list'.
7596 This function is internal use only.  */)
7597      ()
7598 {
7599   int i = 0, idx;
7600   Lisp_Object val;
7601
7602   val = Vcoding_category_list;
7603
7604   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7605     {
7606       if (! SYMBOLP (XCAR (val)))
7607         break;
7608       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7609       if (idx >= CODING_CATEGORY_IDX_MAX)
7610         break;
7611       coding_priorities[i++] = (1 << idx);
7612       val = XCDR (val);
7613     }
7614   /* If coding-category-list is valid and contains all coding
7615      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7616      the following code saves Emacs from crashing.  */
7617   while (i < CODING_CATEGORY_IDX_MAX)
7618     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7619
7620   return Qnil;
7621 }
7622
7623 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7624        Sdefine_coding_system_internal, 1, 1, 0,
7625        doc: /* Register CODING-SYSTEM as a base coding system.
7626 This function is internal use only.  */)
7627      (coding_system)
7628      Lisp_Object coding_system;
7629 {
7630   Lisp_Object safe_chars, slot;
7631
7632   if (NILP (Fcheck_coding_system (coding_system)))
7633     xsignal1 (Qcoding_system_error, coding_system);
7634
7635   safe_chars = coding_safe_chars (coding_system);
7636   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7637     error ("No valid safe-chars property for %s",
7638            SDATA (SYMBOL_NAME (coding_system)));
7639
7640   if (EQ (safe_chars, Qt))
7641     {
7642       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7643         XSETCAR (Vcoding_system_safe_chars,
7644                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7645     }
7646   else
7647     {
7648       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7649       if (NILP (slot))
7650         XSETCDR (Vcoding_system_safe_chars,
7651                  nconc2 (XCDR (Vcoding_system_safe_chars),
7652                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7653       else
7654         XSETCDR (slot, safe_chars);
7655     }
7656   return Qnil;
7657 }
7658
7659 #endif /* emacs */
7660
7661 \f
7662 /*** 9. Post-amble ***/
7663
7664 void
7665 init_coding_once ()
7666 {
       /* Fill the byte-classification tables emacs_code_class and
          iso_code_class, set up the four file-level coding structures with
          a nil coding system, clear coding_system_table, build
          ascii_skip_code, choose system_eol_type for the platform, and
          reset inhibit_pre_post_conversion.  */
7667   int i;
7668
7669   /* Emacs' internal format specific initialize routine.  */
       /* 0x00..0x20 (inclusive, so the space character too) start out as
          control codes; LF and CR are then given their own classes.  */
7670   for (i = 0; i <= 0x20; i++)
7671     emacs_code_class[i] = EMACS_control_code;
7672   emacs_code_class[0x0A] = EMACS_linefeed_code;
7673   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7674   for (i = 0x21 ; i < 0x7F; i++)
7675     emacs_code_class[i] = EMACS_ascii_code;
7676   emacs_code_class[0x7F] = EMACS_control_code;
       /* 0x80..0xFE default to invalid; the private leading codes are
          overridden right after.
          NOTE(review): index 0xFF is not assigned in this function, so it
          keeps whatever the table held before -- confirm this is
          intended.  */
7677   for (i = 0x80; i < 0xFF; i++)
7678     emacs_code_class[i] = EMACS_invalid_code;
7679   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7680   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7681   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7682   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7683
7684   /* ISO2022 specific initialize routine.  */
       /* Assign a default class to each byte range first.  The range
          loops skip 0x20, 0x7F, 0xA0 and 0xFF, which are set explicitly
          below.  */
7685   for (i = 0; i < 0x20; i++)
7686     iso_code_class[i] = ISO_control_0;
7687   for (i = 0x21; i < 0x7F; i++)
7688     iso_code_class[i] = ISO_graphic_plane_0;
7689   for (i = 0x80; i < 0xA0; i++)
7690     iso_code_class[i] = ISO_control_1;
7691   for (i = 0xA1; i < 0xFF; i++)
7692     iso_code_class[i] = ISO_graphic_plane_1;
7693   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7694   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
       /* Individual codes get their own classes.  These assignments come
          after the range loops so that they take precedence over any
          range default.  */
7695   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7696   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7697   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7698   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7699   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7700   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7701   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7702   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7703
       /* Set up each of these coding structures with a nil coding
          system.  */
7704   setup_coding_system (Qnil, &keyboard_coding);
7705   setup_coding_system (Qnil, &terminal_coding);
7706   setup_coding_system (Qnil, &safe_terminal_coding);
7707   setup_coding_system (Qnil, &default_buffer_file_coding);
7708
7709   bzero (coding_system_table, sizeof coding_system_table);
7710
       /* ascii_skip_code is 1 for the codes 0..127 and 0 for every other
          entry.  */
7711   bzero (ascii_skip_code, sizeof ascii_skip_code);
7712   for (i = 0; i < 128; i++)
7713     ascii_skip_code[i] = 1;
7714
       /* The system end-of-line type is CRLF when MSDOS or WINDOWSNT is
          defined, LF otherwise.  */
7715 #if defined (MSDOS) || defined (WINDOWSNT)
7716   system_eol_type = CODING_EOL_CRLF;
7717 #else
7718   system_eol_type = CODING_EOL_LF;
7719 #endif
7720
7721   inhibit_pre_post_conversion = 0;
7722 }
7723
7724 #ifdef emacs
7725
7726 void
7727 syms_of_coding ()
7728 {
       /* Set up the Lisp-visible state of this file: intern and staticpro
          the symbols used here, attach the `target-idx' and
          `char-table-extra-slots' properties, build
          Vcoding_category_table, register the primitives with defsubr,
          and define the Lisp variables together with their initial
          values.  */
7729   staticpro (&Vcode_conversion_workbuf_name);
7730   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7731
7732   Qtarget_idx = intern ("target-idx");
7733   staticpro (&Qtarget_idx);
7734
7735   Qcoding_system_history = intern ("coding-system-history");
7736   staticpro (&Qcoding_system_history);
7737   Fset (Qcoding_system_history, Qnil);
7738
       /* The `target-idx' property of each I/O primitive below is the
          zero-based position of its target argument.  */
7739   /* Target FILENAME is the first argument.  */
7740   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7741   /* Target FILENAME is the third argument.  */
7742   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7743
7744   Qcall_process = intern ("call-process");
7745   staticpro (&Qcall_process);
7746   /* Target PROGRAM is the first argument.  */
7747   Fput (Qcall_process, Qtarget_idx, make_number (0));
7748
7749   Qcall_process_region = intern ("call-process-region");
7750   staticpro (&Qcall_process_region);
7751   /* Target PROGRAM is the third argument.  */
7752   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7753
7754   Qstart_process = intern ("start-process");
7755   staticpro (&Qstart_process);
7756   /* Target PROGRAM is the third argument.  */
7757   Fput (Qstart_process, Qtarget_idx, make_number (2));
7758
7759   Qopen_network_stream = intern ("open-network-stream");
7760   staticpro (&Qopen_network_stream);
7761   /* Target SERVICE is the fourth argument.  */
7762   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7763
7764   Qcoding_system = intern ("coding-system");
7765   staticpro (&Qcoding_system);
7766
7767   Qeol_type = intern ("eol-type");
7768   staticpro (&Qeol_type);
7769
7770   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7771   staticpro (&Qbuffer_file_coding_system);
7772
7773   Qpost_read_conversion = intern ("post-read-conversion");
7774   staticpro (&Qpost_read_conversion);
7775
7776   Qpre_write_conversion = intern ("pre-write-conversion");
7777   staticpro (&Qpre_write_conversion);
7778
7779   Qno_conversion = intern ("no-conversion");
7780   staticpro (&Qno_conversion);
7781
7782   Qundecided = intern ("undecided");
7783   staticpro (&Qundecided);
7784
7785   Qcoding_system_p = intern ("coding-system-p");
7786   staticpro (&Qcoding_system_p);
7787
7788   Qcoding_system_error = intern ("coding-system-error");
7789   staticpro (&Qcoding_system_error);
7790
       /* Give `coding-system-error' its condition list and its message.  */
7791   Fput (Qcoding_system_error, Qerror_conditions,
7792         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7793   Fput (Qcoding_system_error, Qerror_message,
7794         build_string ("Invalid coding system"));
7795
7796   Qcoding_category = intern ("coding-category");
7797   staticpro (&Qcoding_category);
7798   Qcoding_category_index = intern ("coding-category-index");
7799   staticpro (&Qcoding_category_index);
7800
       /* One slot per coding category; the loop below fills each slot with
          the interned category symbol and records the slot index on that
          symbol under the `coding-category-index' property.  */
7801   Vcoding_category_table
7802     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7803   staticpro (&Vcoding_category_table);
7804   {
7805     int i;
7806     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7807       {
7808         XVECTOR (Vcoding_category_table)->contents[i]
7809           = intern (coding_category_name[i]);
7810         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7811               Qcoding_category_index, make_number (i));
7812       }
7813   }
7814
       /* Starts out as (nil).  Fdefine_coding_system_internal keeps a list
          of coding systems in the car and an alist of
          (CODING-SYSTEM . SAFE-CHARS) in the cdr.  */
7815   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7816   staticpro (&Vcoding_system_safe_chars);
7817
7818   Qtranslation_table = intern ("translation-table");
7819   staticpro (&Qtranslation_table);
       /* NOTE(review): Qchar_table_extra_slots is used here, before the
          "intern this now" assignment further down in this function.
          This presumably relies on the symbol having been interned
          elsewhere already -- confirm.  */
7820   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7821
7822   Qtranslation_table_id = intern ("translation-table-id");
7823   staticpro (&Qtranslation_table_id);
7824
7825   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7826   staticpro (&Qtranslation_table_for_decode);
7827
7828   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7829   staticpro (&Qtranslation_table_for_encode);
7830
7831   Qsafe_chars = intern ("safe-chars");
7832   staticpro (&Qsafe_chars);
7833
7834   Qchar_coding_system = intern ("char-coding-system");
7835   staticpro (&Qchar_coding_system);
7836
7837   /* Intern this now in case it isn't already done.
7838      Setting this variable twice is harmless.
7839      But don't staticpro it here--that is done in alloc.c.  */
7840   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7841   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7842   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7843
7844   Qvalid_codes = intern ("valid-codes");
7845   staticpro (&Qvalid_codes);
7846
7847   Qascii_incompatible = intern ("ascii-incompatible");
7848   staticpro (&Qascii_incompatible);
7849
7850   Qemacs_mule = intern ("emacs-mule");
7851   staticpro (&Qemacs_mule);
7852
7853   Qraw_text = intern ("raw-text");
7854   staticpro (&Qraw_text);
7855
7856   Qutf_8 = intern ("utf-8");
7857   staticpro (&Qutf_8);
7858
7859   Qcoding_system_define_form = intern ("coding-system-define-form");
7860   staticpro (&Qcoding_system_define_form);
7861
       /* Register the subr objects of this file's primitives.  */
7862   defsubr (&Scoding_system_p);
7863   defsubr (&Sread_coding_system);
7864   defsubr (&Sread_non_nil_coding_system);
7865   defsubr (&Scheck_coding_system);
7866   defsubr (&Sdetect_coding_region);
7867   defsubr (&Sdetect_coding_string);
7868   defsubr (&Sfind_coding_systems_region_internal);
7869   defsubr (&Sunencodable_char_position);
7870   defsubr (&Sdecode_coding_region);
7871   defsubr (&Sencode_coding_region);
7872   defsubr (&Sdecode_coding_string);
7873   defsubr (&Sencode_coding_string);
7874   defsubr (&Sdecode_sjis_char);
7875   defsubr (&Sencode_sjis_char);
7876   defsubr (&Sdecode_big5_char);
7877   defsubr (&Sencode_big5_char);
7878   defsubr (&Sset_terminal_coding_system_internal);
7879   defsubr (&Sset_safe_terminal_coding_system_internal);
7880   defsubr (&Sterminal_coding_system);
7881   defsubr (&Sset_keyboard_coding_system_internal);
7882   defsubr (&Skeyboard_coding_system);
7883   defsubr (&Sfind_operation_coding_system);
7884   defsubr (&Supdate_coding_systems_internal);
7885   defsubr (&Sset_coding_priority_internal);
7886   defsubr (&Sdefine_coding_system_internal);
7887
7888   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7889                doc: /* List of coding systems.
7890
7891 Do not alter the value of this variable manually.  This variable should be
7892 updated by the functions `make-coding-system' and
7893 `define-coding-system-alias'.  */);
7894   Vcoding_system_list = Qnil;
7895
7896   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7897                doc: /* Alist of coding system names.
7898 Each element is one element list of coding system name.
7899 This variable is given to `completing-read' as TABLE argument.
7900
7901 Do not alter the value of this variable manually.  This variable should be
7902 updated by the functions `make-coding-system' and
7903 `define-coding-system-alias'.  */);
7904   Vcoding_system_alist = Qnil;
7905
7906   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7907                doc: /* List of coding-categories (symbols) ordered by priority.
7908
7909 On detecting a coding system, Emacs tries code detection algorithms
7910 associated with each coding-category one by one in this order.  When
7911 one algorithm agrees with a byte sequence of source text, the coding
7912 system bound to the corresponding coding-category is selected.
7913
7914 Don't modify this variable directly, but use `set-coding-priority'.  */);
7915   {
7916     int i;
7917
         /* The initial list holds the category symbols in table order; it
            is consed up from the last index down to the first so that the
            result comes out in ascending index order.  */
7918     Vcoding_category_list = Qnil;
7919     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7920       Vcoding_category_list
7921         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7922                  Vcoding_category_list);
7923   }
7924
7925   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7926                doc: /* Specify the coding system for read operations.
7927 It is useful to bind this variable with `let', but do not set it globally.
7928 If the value is a coding system, it is used for decoding on read operation.
7929 If not, an appropriate element is used from one of the coding system alists:
7930 There are three such tables, `file-coding-system-alist',
7931 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7932   Vcoding_system_for_read = Qnil;
7933
7934   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7935                doc: /* Specify the coding system for write operations.
7936 Programs bind this variable with `let', but you should not set it globally.
7937 If the value is a coding system, it is used for encoding of output,
7938 when writing it to a file and when sending it to a file or subprocess.
7939
7940 If this does not specify a coding system, an appropriate element
7941 is used from one of the coding system alists:
7942 There are three such tables, `file-coding-system-alist',
7943 `process-coding-system-alist', and `network-coding-system-alist'.
7944 For output to files, if the above procedure does not specify a coding system,
7945 the value of `buffer-file-coding-system' is used.  */);
7946   Vcoding_system_for_write = Qnil;
7947
7948   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7949                doc: /* Coding system used in the latest file or process I/O.
7950 Also set by `encode-coding-region', `decode-coding-region',
7951 `encode-coding-string' and `decode-coding-string'.  */);
7952   Vlast_coding_system_used = Qnil;
7953
7954   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7955                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7956 See info node `Coding Systems' and info node `Text and Binary' concerning
7957 such conversion.  */);
7958   inhibit_eol_conversion = 0;
7959
7960   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7961                doc: /* Non-nil means process buffer inherits coding system of process output.
7962 Bind it to t if the process output is to be treated as if it were a file
7963 read from some filesystem.  */);
7964   inherit_process_coding_system = 0;
7965
7966   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7967                doc: /* Alist to decide a coding system to use for a file I/O operation.
7968 The format is ((PATTERN . VAL) ...),
7969 where PATTERN is a regular expression matching a file name,
7970 VAL is a coding system, a cons of coding systems, or a function symbol.
7971 If VAL is a coding system, it is used for both decoding and encoding
7972 the file contents.
7973 If VAL is a cons of coding systems, the car part is used for decoding,
7974 and the cdr part is used for encoding.
7975 If VAL is a function symbol, the function must return a coding system
7976 or a cons of coding systems which are used as above.  The function is
7977 called with an argument that is a list of the arguments with which
7978 `find-operation-coding-system' was called.
7979
7980 See also the function `find-operation-coding-system'
7981 and the variable `auto-coding-alist'.  */);
7982   Vfile_coding_system_alist = Qnil;
7983
7984   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7985     doc: /* Alist to decide a coding system to use for a process I/O operation.
7986 The format is ((PATTERN . VAL) ...),
7987 where PATTERN is a regular expression matching a program name,
7988 VAL is a coding system, a cons of coding systems, or a function symbol.
7989 If VAL is a coding system, it is used for both decoding what is received
7990 from the program and encoding what is sent to the program.
7991 If VAL is a cons of coding systems, the car part is used for decoding,
7992 and the cdr part is used for encoding.
7993 If VAL is a function symbol, the function must return a coding system
7994 or a cons of coding systems which are used as above.
7995
7996 See also the function `find-operation-coding-system'.  */);
7997   Vprocess_coding_system_alist = Qnil;
7998
7999   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8000     doc: /* Alist to decide a coding system to use for a network I/O operation.
8001 The format is ((PATTERN . VAL) ...),
8002 where PATTERN is a regular expression matching a network service name
8003 or is a port number to connect to,
8004 VAL is a coding system, a cons of coding systems, or a function symbol.
8005 If VAL is a coding system, it is used for both decoding what is received
8006 from the network stream and encoding what is sent to the network stream.
8007 If VAL is a cons of coding systems, the car part is used for decoding,
8008 and the cdr part is used for encoding.
8009 If VAL is a function symbol, the function must return a coding system
8010 or a cons of coding systems which are used as above.
8011
8012 See also the function `find-operation-coding-system'.  */);
8013   Vnetwork_coding_system_alist = Qnil;
8014
8015   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8016                doc: /* Coding system to use with system messages.
8017 Also used for decoding keyboard input on X Window system.  */);
8018   Vlocale_coding_system = Qnil;
8019
8020   /* The eol mnemonics are reset in startup.el system-dependently.  */
8021   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8022                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8023   eol_mnemonic_unix = build_string (":");
8024
8025   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8026                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8027   eol_mnemonic_dos = build_string ("\\");
8028
8029   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8030                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8031   eol_mnemonic_mac = build_string ("/");
8032
8033   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8034                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
8035   eol_mnemonic_undecided = build_string (":");
8036
8037   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8038                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
8039   Venable_character_translation = Qt;
8040
8041   DEFVAR_LISP ("standard-translation-table-for-decode",
8042                &Vstandard_translation_table_for_decode,
8043                doc: /* Table for translating characters while decoding.  */);
8044   Vstandard_translation_table_for_decode = Qnil;
8045
8046   DEFVAR_LISP ("standard-translation-table-for-encode",
8047                &Vstandard_translation_table_for_encode,
8048                doc: /* Table for translating characters while encoding.  */);
8049   Vstandard_translation_table_for_encode = Qnil;
8050
8051   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8052                doc: /* Alist of charsets vs revision numbers.
8053 While encoding, if a charset (car part of an element) is found,
8054 designate it with the escape sequence identifying revision (cdr part of the element).  */);
8055   Vcharset_revision_alist = Qnil;
8056
8057   DEFVAR_LISP ("default-process-coding-system",
8058                &Vdefault_process_coding_system,
8059                doc: /* Cons of coding systems used for process I/O by default.
8060 The car part is used for decoding a process output,
8061 the cdr part is used for encoding a text to be sent to a process.  */);
8062   Vdefault_process_coding_system = Qnil;
8063
8064   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8065                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8066 This is a vector of length 256.
8067 If Nth element is non-nil, the existence of code N in a file
8068 \(or output of subprocess) doesn't prevent it from being detected as
8069 a coding system of ISO 2022 variant which has a flag
8070 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8071 or reading output of a subprocess.
8072 Only the 128th through 159th elements have a meaning.  */);
8073   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8074
8075   DEFVAR_LISP ("select-safe-coding-system-function",
8076                &Vselect_safe_coding_system_function,
8077                doc: /* Function to call to select safe coding system for encoding a text.
8078
8079 If set, this function is called to force a user to select a proper
8080 coding system which can encode the text in the case that a default
8081 coding system used in each operation can't encode the text.
8082
8083 The default value is `select-safe-coding-system' (which see).  */);
8084   Vselect_safe_coding_system_function = Qnil;
8085
8086   DEFVAR_BOOL ("coding-system-require-warning",
8087                &coding_system_require_warning,
8088                doc: /* Internal use only.
8089 If non-nil, on writing a file, `select-safe-coding-system-function' is
8090 called even if `coding-system-for-write' is non-nil.  The command
8091 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8092   coding_system_require_warning = 0;
8093
8094
8095   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8096                &inhibit_iso_escape_detection,
8097                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8098
8099 By default, on reading a file, Emacs tries to detect how the text is
8100 encoded.  This code detection is sensitive to escape sequences.  If
8101 the sequence is valid as ISO2022, the code is determined as one of
8102 the ISO2022 encodings, and the file is decoded by the corresponding
8103 coding system (e.g. `iso-2022-7bit').
8104
8105 However, there may be a case that you want to read escape sequences in
8106 a file as is.  In such a case, you can set this variable to non-nil.
8107 Then, as the code detection ignores any escape sequences, no file is
8108 detected as encoded in some ISO2022 encoding.  The result is that all
8109 escape sequences become visible in a buffer.
8110
8111 The default value is nil, and it is strongly recommended not to change
8112 it.  That is because many Emacs Lisp source files that contain
8113 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8114 in Emacs's distribution, and they won't be decoded correctly on
8115 reading if you suppress escape sequence detection.
8116
8117 The other way to read escape sequences in a file without decoding is
8118 to explicitly specify some coding system that doesn't use ISO2022's
8119 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
8120   inhibit_iso_escape_detection = 0;
8121
8122   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8123                doc: /* Char table for translating self-inserting characters.
8124 This is applied to the result of input methods, not their input.  See also
8125 `keyboard-translate-table'.  */);
8126     Vtranslation_table_for_input = Qnil;
8127 }
8128
8129 char *
8130 emacs_strerror (error_number)
8131      int error_number;
8132 {
       /* Return the system message for ERROR_NUMBER as given by strerror,
          after calling synchronize_system_messages_locale.  When
          Vlocale_coding_system is non-nil, the message is first converted
          with that coding system.  */
8133   char *str;
8134
8135   synchronize_system_messages_locale ();
8136   str = strerror (error_number);
8137
8138   if (! NILP (Vlocale_coding_system))
8139     {
           /* The last argument 0 presumably selects decoding (the result
              is named `dec') -- TODO confirm against
              code_convert_string_norecord.  */
8140       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8141                                                       Vlocale_coding_system,
8142                                                       0);
           /* NOTE(review): STR now points into the data of the Lisp
              string DEC, to which no reference is kept once this function
              returns.  Callers presumably have to use or copy the result
              before that string can be reclaimed -- confirm.  */
8143       str = (char *) SDATA (dec);
8144     }
8145
8146   return str;
8147 }
8148
8149 #endif /* emacs */
8150
8151 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8152    (do not change this comment) */