src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   0. General comments
  25   1. Preamble
  26   2. Emacs' internal format (emacs-mule) handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. CCL handlers
  30   6. End-of-line handlers
  31   7. C library functions
  32   8. Emacs Lisp library functions
  33   9. Post-amble
  34
  35 */
  36
  37 /*** 0. General comments ***/
  38
  39
  40 /*** GENERAL NOTE on CODING SYSTEM ***
  41
  42   Coding system is an encoding mechanism of one or more character
  43   sets.  Here's a list of coding systems which Emacs can handle.  When
  44   we say "decode", it means converting some other coding system to
  45   Emacs' internal format (emacs-mule), and when we say "encode",
  46   it means converting the coding system emacs-mule to some other
  47   coding system.
  48
  49   0. Emacs' internal format (emacs-mule)
  50
  51   Emacs itself holds a multi-lingual character in a buffer and a string
  52   in a special format.  Details are described in section 2.
  53
  54   1. ISO2022
  55
  56   The most famous coding system for multiple character sets.  X's
  57   Compound Text, various EUCs (Extended Unix Code), and coding
  58   systems used in Internet communication such as ISO-2022-JP are
  59   all variants of ISO2022.  Details are described in section 3.
  60
  61   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  62
  63   A coding system to encode character sets: ASCII, JISX0201, and
  64   JISX0208.  Widely used for PC's in Japan.  Details are described in
  65   section 4.
  66
  67   3. BIG5
  68
  69   A coding system to encode character sets: ASCII and Big5.  Widely
  70   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  71   described in section 4.  In this file, when we write "BIG5"
  72   (all uppercase), we mean the coding system, and when we write
  73   "Big5" (capitalized), we mean the character set.
  74
  75   4. Raw text
  76
  77   A coding system for a text containing random 8-bit code.  Emacs does
  78   no code conversion on such a text except for end-of-line format.
  79
  80   5. Other
  81
  82   If a user wants to read/write a text encoded in a coding system not
  83   listed above, he can supply a decoder and an encoder for it in CCL
  84   (Code Conversion Language) programs.  Emacs executes the CCL program
  85   while reading/writing.
  86
  87   Emacs represents a coding system by a Lisp symbol that has a property
  88   `coding-system'.  But, before actually using the coding system, the
  89   information about it is set in a structure of type `struct
  90   coding_system' for rapid processing.  See section 6 for more details.
  91
  92 */
  93
  94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  95
  96   How end-of-line of a text is encoded depends on a system.  For
  97   instance, Unix's format is just one byte of `line-feed' code,
  98   whereas DOS's format is two-byte sequence of `carriage-return' and
  99   `line-feed' codes.  MacOS's format is usually one byte of
 100   `carriage-return'.
 101
 102   Since text characters encoding and end-of-line encoding are
 103   independent, any coding system described above can take
 104   any format of end-of-line.  So, Emacs has information of format of
 105   end-of-line in each coding-system.  See section 6 for more details.
 106
 107 */
 108
 109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 110
 111   These functions check if a text between SRC and SRC_END is encoded
 112   in the coding system category XXX.  Each returns an integer value in
 113   which appropriate flag bits for the category XXX are set.  The flag
 114   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 115   template of these functions.  */
 116 #if 0
 117 int
 118 detect_coding_emacs_mule (src, src_end)
 119      unsigned char *src, *src_end;
 120 {
 121   ...
 122 }
 123 #endif
 124
 125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 126
 127   These functions decode SRC_BYTES length of unibyte text at SOURCE
 128   encoded in CODING to Emacs' internal format.  The resulting
 129   multibyte text goes to a place pointed to by DESTINATION, the length
 130   of which should not exceed DST_BYTES.
 131
 132   These functions set the information of original and decoded texts in
 133   the members produced, produced_char, consumed, and consumed_char of
 134   the structure *CODING.  They also set the member result to one of
 135   CODING_FINISH_XXX indicating how the decoding finished.
 136
 137   DST_BYTES zero means that the source area and destination area
 138   overlap, which means that we can produce decoded text until it
 139   reaches the head of the not-yet-decoded source text.
 140
 141   Below is a template of these functions.  */
 142 #if 0
 143 static void
 144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148 {
 149   ...
 150 }
 151 #endif
 152
 153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 154
 155   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 156   internal multibyte format to CODING.  The resulting unibyte text
 157   goes to a place pointed to by DESTINATION, the length of which
 158   should not exceed DST_BYTES.
 159
 160   These functions set the information of original and encoded texts in
 161   the members produced, produced_char, consumed, and consumed_char of
 162   the structure *CODING.  They also set the member result to one of
 163   CODING_FINISH_XXX indicating how the encoding finished.
 164
 165   DST_BYTES zero means that the source area and destination area
 166   overlap, which means that we can produce encoded text until it
 167   reaches the head of the not-yet-encoded source text.
 168
 169   Below is a template of these functions.  */
 170 #if 0
 171 static void
 172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 173      struct coding_system *coding;
 174      unsigned char *source, *destination;
 175      int src_bytes, dst_bytes;
 176 {
 177   ...
 178 }
 179 #endif
 180
 181 /*** COMMONLY USED MACROS ***/
 182
 183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 184    get one and two bytes from the source text respectively.  If there
 185    are not enough bytes in the source, they set coding->result to
 186    CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.
 187    The caller should set variables `coding', `src' and `src_end' in
 188    advance.  These macros are called from decoding routines
 189    `decode_coding_XXX', thus the source text is assumed to be unibyte.  */
 190
 191 #define ONE_MORE_BYTE(c1)                                       \
 192   do {                                                          \
 193     if (src >= src_end)                                         \
 194       {                                                         \
 195         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 196         goto label_end_of_loop;                                 \
 197       }                                                         \
 198     c1 = *src++;                                                \
 199   } while (0)
 200
 201 #define TWO_MORE_BYTES(c1, c2)                                  \
 202   do {                                                          \
 203     if (src + 1 >= src_end)                                     \
 204       {                                                         \
 205         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 206         goto label_end_of_loop;                                 \
 207       }                                                         \
 208     c1 = *src++;                                                \
 209     c2 = *src++;                                                \
 210   } while (0)
 211
 212
 213 /* Set C to the next character at the source text pointed by `src'
 214    and advance `src' past it.  If no source is left, set
 215    coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump to
 216    `label_end_of_loop'.  The caller should set variables `coding',
 217    `src', `src_end', and `translation_table' in advance; C is passed
 218    through translate_char if `translation_table' is non-nil.  Used in
 219    encoding routines `encode_coding_XXX', so the source is assumed to
 220    be multibyte except for 8-bit characters, which are in multibyte
 221    form if coding->src_multibyte is nonzero, else single bytes.  */
 222
 223 #define ONE_MORE_CHAR(c)                                        \
 224   do {                                                          \
 225     int len = src_end - src;                                    \
 226     int bytes;                                                  \
 227     if (len <= 0)                                               \
 228       {                                                         \
 229         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 230         goto label_end_of_loop;                                 \
 231       }                                                         \
 232     if (coding->src_multibyte                                   \
 233         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 234       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 235     else                                                        \
 236       c = *src, bytes = 1;                                      \
 237     if (!NILP (translation_table))                              \
 238       c = translate_char (translation_table, c, -1, 0, 0);      \
 239     src += bytes;                                               \
 240   } while (0)
 241
 242
 243 /* Produce a multibyte form of character C to `dst'.  If there's not
 244    enough space at `dst', set coding->result to
 245    CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.
 246
 247    If we are now in the middle of a composition sequence, the decoded
 248    character may be ALTCHAR (for the current composition).  In that
 249    case, the character goes to coding->cmp_data->data instead of `dst'.
 250
 251    This macro is used in decoding routines.  */
 252
 253 #define EMIT_CHAR(c)                                                    \
 254   do {                                                                  \
 255     if (! COMPOSING_P (coding)                                          \
 256         || coding->composing == COMPOSITION_RELATIVE                    \
 257         || coding->composing == COMPOSITION_WITH_RULE)                  \
 258       {                                                                 \
 259         int bytes = CHAR_BYTES (c);                                     \
 260         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 261           {                                                             \
 262             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 263             goto label_end_of_loop;                                     \
 264           }                                                             \
 265         dst += CHAR_STRING (c, dst);                                    \
 266         coding->produced_char++;                                        \
 267       }                                                                 \
 268                                                                         \
 269     if (COMPOSING_P (coding)                                            \
 270         && coding->composing != COMPOSITION_RELATIVE)                   \
 271       {                                                                 \
 272         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 273         coding->composition_rule_follows                                \
 274           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 275       }                                                                 \
 276   } while (0)
 277
 278 /* Store the byte C at `dst'.  If there's no room before the limit
        (`dst_end', or `src' when dst_bytes is zero), set coding->result
        to CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.  */
 279 #define EMIT_ONE_BYTE(c)                                        \
 280   do {                                                          \
 281     if (dst >= (dst_bytes ? dst_end : src))                     \
 282       {                                                         \
 283         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 284         goto label_end_of_loop;                                 \
 285       }                                                         \
 286     *dst++ = c;                                                 \
 287   } while (0)
 288 /* Store the bytes C1 and C2 at `dst'.  If both don't fit, store
        neither, set coding->result to CODING_FINISH_INSUFFICIENT_DST and
        jump to `label_end_of_loop'.  */
 289 #define EMIT_TWO_BYTES(c1, c2)                                  \
 290   do {                                                          \
 291     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 292       {                                                         \
 293         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 294         goto label_end_of_loop;                                 \
 295       }                                                         \
 296     *dst++ = c1, *dst++ = c2;                                   \
 297   } while (0)
 298 /* Copy the bytes from FROM up to (not including) TO to `dst',
        advancing FROM, which must be a modifiable pointer.  If they
        don't all fit, copy nothing, set coding->result to
        CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.  */
 299 #define EMIT_BYTES(from, to)                                    \
 300   do {                                                          \
 301     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     while (from < to)                                           \
 307       *dst++ = *from++;                                         \
 308   } while (0)
 309
 310 \f
 311 /*** 1. Preamble ***/
 312
 313 #ifdef emacs
 314 #include <config.h>
 315 #endif
 316
 317 #include <stdio.h>
 318
 319 #ifdef emacs
 320
 321 #include "lisp.h"
 322 #include "buffer.h"
 323 #include "charset.h"
 324 #include "composite.h"
 325 #include "ccl.h"
 326 #include "coding.h"
 327 #include "window.h"
 328
 329 #else  /* not emacs */
 330
 331 #include "mulelib.h"
 332
 333 #endif /* not emacs */
 334
  335 Lisp_Object Qcoding_system, Qeol_type; /* Qcoding_system: property read by coding_safe_chars.  */
  336 Lisp_Object Qbuffer_file_coding_system;
  337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  338 Lisp_Object Qno_conversion, Qundecided;
  339 Lisp_Object Qcoding_system_history;
  340 Lisp_Object Qsafe_chars;	/* Key coding_safe_chars looks up in a plist.  */
  341 Lisp_Object Qvalid_codes;
  342
  343 extern Lisp_Object Qinsert_file_contents, Qwrite_region; /* Not defined here.  */
  344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
  345 Lisp_Object Qstart_process, Qopen_network_stream;
  346 Lisp_Object Qtarget_idx;
  347
  348 Lisp_Object Vselect_safe_coding_system_function;
  349
  350 /* Mnemonic strings for the Unix, DOS and Mac formats of end-of-line.  */
  351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
  352 /* Mnemonic string to indicate that the format of end-of-line is not
  353    yet decided.  */
  354 Lisp_Object eol_mnemonic_undecided;
  355
  356 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
  357    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
  358 int system_eol_type;
 359
  360 #ifdef emacs
  361
  362 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
  363
  364 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  365
  366 /* The coding systems emacs-mule and raw-text are for converting only
  367    the end-of-line format.  */
  368 Lisp_Object Qemacs_mule, Qraw_text;
  369
  370 /* Coding systems are passed between Emacs Lisp programs and C internal
  371    routines through the following three variables.  */
  372 /* Coding-system for reading files and receiving data from a process.  */
  373 Lisp_Object Vcoding_system_for_read;
  374 /* Coding-system for writing files and sending data to a process.  */
  375 Lisp_Object Vcoding_system_for_write;
  376 /* Coding-system actually used in the latest I/O.  */
  377 Lisp_Object Vlast_coding_system_used;
  378
  379 /* A vector of length 256 which contains information about special
  380    Latin codes (especially for dealing with Microsoft codes).  */
  381 Lisp_Object Vlatin_extra_code_table;
  382
  383 /* Flag to inhibit code conversion of end-of-line format.  */
  384 int inhibit_eol_conversion;
  385
  386 /* Flag to inhibit ISO2022 escape sequence detection.  */
  387 int inhibit_iso_escape_detection;
  388
  389 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
  390 int inherit_process_coding_system;
  391
  392 /* Coding system to be used to encode text for terminal display.  */
  393 struct coding_system terminal_coding;
  394
  395 /* Coding system to be used to encode text for terminal display when
  396    the terminal coding system is nil.  */
  397 struct coding_system safe_terminal_coding;
  398
  399 /* Coding system of what is sent from the terminal keyboard.  */
  400 struct coding_system keyboard_coding;
  401
  402 /* Default coding system to be used to write a file.  */
  403 struct coding_system default_buffer_file_coding;
  404 /* NOTE(review): presumably per-operation coding system alists -- confirm.  */
  405 Lisp_Object Vfile_coding_system_alist;
  406 Lisp_Object Vprocess_coding_system_alist;
  407 Lisp_Object Vnetwork_coding_system_alist;
  408 /* NOTE(review): presumably the coding system for locale text -- confirm.  */
  409 Lisp_Object Vlocale_coding_system;
  410
  411 #endif /* emacs */
 412
  413 Lisp_Object Qcoding_category, Qcoding_category_index;
  414
  415 /* List of symbols `coding-category-xxx' ordered by priority.  */
  416 Lisp_Object Vcoding_category_list;
  417
  418 /* Table of coding categories (Lisp symbols).  */
  419 Lisp_Object Vcoding_category_table;
  420
  421 /* Table of the symbol name of each coding-category.
         NOTE(review): the order presumably follows the coding category
         indices declared in coding.h -- confirm before reordering.  */
  422 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  423   "coding-category-emacs-mule",
  424   "coding-category-sjis",
  425   "coding-category-iso-7",
  426   "coding-category-iso-7-tight",
  427   "coding-category-iso-8-1",
  428   "coding-category-iso-8-2",
  429   "coding-category-iso-7-else",
  430   "coding-category-iso-8-else",
  431   "coding-category-ccl",
  432   "coding-category-big5",
  433   "coding-category-utf-8",
  434   "coding-category-utf-16-be",
  435   "coding-category-utf-16-le",
  436   "coding-category-raw-text",
  437   "coding-category-binary"
  438 };
  439
  440 /* Table of pointers to the coding systems corresponding to each
  441    coding category.  */
  442 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
  443
  444 /* Table of coding category masks.  Nth element is the mask of the
  445    coding category whose priority is Nth.  */
  446 static
  447 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 448
  449 /* Flag to tell whether we look up a translation table on character
  450    code conversion.  */
  451 Lisp_Object Venable_character_translation;
  452 /* Standard translation table to look up on decoding (reading).  */
  453 Lisp_Object Vstandard_translation_table_for_decode;
  454 /* Standard translation table to look up on encoding (writing).  */
  455 Lisp_Object Vstandard_translation_table_for_encode;
  456
  457 Lisp_Object Qtranslation_table;
  458 Lisp_Object Qtranslation_table_id;
  459 Lisp_Object Qtranslation_table_for_decode;
  460 Lisp_Object Qtranslation_table_for_encode;
  461
  462 /* Alist of charsets vs revision number.  */
  463 Lisp_Object Vcharset_revision_alist;
  464
  465 /* Default coding systems used for process I/O.  */
  466 Lisp_Object Vdefault_process_coding_system;
  467
  468 /* Global flag to tell that we can't call post-read-conversion and
  469    pre-write-conversion functions.  Usually the value is zero, but it
  470    is set to 1 temporarily while such functions are running.  This is
  471    to avoid infinite recursion.  */
  472 static int inhibit_pre_post_conversion;
  473
  474 /* Char-table containing the safe coding systems of each character.  */
  475 Lisp_Object Vchar_coding_system_table;
  476 Lisp_Object Qchar_coding_system;
  477
 478 /* Return `safe-chars' property of coding system CODING.  Don't check
 479    validity of CODING.  */
 480
 481 Lisp_Object
 482 coding_safe_chars (coding)
 483      struct coding_system *coding;
 484 {
 485   Lisp_Object coding_spec, plist, safe_chars;
 486
 487   coding_spec = Fget (coding->symbol, Qcoding_system);
 488   plist = XVECTOR (coding_spec)->contents[3];
 489   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 490   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 491 }
 492
 493 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 494   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 495
 496 \f
 497 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 498
 499 /* Emacs' internal format for encoding multiple character sets is a
 500    kind of multi-byte encoding, i.e. characters are encoded by
 501    variable-length sequences of one-byte codes.
 502
 503    ASCII characters and control characters (e.g. `tab', `newline') are
 504    represented by one-byte sequences which are their ASCII codes, in
 505    the range 0x00 through 0x7F.
 506
 507    8-bit characters of the range 0x80..0x9F are represented by
 508    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 509    code + 0x20).
 510
 511    8-bit characters of the range 0xA0..0xFF are represented by
 512    one-byte sequences which are their 8-bit code.
 513
 514    The other characters are represented by a sequence of `base
 515    leading-code', optional `extended leading-code', and one or two
 516    `position-code's.  The length of the sequence is determined by the
 517    base leading-code.  Leading-code takes the range 0x80 through 0x9F,
 518    whereas extended leading-code and position-code take the range 0xA0
 519    through 0xFF.  See `charset.h' for more details about leading-code
 520    and position-code.
 521
 522    --- CODE RANGE of Emacs' internal format ---
 523    character set        range
 524    -------------        -----
 525    ascii                0x00..0x7F
 526    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  527    eight-bit-graphic    0xA0..0xFF
 528    ELSE                 0x81..0x9F + [0xA0..0xFF]+
 529    ---------------------------------------------
 530
 531   */
  532 /* NOTE(review): presumably the code class of each byte value -- confirm.  */
  533 enum emacs_code_class_type emacs_code_class[256];
 534
 535 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 536    Check if a text is encoded in Emacs' internal format.  If it is,
 537    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 538
 539 int
 540 detect_coding_emacs_mule (src, src_end)
 541       unsigned char *src, *src_end;
 542 {
 543   unsigned char c;
 544   int composing = 0;
 545   /* Dummy for ONE_MORE_BYTE.  */
 546   struct coding_system dummy_coding;
 547   struct coding_system *coding = &dummy_coding;
 548
 549   while (1)
 550     {
 551       ONE_MORE_BYTE (c);
 552
 553       if (composing)
 554         {
 555           if (c < 0xA0)
 556             composing = 0;
 557           else if (c == 0xA0)
 558             {
 559               ONE_MORE_BYTE (c);
 560               c &= 0x7F;
 561             }
 562           else
 563             c -= 0x20;
 564         }
 565
 566       if (c < 0x20)
 567         {
 568           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 569             return 0;
 570         }
 571       else if (c >= 0x80 && c < 0xA0)
 572         {
 573           if (c == 0x80)
 574             /* Old leading code for a composite character.  */
 575             composing = 1;
 576           else
 577             {
 578               unsigned char *src_base = src - 1;
 579               int bytes;
 580
 581               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 582                                                bytes))
 583                 return 0;
 584               src = src_base + bytes;
 585             }
 586         }
 587     }
 588  label_end_of_loop:
 589   return CODING_CATEGORY_MASK_EMACS_MULE;
 590 }
 591
 592
 593 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 594
 595 static void
 596 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 597      struct coding_system *coding;
 598      unsigned char *source, *destination;
 599      int src_bytes, dst_bytes;
 600 {
 601   unsigned char *src = source;
 602   unsigned char *src_end = source + src_bytes;
 603   unsigned char *dst = destination;
 604   unsigned char *dst_end = destination + dst_bytes;
 605   /* SRC_BASE remembers the start position in source in each loop.
 606      The loop will be exited when there's not enough source code, or
 607      when there's not enough destination area to produce a
 608      character.  */
 609   unsigned char *src_base;
 610
 611   coding->produced_char = 0;
 612   while ((src_base = src) < src_end)
 613     {
 614       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 615       int bytes;
 616
 617       if (*src == '\r')
 618         {
 619           int c = *src++;
 620
 621           if (coding->eol_type == CODING_EOL_CR)
 622             c = '\n';
 623           else if (coding->eol_type == CODING_EOL_CRLF)
 624             {
 625               ONE_MORE_BYTE (c);
 626               if (c != '\n')
 627                 {
 628                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 629                     {
 630                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
 631                       goto label_end_of_loop;
 632                     }
 633                   src--;
 634                   c = '\r';
 635                 }
 636             }
 637           *dst++ = c;
 638           coding->produced_char++;
 639           continue;
 640         }
 641       else if (*src == '\n')
 642         {
 643           if ((coding->eol_type == CODING_EOL_CR
 644                || coding->eol_type == CODING_EOL_CRLF)
 645               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 646             {
 647               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 648               goto label_end_of_loop;
 649             }
 650           *dst++ = *src++;
 651           coding->produced_char++;
 652           continue;
 653         }
 654       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 655         {
 656           p = src;
 657           src += bytes;
 658         }
 659       else
 660         {
 661           bytes = CHAR_STRING (*src, tmp);
 662           p = tmp;
 663           src++;
 664         }
 665       if (dst + bytes >= (dst_bytes ? dst_end : src))
 666         {
 667           coding->result = CODING_FINISH_INSUFFICIENT_DST;
 668           break;
 669         }
 670       while (bytes--) *dst++ = *p++;
 671       coding->produced_char++;
 672     }
 673  label_end_of_loop:
 674   coding->consumed = coding->consumed_char = src_base - source;
 675   coding->produced = dst - destination;
 676 }
  677 /* Encoding to emacs-mule just calls encode_eol with the same arguments.  */
  678 #define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
  679   encode_eol (coding, source, destination, src_bytes, dst_bytes)
 680
 681
 682 \f
 683 /*** 3. ISO2022 handlers ***/
 684
 685 /* The following note describes the coding system ISO2022 briefly.
 686    Since the intention of this note is to help understand the
 687    functions in this file, some parts are NOT ACCURATE or OVERLY
 688    SIMPLIFIED.  For thorough understanding, please refer to the
 689    original document of ISO2022.
 690
 691    ISO2022 provides many mechanisms to encode several character sets
  692    in 7-bit and 8-bit environments.  For 7-bit environments, all text
  693    is encoded using bytes less than 128.  This may make the encoded
  694    text a little bit longer, but the text passes more easily through
  695    several gateways, some of which strip off MSB (Most Significant Bit).
 696
 697    There are two kinds of character sets: control character set and
 698    graphic character set.  The former contains control characters such
 699    as `newline' and `escape' to provide control functions (control
 700    functions are also provided by escape sequences).  The latter
 701    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 702    two control character sets and many graphic character sets.
 703
 704    Graphic character sets are classified into one of the following
 705    four classes, according to the number of bytes (DIMENSION) and
 706    number of characters in one dimension (CHARS) of the set:
 707    - DIMENSION1_CHARS94
 708    - DIMENSION1_CHARS96
 709    - DIMENSION2_CHARS94
 710    - DIMENSION2_CHARS96
 711
 712    In addition, each character set is assigned an identification tag,
 713    unique for each set, called "final character" (denoted as <F>
 714    hereafter).  The <F> of each character set is decided by ECMA(*)
 715    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 716    (0x30..0x3F are for private use only).
 717
 718    Note (*): ECMA = European Computer Manufacturers Association
 719
 720    Here are examples of graphic character set [NAME(<F>)]:
 721         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 722         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 723         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 724         o DIMENSION2_CHARS96 -- none for the moment
 725
 726    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 727         C0 [0x00..0x1F] -- control character plane 0
 728         GL [0x20..0x7F] -- graphic character plane 0
 729         C1 [0x80..0x9F] -- control character plane 1
 730         GR [0xA0..0xFF] -- graphic character plane 1
 731
 732    A control character set is directly designated and invoked to C0 or
 733    C1 by an escape sequence.  The most common case is that:
 734    - ISO646's  control character set is designated/invoked to C0, and
 735    - ISO6429's control character set is designated/invoked to C1,
 736    and usually these designations/invocations are omitted in encoded
 737    text.  In a 7-bit environment, only C0 can be used, and a control
 738    character for C1 is encoded by an appropriate escape sequence to
 739    fit into the environment.  All control characters for C1 are
 740    defined to have corresponding escape sequences.
 741
 742    A graphic character set is at first designated to one of four
 743    graphic registers (G0 through G3), then these graphic registers are
 744    invoked to GL or GR.  These designations and invocations can be
 745    done independently.  The most common case is that G0 is invoked to
 746    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 747    these invocations and designations are omitted in encoded text.
 748    In a 7-bit environment, only GL can be used.
 749
 750    When a graphic character set of CHARS94 is invoked to GL, codes
 751    0x20 and 0x7F of the GL area work as control characters SPACE and
 752    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 753    be used.
 754
 755    There are two ways of invocation: locking-shift and single-shift.
 756    With locking-shift, the invocation lasts until the next different
 757    invocation, whereas with single-shift, the invocation affects the
 758    following character only and doesn't affect the locking-shift
 759    state.  Invocations are done by the following control characters or
 760    escape sequences:
 761
 762    ----------------------------------------------------------------------
 763    abbrev  function                  cntrl escape seq   description
 764    ----------------------------------------------------------------------
 765    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 766    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 767    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 768    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 769    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 770    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 771    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 772    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 773    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 774    ----------------------------------------------------------------------
 775    (*) These are not used by any known coding system.
 776
 777    Control characters for these functions are defined by macros
 778    ISO_CODE_XXX in `coding.h'.
 779
 780    Designations are done by the following escape sequences:
 781    ----------------------------------------------------------------------
 782    escape sequence      description
 783    ----------------------------------------------------------------------
 784    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 785    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 786    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 787    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 788    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 789    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 790    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 791    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 792    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 793    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 794    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 795    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 796    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 797    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 798    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 799    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 800    ----------------------------------------------------------------------
 801
 802    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 803    of dimension 1, chars 94, and final character <F>, etc...
 804
 805    Note (*): Although these designations are not allowed in ISO2022,
 806    Emacs accepts them on decoding, and produces them on encoding
 807    CHARS96 character sets in a coding system which is characterized as
 808    7-bit environment, non-locking-shift, and non-single-shift.
 809
 810    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 811    '(' can be omitted.  We refer to this as "short-form" hereafter.
 812
 813    Now you may notice that there are a lot of ways of encoding the
 814    same multilingual text in ISO2022.  Actually, there exist many
 815    coding systems such as Compound Text (used in X11's inter-client
 816    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 817    (used in Korean internet), and EUC (Extended UNIX Code, used in Asian
 818    localized platforms), and all of these are variants of ISO2022.
 819
 820    In addition to the above, Emacs handles two more kinds of escape
 821    sequences: ISO6429's direction specification and Emacs' private
 822    sequence for specifying character composition.
 823
 824    ISO6429's direction specification takes the following form:
 825         o CSI ']'      -- end of the current direction
 826         o CSI '0' ']'  -- end of the current direction
 827         o CSI '1' ']'  -- start of left-to-right text
 828         o CSI '2' ']'  -- start of right-to-left text
 829    The control character CSI (0x9B: control sequence introducer) is
 830    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 831
 832    Character composition specification takes the following form:
 833         o ESC '0' -- start relative composition
 834         o ESC '1' -- end composition
 835         o ESC '2' -- start rule-base composition (*)
 836         o ESC '3' -- start relative composition with alternate chars  (**)
 837         o ESC '4' -- start rule-base composition with alternate chars  (**)
 838   Since these are not standard escape sequences of any ISO standard,
 839   their use with these meanings is restricted to Emacs only.
 840
 841   (*) This form is used only in Emacs 20.5 and the older versions,
 842   but the newer versions can safely decode it.
 843   (**) This form is used only in Emacs 21.1 and the newer versions,
 844   and the older versions can't decode it.
 845
 846   Here's a list of example usages of these composition escape
 847   sequences (categorized by `enum composition_method').
 848
 849   COMPOSITION_RELATIVE:
 850         ESC 0 CHAR [ CHAR ] ESC 1
 851   COMPOSITION_WITH_RULE:
 852         ESC 2 CHAR [ RULE CHAR ] ESC 1
 853   COMPOSITION_WITH_ALTCHARS:
 854         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 855   COMPOSITION_WITH_RULE_ALTCHARS:
 856         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 857
 858 enum iso_code_class_type iso_code_class[256];
     /* The table above is indexed by a byte value (0..255); the ISO2022
        decoder dispatches on iso_code_class[BYTE].  */
 859
     /* Nonzero if the coding system registered for category IDX in
        coding_system_table exists and accepts CHARSET: either CHARSET is
        ASCII or character C is among that coding system's safe chars,
        and the coding system's requested designation for CHARSET is not
        CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.  As a side effect this
        may assign to the variable `safe_chars' of the expansion site.  */
 860 #define CHARSET_OK(idx, charset, c)                                     \
 861   (coding_system_table[idx]                                             \
 862    && (charset == CHARSET_ASCII                                         \
 863        || (safe_chars = coding_safe_chars (coding_system_table[idx]),   \
 864            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
 865    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
 866                                               charset)                  \
 867        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
 868
     /* Nonzero if the initial designation numbered 1 (presumably graphic
        register G1) of the coding system for category IDX is
        non-negative.
        NOTE(review): unlike CHARSET_OK, this does not first test
        coding_system_table[idx] for null -- confirm that is safe.  */
 869 #define SHIFT_OUT_OK(idx) \
 870   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 871
 872 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 873    Check if the text in SRC..SRC_END is encoded in ISO2022.  If it
 874    is, return an integer in which the appropriate ones of these flag bits:
 875         CODING_CATEGORY_MASK_ISO_7
 876         CODING_CATEGORY_MASK_ISO_7_TIGHT
 877         CODING_CATEGORY_MASK_ISO_8_1
 878         CODING_CATEGORY_MASK_ISO_8_2
 879         CODING_CATEGORY_MASK_ISO_7_ELSE
 880         CODING_CATEGORY_MASK_ISO_8_ELSE
 881    are set.  If a code which should never appear in ISO2022 is found,
 882    return 0.

        While scanning, MASK holds the categories that have not been ruled
        out yet and MASK_FOUND the categories for which some positive
        evidence has been seen; the value returned at the end is their
        intersection.  REG[N] tracks the charset designated to graphic
        register GN by the designation sequences seen so far (-1 if
        none; G0 starts out as ASCII).  */
 883
 884 int
 885 detect_coding_iso2022 (src, src_end)
 886      unsigned char *src, *src_end;
 887 {
 888   int mask = CODING_CATEGORY_MASK_ISO;
 889   int mask_found = 0;
 890   int reg[4], shift_out = 0, single_shifting = 0;
 891   int c, c1, charset;
 892   /* Dummy for ONE_MORE_BYTE.  That macro is defined elsewhere in this
          file; presumably it also is what jumps to label_end_of_loop
          when the source is exhausted.  */
 893   struct coding_system dummy_coding;
 894   struct coding_system *coding = &dummy_coding;
 895   Lisp_Object safe_chars;
 896
 897   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 898   while (mask && src < src_end)
 899     {
 900       ONE_MORE_BYTE (c);
         retry:
 901       switch (c)
 902         {
 903         case ISO_CODE_ESC:
 904           if (inhibit_iso_escape_detection)
 905             break;
 906           single_shifting = 0;
 907           ONE_MORE_BYTE (c);
 908           if (c >= '(' && c <= '/')
 909             {
 910               /* Designation sequence for a charset of dimension 1.  */
 911               ONE_MORE_BYTE (c1);
 912               if (c1 < ' ' || c1 >= 0x80
 913                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 914                 /* Invalid designation sequence.  Just ignore.  */
 915                 break;
 916               reg[(c - '(') % 4] = charset;
 917             }
 918           else if (c == '$')
 919             {
 920               /* Designation sequence for a charset of dimension 2.  */
 921               ONE_MORE_BYTE (c);
 922               if (c >= '@' && c <= 'B')
 923                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 924                 reg[0] = charset = iso_charset_table[1][0][c];
 925               else if (c >= '(' && c <= '/')
 926                 {
 927                   ONE_MORE_BYTE (c1);
 928                   if (c1 < ' ' || c1 >= 0x80
 929                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 930                     /* Invalid designation sequence.  Just ignore.  */
 931                     break;
 932                   reg[(c - '(') % 4] = charset;
 933                 }
 934               else
 935                 /* Invalid designation sequence.  Just ignore.  */
 936                 break;
 937             }
 938           else if (c == 'N' || c == 'O')
 939             {
 940               /* ESC <Fe> for SS2 or SS3.  */
 941               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 942               break;
 943             }
 944           else if (c >= '0' && c <= '4')
 945             {
 946               /* ESC <Fp> for start/end composition.  */
 947               mask_found |= CODING_CATEGORY_MASK_ISO;
 948               break;
 949             }
 950           else
 951             /* Invalid escape sequence.  Just ignore.  */
 952             break;
 953
 954           /* We found a valid designation sequence for CHARSET.  */
 955           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 956           c = MAKE_CHAR (charset, 0, 0);
 957           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
 958             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 959           else
 960             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 961           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
 962             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 963           else
 964             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 965           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
 966             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 967           else
 968             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 969           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
 970             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 971           else
 972             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 973           break;
 974
 975         case ISO_CODE_SO:
 976           if (inhibit_iso_escape_detection)
 977             break;
 978           single_shifting = 0;
 979           if (shift_out == 0
 980               && (reg[1] >= 0
 981                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 982                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 983             {
 984               /* Locking shift out.  */
 985               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 986               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 987             }
 988           break;
 989
 990         case ISO_CODE_SI:
 991           if (inhibit_iso_escape_detection)
 992             break;
 993           single_shifting = 0;
               /* NOTE(review): shift_out is never assigned after its
                  initialization to 0 in this function, so the condition
                  below cannot become true -- confirm the intent.  */
 994           if (shift_out == 1)
 995             {
 996               /* Locking shift in.  */
 997               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 998               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 999             }
1000           break;
1001
1002         case ISO_CODE_CSI:
1003           single_shifting = 0;
               /* Fall through.  */
1004         case ISO_CODE_SS2:
1005         case ISO_CODE_SS3:
1006           {
1007             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1008
1009             if (inhibit_iso_escape_detection)
1010               break;
                 /* NOTE(review): the coding_system_table entries below are
                    dereferenced without the null test that CHARSET_OK
                    performs -- confirm they are always set here.  */
1011             if (c != ISO_CODE_CSI)
1012               {
1013                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1014                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1015                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1016                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1017                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1018                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1019                 single_shifting = 1;
1020               }
1021             if (VECTORP (Vlatin_extra_code_table)
1022                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1023               {
1024                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1025                     & CODING_FLAG_ISO_LATIN_EXTRA)
1026                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1027                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1028                     & CODING_FLAG_ISO_LATIN_EXTRA)
1029                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1030               }
1031             mask &= newmask;
1032             mask_found |= newmask;
1033           }
1034           break;
1035
1036         default:
1037           if (c < 0x80)
1038             {
1039               single_shifting = 0;
1040               break;
1041             }
1042           else if (c < 0xA0)
1043             {
1044               single_shifting = 0;
1045               if (VECTORP (Vlatin_extra_code_table)
1046                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1047                 {
1048                   int newmask = 0;
1049
1050                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1051                       & CODING_FLAG_ISO_LATIN_EXTRA)
1052                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1053                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1054                       & CODING_FLAG_ISO_LATIN_EXTRA)
1055                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1056                   mask &= newmask;
1057                   mask_found |= newmask;
1058                 }
1059               else
1060                 return 0;
1061             }
1062           else
1063             {
1064               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1065                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1066               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1067               /* Check the length of succeeding codes of the range
1068                  0xA0..0xFF.  If the byte length is odd, we exclude
1069                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1070                  when we are not single shifting.  */
1071               if (!single_shifting
1072                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1073                 {
1074                   int i = 1;
                       /* Nonzero if the loop below has consumed the byte
                          (< 0xA0) that terminated the run.  */
                       int extra_byte_read = 0;

1075                   while (src < src_end)
1076                     {
1077                       ONE_MORE_BYTE (c);
1078                       if (c < 0xA0)
                             {
                               extra_byte_read = 1;
1079                           break;
                             }
1080                       i++;
1081                     }
1082
1083                   if (i & 1 && src < src_end)
1084                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1085                   else
1086                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;

                       if (extra_byte_read)
                         /* C is a byte that has not been examined yet.
                            Dispatch on it now so that an escape sequence,
                            a shift code, or an invalid code directly
                            following 8-bit text is not skipped.  */
                         goto retry;
1087                 }
1088             }
1089           break;
1090         }
1091     }
1092  label_end_of_loop:
1093   return (mask & mask_found);
1094 }
1095
1096 /* Decode a character whose charset is CHARSET, whose 1st position
1097    code is C1, and whose 2nd position code is C2, and return the
1098    decoded character code.  If the variable `translation_table' (taken
1099    from the expansion site) is non-nil, return the translated code.  */
1100
1101 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1102   (NILP (translation_table)                     \
1103    ? MAKE_CHAR (charset, c1, c2)                \
1104    : translate_char (translation_table, -1, charset, c1, c2))
1105
1106 /* Set designation state into CODING: designate the charset that
        DIMENSION, CHARS and FINAL_CHAR identify (via ISO_CHARSET_TABLE)
        to graphic register REG.

        The macro relies on the variables `coding' and `safe_chars' and
        on the label `label_invalid_code' of the expansion site.  It
        jumps to label_invalid_code when FINAL_CHAR is below '0' or not
        below 128, and when the charset is undefined or is neither
        requested for REG by CODING nor safe according to `safe_chars';
        in the latter case REG is remembered in
        last_invalid_designation_register.  A designation of ASCII to
        register 0 that follows an invalid designation to register 0 is
        also sent to label_invalid_code (see the comment in the body).
        Otherwise, when CODING_MODE_DIRECTION is set in coding->mode and
        the charset has a reverse charset, the reverse charset is the one
        that gets designated.  */
1107 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1108   do {                                                                     \
1109     int charset, c;                                                        \
1110                                                                            \
1111     if (final_char < '0' || final_char >= 128)                             \
1112       goto label_invalid_code;                                             \
1113     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1114                                  make_number (chars),                      \
1115                                  make_number (final_char));                \
1116     c = MAKE_CHAR (charset, 0, 0);                                         \
1117     if (charset >= 0                                                       \
1118         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1119             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1120       {                                                                    \
1121         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1122             && reg == 0                                                    \
1123             && charset == CHARSET_ASCII)                                   \
1124           {                                                                \
1125             /* We should insert this designation sequence as is so         \
1126                that it is surely written back to a file.  */               \
1127             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1128             goto label_invalid_code;                                       \
1129           }                                                                \
1130         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1131         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1132             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1133           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1134         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1135       }                                                                    \
1136     else                                                                   \
1137       {                                                                    \
1138         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1139         goto label_invalid_code;                                           \
1140       }                                                                    \
1141   } while (0)
1142
1143 /* Allocate one more memory block for recording composition data,
1144    append it to the chain hanging off CODING, and make it current.  */
1145
1146 void
1147 coding_allocate_composition_data (coding, char_offset)
1148      struct coding_system *coding;
1149      int char_offset;
1150 {
1151   struct composition_data *tail = coding->cmp_data, *block;
1152
1153   block = (struct composition_data *) xmalloc (sizeof *block);
1154   block->used = 0;
1155   block->char_offset = char_offset;
1156   block->next = NULL;
1157   block->prev = tail;
1158   if (tail != NULL)
1159     tail->next = block;
1160   coding->cmp_data_start = 0;
1161   coding->cmp_data = block;
1162 }
1163
1164 /* Record the starting position START and METHOD of one composition.
        A record of four ints is opened at the current end of CODING's
        composition data block: slot 0 is set to -1 for now, slot 1 to
        START plus the block's char_offset, and slot 3 to METHOD; slot 2
        is not written here.  coding->cmp_data_start remembers where the
        record begins.  */
1165
1166 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
1167   do {                                                          \
1168     struct composition_data *cmp_data = coding->cmp_data;       \
1169     int *data = cmp_data->data + cmp_data->used;                \
1170     coding->cmp_data_start = cmp_data->used;                    \
1171     data[0] = -1;                                               \
1172     data[1] = cmp_data->char_offset + start;                    \
1173     data[3] = (int) method;                                     \
1174     cmp_data->used += 4;                                        \
1175   } while (0)
1176
1177 /* Record the ending position END of the current composition.  In the
        record that starts at coding->cmp_data_start, slot 0 receives the
        number of ints used since the record was started and slot 2
        receives END plus the block's char_offset.  */
1178
1179 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
1180   do {                                                          \
1181     struct composition_data *cmp_data = coding->cmp_data;       \
1182     int *data = cmp_data->data + coding->cmp_data_start;        \
1183     data[0] = cmp_data->used - coding->cmp_data_start;          \
1184     data[2] = cmp_data->char_offset + end;                      \
1185   } while (0)
1186
1187 /* Record one COMPONENT (alternate character or composition rule) by
        storing it at the end of CODING's current composition data block
        and advancing the block's `used' count.
        NOTE(review): no bounds check is done here; presumably the space
        reserved by DECODE_COMPOSITION_START is what makes this safe --
        confirm.  */
1188
1189 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
1190   (coding->cmp_data->data[coding->cmp_data->used++] = component)
1191
1192 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
        C1 is the byte following ESC.  The macro relies on the variables
        `coding' and `dst' and on the label `label_end_of_loop' of the
        expansion site.

        If composition is disabled, the escape sequence itself is
        written to DST.  If no composition is in progress, a new one is
        started with the method selected by C1 ('0', '2', '3', anything
        else meaning rule-base with alternate chars).  If a composition
        is already in progress and its method is one of the two
        "alternate chars" methods, the method is switched to
        COMPOSITION_RELATIVE.  */
1193
1194 #define DECODE_COMPOSITION_START(c1)                                       \
1195   do {                                                                     \
1196     if (coding->composing == COMPOSITION_DISABLED)                         \
1197       {                                                                    \
1198         *dst++ = ISO_CODE_ESC;                                             \
1199         *dst++ = c1 & 0x7f;                                                \
1200         coding->produced_char += 2;                                        \
1201       }                                                                    \
1202     else if (!COMPOSING_P (coding))                                        \
1203       {                                                                    \
1204         /* This is surely the start of a composition.  We must be sure     \
1205            that coding->cmp_data has enough space to store the             \
1206            information about the composition.  If not, terminate the       \
1207            current decoding loop, allocate one more memory block for       \
1208            coding->cmp_data in the caller, then start the decoding         \
1209            loop again.  We can't allocate memory here directly because     \
1210            it may cause buffer/string relocation.  */                      \
1211         if (!coding->cmp_data                                              \
1212             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1213                 >= COMPOSITION_DATA_SIZE))                                 \
1214           {                                                                \
1215             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1216             goto label_end_of_loop;                                        \
1217           }                                                                \
1218         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1219                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1220                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1221                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1222         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1223                                       coding->composing);                  \
1224         coding->composition_rule_follows = 0;                              \
1225       }                                                                    \
1226     else                                                                   \
1227       {                                                                    \
1228         /* We are already handling a composition.  If the method is        \
1229            the following two, the codes following the current escape       \
1230            sequence are actual characters stored in a buffer.  */          \
1231         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1232             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1233           {                                                                \
1234             coding->composing = COMPOSITION_RELATIVE;                      \
1235             coding->composition_rule_follows = 0;                          \
1236           }                                                                \
1237       }                                                                    \
1238   } while (0)
1239
1240 /* Handle composition end sequence ESC 1.  C1 is the byte following
        ESC.  If composition is disabled, the escape sequence itself is
        written to `dst' of the expansion site; otherwise the end
        position of the current composition is recorded and
        coding->composing is reset to COMPOSITION_NO.  */
1241
1242 #define DECODE_COMPOSITION_END(c1)                                      \
1243   do {                                                                  \
1244     if (coding->composing == COMPOSITION_DISABLED)                      \
1245       {                                                                 \
1246         *dst++ = ISO_CODE_ESC;                                          \
1247         *dst++ = c1;                                                    \
1248         coding->produced_char += 2;                                     \
1249       }                                                                 \
1250     else                                                                \
1251       {                                                                 \
1252         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1253         coding->composing = COMPOSITION_NO;                             \
1254       }                                                                 \
1255   } while (0)
1256
1257 /* Decode a composition rule from the byte C1 (and maybe one more byte
1258    from SRC) and store one encoded composition rule in
1259    coding->cmp_data.

        C1 must be a modifiable variable: 32 is subtracted from it in
        place.  A resulting value below 81 is decoded as the old one-byte
        format; a value from 81 to 92 takes a second byte, which is read
        with ONE_MORE_BYTE into the variable `c2' of the expansion site.
        For any other value the rule recorded is 0.  Finally
        coding->composition_rule_follows is cleared.  */
1260
1261 #define DECODE_COMPOSITION_RULE(c1)                                     \
1262   do {                                                                  \
1263     int rule = 0;                                                       \
1264     (c1) -= 32;                                                         \
1265     if (c1 < 81)                /* old format (before ver.21) */        \
1266       {                                                                 \
1267         int gref = (c1) / 9;                                            \
1268         int nref = (c1) % 9;                                            \
1269         if (gref == 4) gref = 10;                                       \
1270         if (nref == 4) nref = 10;                                       \
1271         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1272       }                                                                 \
1273     else if (c1 < 93)           /* new format (after ver.21) */         \
1274       {                                                                 \
1275         ONE_MORE_BYTE (c2);                                             \
1276         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1277       }                                                                 \
1278     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1279     coding->composition_rule_follows = 0;                               \
1280   } while (0)
1281
1282
1283 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1284
1285 static void
1286 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1287      struct coding_system *coding;
1288      unsigned char *source, *destination;
1289      int src_bytes, dst_bytes;
1290 {
1291   unsigned char *src = source;
1292   unsigned char *src_end = source + src_bytes;
1293   unsigned char *dst = destination;
1294   unsigned char *dst_end = destination + dst_bytes;
1295   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1296   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1297   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1298   /* SRC_BASE remembers the start position in source in each loop.
1299      The loop will be exited when there's not enough source code
1300      (within macro ONE_MORE_BYTE), or when there's not enough
1301      destination area to produce a character (within macro
1302      EMIT_CHAR).  */
1303   unsigned char *src_base;
1304   int c, charset;
1305   Lisp_Object translation_table;
1306   Lisp_Object safe_chars;
1307
1308   safe_chars = coding_safe_chars (coding);
1309
1310   if (NILP (Venable_character_translation))
1311     translation_table = Qnil;
1312   else
1313     {
1314       translation_table = coding->translation_table_for_decode;
1315       if (NILP (translation_table))
1316         translation_table = Vstandard_translation_table_for_decode;
1317     }
1318
1319   coding->result = CODING_FINISH_NORMAL;
1320
1321   while (1)
1322     {
1323       int c1, c2;
1324
1325       src_base = src;
1326       ONE_MORE_BYTE (c1);
1327
1328       /* We produce no character or one character.  */
1329       switch (iso_code_class [c1])
1330         {
1331         case ISO_0x20_or_0x7F:
1332           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1333             {
1334               DECODE_COMPOSITION_RULE (c1);
1335               continue;
1336             }
1337           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1338             {
1339               /* This is SPACE or DEL.  */
1340               charset = CHARSET_ASCII;
1341               break;
1342             }
1343           /* This is a graphic character, we fall down ...  */
1344
1345         case ISO_graphic_plane_0:
1346           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1347             {
1348               DECODE_COMPOSITION_RULE (c1);
1349               continue;
1350             }
1351           charset = charset0;
1352           break;
1353
1354         case ISO_0xA0_or_0xFF:
1355           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1356               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1357             goto label_invalid_code;
1358           /* This is a graphic character, we fall down ... */
1359
1360         case ISO_graphic_plane_1:
1361           if (charset1 < 0)
1362             goto label_invalid_code;
1363           charset = charset1;
1364           break;
1365
1366         case ISO_control_0:
1367           if (COMPOSING_P (coding))
1368             DECODE_COMPOSITION_END ('1');
1369
1370           /* All ISO2022 control characters in this class have the
1371              same representation in Emacs internal format.  */
1372           if (c1 == '\n'
1373               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1374               && (coding->eol_type == CODING_EOL_CR
1375                   || coding->eol_type == CODING_EOL_CRLF))
1376             {
1377               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1378               goto label_end_of_loop;
1379             }
1380           charset = CHARSET_ASCII;
1381           break;
1382
1383         case ISO_control_1:
1384           if (COMPOSING_P (coding))
1385             DECODE_COMPOSITION_END ('1');
1386           goto label_invalid_code;
1387
1388         case ISO_carriage_return:
1389           if (COMPOSING_P (coding))
1390             DECODE_COMPOSITION_END ('1');
1391
1392           if (coding->eol_type == CODING_EOL_CR)
1393             c1 = '\n';
1394           else if (coding->eol_type == CODING_EOL_CRLF)
1395             {
1396               ONE_MORE_BYTE (c1);
1397               if (c1 != ISO_CODE_LF)
1398                 {
1399                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1400                     {
1401                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
1402                       goto label_end_of_loop;
1403                     }
1404                   src--;
1405                   c1 = '\r';
1406                 }
1407             }
1408           charset = CHARSET_ASCII;
1409           break;
1410
1411         case ISO_shift_out:
1412           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1413               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1414             goto label_invalid_code;
1415           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1416           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1417           continue;
1418
1419         case ISO_shift_in:
1420           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1421             goto label_invalid_code;
1422           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1423           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1424           continue;
1425
1426         case ISO_single_shift_2_7:
1427         case ISO_single_shift_2:
1428           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1429             goto label_invalid_code;
1430           /* SS2 is handled as an escape sequence of ESC 'N' */
1431           c1 = 'N';
1432           goto label_escape_sequence;
1433
1434         case ISO_single_shift_3:
1435           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1436             goto label_invalid_code;
1437           /* SS3 is handled as an escape sequence of ESC 'O' */
1438           c1 = 'O';
1439           goto label_escape_sequence;
1440
1441         case ISO_control_sequence_introducer:
1442           /* CSI is handled as an escape sequence of ESC '[' ...  */
1443           c1 = '[';
1444           goto label_escape_sequence;
1445
1446         case ISO_escape:
1447           ONE_MORE_BYTE (c1);
1448         label_escape_sequence:
1449           /* Escape sequences handled by Emacs are invocation,
1450              designation, direction specification, and character
1451              composition specification.  */
1452           switch (c1)
1453             {
1454             case '&':           /* revision of following character set */
1455               ONE_MORE_BYTE (c1);
1456               if (!(c1 >= '@' && c1 <= '~'))
1457                 goto label_invalid_code;
1458               ONE_MORE_BYTE (c1);
1459               if (c1 != ISO_CODE_ESC)
1460                 goto label_invalid_code;
1461               ONE_MORE_BYTE (c1);
1462               goto label_escape_sequence;
1463
1464             case '$':           /* designation of 2-byte character set */
1465               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1466                 goto label_invalid_code;
1467               ONE_MORE_BYTE (c1);
1468               if (c1 >= '@' && c1 <= 'B')
1469                 {       /* designation of JISX0208.1978, GB2312.1980,
1470                            or JISX0208.1980 */
1471                   DECODE_DESIGNATION (0, 2, 94, c1);
1472                 }
1473               else if (c1 >= 0x28 && c1 <= 0x2B)
1474                 {       /* designation of DIMENSION2_CHARS94 character set */
1475                   ONE_MORE_BYTE (c2);
1476                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1477                 }
1478               else if (c1 >= 0x2C && c1 <= 0x2F)
1479                 {       /* designation of DIMENSION2_CHARS96 character set */
1480                   ONE_MORE_BYTE (c2);
1481                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1482                 }
1483               else
1484                 goto label_invalid_code;
1485               /* We must update these variables now.  */
1486               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1487               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1488               continue;
1489
1490             case 'n':           /* invocation of locking-shift-2 */
1491               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1492                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1493                 goto label_invalid_code;
1494               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1495               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1496               continue;
1497
1498             case 'o':           /* invocation of locking-shift-3 */
1499               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1500                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1501                 goto label_invalid_code;
1502               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1503               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1504               continue;
1505
1506             case 'N':           /* invocation of single-shift-2 */
1507               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1508                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1509                 goto label_invalid_code;
1510               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1511               ONE_MORE_BYTE (c1);
1512               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1513                 goto label_invalid_code;
1514               break;
1515
1516             case 'O':           /* invocation of single-shift-3 */
1517               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1518                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1519                 goto label_invalid_code;
1520               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1521               ONE_MORE_BYTE (c1);
1522               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1523                 goto label_invalid_code;
1524               break;
1525
1526             case '0': case '2': case '3': case '4': /* start composition */
1527               DECODE_COMPOSITION_START (c1);
1528               continue;
1529
1530             case '1':           /* end composition */
1531               DECODE_COMPOSITION_END (c1);
1532               continue;
1533
1534             case '[':           /* specification of direction */
1535               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1536                 goto label_invalid_code;
1537               /* For the moment, nested direction is not supported.
1538                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1539                  left-to-right, and nonzero means right-to-left.  */
1540               ONE_MORE_BYTE (c1);
1541               switch (c1)
1542                 {
1543                 case ']':       /* end of the current direction */
1544                   coding->mode &= ~CODING_MODE_DIRECTION;
1545
1546                 case '0':       /* end of the current direction */
1547                 case '1':       /* start of left-to-right direction */
1548                   ONE_MORE_BYTE (c1);
1549                   if (c1 == ']')
1550                     coding->mode &= ~CODING_MODE_DIRECTION;
1551                   else
1552                     goto label_invalid_code;
1553                   break;
1554
1555                 case '2':       /* start of right-to-left direction */
1556                   ONE_MORE_BYTE (c1);
1557                   if (c1 == ']')
1558                     coding->mode |= CODING_MODE_DIRECTION;
1559                   else
1560                     goto label_invalid_code;
1561                   break;
1562
1563                 default:
1564                   goto label_invalid_code;
1565                 }
1566               continue;
1567
1568             default:
1569               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1570                 goto label_invalid_code;
1571               if (c1 >= 0x28 && c1 <= 0x2B)
1572                 {       /* designation of DIMENSION1_CHARS94 character set */
1573                   ONE_MORE_BYTE (c2);
1574                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1575                 }
1576               else if (c1 >= 0x2C && c1 <= 0x2F)
1577                 {       /* designation of DIMENSION1_CHARS96 character set */
1578                   ONE_MORE_BYTE (c2);
1579                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1580                 }
1581               else
1582                 goto label_invalid_code;
1583               /* We must update these variables now.  */
1584               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1585               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1586               continue;
1587             }
1588         }
1589
1590       /* Now we know CHARSET and 1st position code C1 of a character.
1591          Produce a multibyte sequence for that character while getting
1592          2nd position code C2 if necessary.  */
1593       if (CHARSET_DIMENSION (charset) == 2)
1594         {
1595           ONE_MORE_BYTE (c2);
1596           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1597             /* C2 is not in a valid range.  */
1598             goto label_invalid_code;
1599         }
1600       c = DECODE_ISO_CHARACTER (charset, c1, c2);
1601       EMIT_CHAR (c);
1602       continue;
1603
1604     label_invalid_code:
1605       coding->errors++;
1606       if (COMPOSING_P (coding))
1607         DECODE_COMPOSITION_END ('1');
1608       src = src_base;
1609       c = *src++;
1610       EMIT_CHAR (c);
1611     }
1612
1613  label_end_of_loop:
1614   coding->consumed = coding->consumed_char = src_base - source;
1615   coding->produced = dst - destination;
1616   return;
1617 }
1618
1619
1620 /* ISO2022 encoding stuff.  */
1621
1622 /*
1623    It is not enough to say just "ISO2022" on encoding, we have to
1624    specify more details.  In Emacs, each coding system of ISO2022
1625    variant has the following specifications:
1626         1. Initial designation to G0 thru G3.
1627         2. Allows short-form designation?
1628         3. ASCII should be designated to G0 before control characters?
1629         4. ASCII should be designated to G0 at end of line?
1630         5. 7-bit environment or 8-bit environment?
1631         6. Use locking-shift?
1632         7. Use Single-shift?
1633    And the following two are only for Japanese:
1634         8. Use ASCII in place of JIS0201-1976-Roman?
1635         9. Use JISX0208-1983 in place of JISX0208-1978?
1636    These specifications are encoded in `coding->flags' as flag bits
1637    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1638    details.
1639 */
1640
/* Emit at DST the escape sequence designating CHARSET to graphic
   register REG, advance DST past it, and record the designation in
   CODING.  When the revision number CODING holds for CHARSET is below
   255, the sequence is preceded by ESC '&' <'@' + revision>.  The
   short form, which omits the intermediate character, is used only
   for a 2-byte 94-character set whose <final-char> is '@', 'A', or
   'B' going to register 0, and only if CODING has
   CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *iso_inter_94 = "()*+";                                  \
    const char *iso_inter_96 = ",-./";                                  \
    unsigned char iso_final = CHARSET_ISO_FINAL_CHAR (charset);         \
    int iso_revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset); \
    int iso_two_byte = CHARSET_DIMENSION (charset) != 1;                \
    int iso_chars_96 = CHARSET_CHARS (charset) != 94;                   \
                                                                        \
    if (iso_revision < 255)                                             \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + iso_revision;                                    \
        dst += 3;                                                       \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (iso_two_byte)                                                   \
      *dst++ = '$';                                                     \
    if (iso_chars_96)                                                   \
      *dst++ = (unsigned char) (iso_inter_96[reg]);                     \
    else if (! iso_two_byte                                             \
             || ! ((coding->flags & CODING_FLAG_ISO_SHORT_FORM)         \
                   && reg == 0                                          \
                   && iso_final >= '@' && iso_final <= 'B'))            \
      *dst++ = (unsigned char) (iso_inter_94[reg]);                     \
    *dst++ = iso_final;                                                 \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1683
/* The two macros below produce at DST the code for the ISO2022
   single-shift functions single-shift-2 and single-shift-3: the
   escape sequence ESC 'N' / ESC 'O' when `coding' has
   CODING_FLAG_ISO_SEVEN_BITS set, the single control character
   ISO_CODE_SS2 / ISO_CODE_SS3 otherwise.  Both then set the
   single-shifting state of `coding' to 1.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))                 \
      *dst++ = ISO_CODE_SS2;                                            \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'N';                                                   \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))                 \
      *dst++ = ISO_CODE_SS3;                                            \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'O';                                                   \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)
1705
/* The four macros below produce at DST the code for an ISO2022
   locking-shift function and record in `coding' which graphic
   register is now invoked to graphic plane 0:

     ENCODE_SHIFT_IN          ISO_CODE_SI   register 0
     ENCODE_SHIFT_OUT         ISO_CODE_SO   register 1
     ENCODE_LOCKING_SHIFT_2   ESC 'n'       register 2
     ENCODE_LOCKING_SHIFT_3   ESC 'o'       register 3  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = 'n';                                       \
    dst += 2;                                           \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = 'o';                                       \
    dst += 2;                                           \
  } while (0)
1733
/* Produce at DST the one byte for a DIMENSION1 character of CHARSET
   whose position-code is C1.

   While no single shift is pending and CHARSET is invoked to neither
   graphic plane, encode_invocation_designation is called to produce
   the missing designation and/or invocation codes first.  Then C1 is
   written with the 8th bit cleared (graphic plane 0, or a pending
   single shift in a 7-bit environment) or set (graphic plane 1, or a
   pending single shift otherwise).  A pending single shift is
   consumed by the character.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      dst = encode_invocation_designation (charset, coding, dst);       \
                                                                        \
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                     \
      {                                                                 \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          *dst++ = c1 & 0x7F;                                           \
        else                                                            \
          *dst++ = c1 | 0x80;                                           \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = c1 & 0x7F;                                           \
        else                                                            \
          *dst++ = c1 | 0x80;                                           \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
  } while (0)
1766
/* Produce at DST the two bytes for a DIMENSION2 character of CHARSET
   whose position-codes are C1 and C2.

   While no single shift is pending and CHARSET is invoked to neither
   graphic plane, encode_invocation_designation is called to produce
   the missing designation and/or invocation codes first.  Then C1
   and C2 are written with the 8th bit cleared (graphic plane 0, or a
   pending single shift in a 7-bit environment) or set (graphic plane
   1, or a pending single shift otherwise).  A pending single shift
   is consumed by the character.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      dst = encode_invocation_designation (charset, coding, dst);       \
                                                                        \
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                     \
      {                                                                 \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
  } while (0)
1799
/* Produce at DST the codes for character C.

   C is split into a charset and position codes with SPLIT_CHAR.  If
   the charset is not defined, the position codes are written as they
   are (the second one only when it is not negative).  Otherwise the
   character goes through ENCODE_ISO_CHARACTER_DIMENSION1 or
   ENCODE_ISO_CHARACTER_DIMENSION2, after these substitutions:
   charset_latin_jisx0201 for ASCII when `coding' has
   CODING_FLAG_ISO_USE_ROMAN set, and charset_jisx0208_1978 for
   charset_jisx0208 when it has CODING_FLAG_ISO_USE_OLDJIS set.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) != 1)                          \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
            && charset == charset_jisx0208)                             \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
            && charset == CHARSET_ASCII)                                \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
  } while (0)
1829
1830
/* Instead of encoding character C itself, encode
   CODING_INHIBIT_CHARACTER_SUBSTITUTION in its place: twice when the
   width of C's charset is greater than 1, once otherwise.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int unsafe_repeat = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1;   \
                                                                        \
    while (unsafe_repeat-- > 0)                                         \
      ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);     \
  } while (0)
1839
1840
1841 /* Produce designation and invocation codes at a place pointed by DST
1842    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1843    Return new DST.  */
1844
1845 unsigned char *
1846 encode_invocation_designation (charset, coding, dst)
1847      int charset;
1848      struct coding_system *coding;
1849      unsigned char *dst;
1850 {
1851   int reg;                      /* graphic register number */
1852
1853   /* At first, check designations.  */
1854   for (reg = 0; reg < 4; reg++)
1855     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1856       break;
1857
1858   if (reg >= 4)
1859     {
1860       /* CHARSET is not yet designated to any graphic registers.  */
1861       /* At first check the requested designation.  */
1862       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1863       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1864         /* Since CHARSET requests no special designation, designate it
1865            to graphic register 0.  */
1866         reg = 0;
1867
1868       ENCODE_DESIGNATION (charset, reg, coding);
1869     }
1870
1871   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1872       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1873     {
1874       /* Since the graphic register REG is not invoked to any graphic
1875          planes, invoke it to graphic plane 0.  */
1876       switch (reg)
1877         {
1878         case 0:                 /* graphic register 0 */
1879           ENCODE_SHIFT_IN;
1880           break;
1881
1882         case 1:                 /* graphic register 1 */
1883           ENCODE_SHIFT_OUT;
1884           break;
1885
1886         case 2:                 /* graphic register 2 */
1887           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1888             ENCODE_SINGLE_SHIFT_2;
1889           else
1890             ENCODE_LOCKING_SHIFT_2;
1891           break;
1892
1893         case 3:                 /* graphic register 3 */
1894           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1895             ENCODE_SINGLE_SHIFT_3;
1896           else
1897             ENCODE_LOCKING_SHIFT_3;
1898           break;
1899         }
1900     }
1901
1902   return dst;
1903 }
1904
/* Produce at DST the 2-byte code for the encoded composition rule
   RULE.  RULE is split into two values with COMPOSITION_DECODE_RULE;
   the first byte is 32 + 81 plus the first value and the second byte
   is 32 plus the second value.  */

#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int rule_gref, rule_nref;                                           \
                                                                        \
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref);               \
    dst[0] = 32 + 81 + rule_gref;                                       \
    dst[1] = 32 + rule_nref;                                            \
    dst += 2;                                                           \
  } while (0)
1914
/* Produce at DST the codes indicating the start of a composition
   sequence (ESC 0, ESC 3, or ESC 4).  DATA points to an array of
   integers which specify information about the composition; see the
   comment in coding.h for the format of DATA.

   DATA[3] is stored in CODING as the composition method.  For
   COMPOSITION_RELATIVE, ESC 0 is produced.  For any other method,
   ESC 3 (COMPOSITION_WITH_ALTCHARS) or ESC 4 is produced, and CODING
   is set up so that its composition data index is 4 past the start
   of the current composition data, with no composition rule expected
   next.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
1934
/* Produce at DST the codes (ESC 1) indicating the end of the current
   composition.  The start of the composition data in CODING is
   advanced by DATA[0] and the composition method is reset to
   COMPOSITION_NO.  If that start position has reached the used size
   of the current composition data block and a next block exists,
   CODING moves on to the beginning of the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    dst[0] = ISO_CODE_ESC;                                              \
    dst[1] = '1';                                                       \
    dst += 2;                                                           \
    coding->cmp_data_start += data[0];                                  \
    coding->composing = COMPOSITION_NO;                                 \
    if (coding->cmp_data->next                                          \
        && coding->cmp_data_start == coding->cmp_data->used)            \
      {                                                                 \
        coding->cmp_data_start = 0;                                     \
        coding->cmp_data = coding->cmp_data->next;                      \
      }                                                                 \
  } while (0)
1950
/* Produce the composition start sequence ESC 0 at DST and set the
   composition method of CODING to COMPOSITION_RELATIVE.  Here the
   sequence does not start a new composition; it means that the
   components (alternate chars and composition rules) of the
   composition have just been produced and that the actual text
   follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
1962
/* The following three macros produce codes for indicating direction
   of text.  */

/* Produce a control sequence introducer at DST: the escape sequence
   ESC '[' when `coding' has CODING_FLAG_ISO_SEVEN_BITS set, the
   single byte ISO_CODE_CSI otherwise.

   The flag is tested as one bit of `coding->flags', the same way
   ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 test it.  Comparing
   the whole flag word for equality would select the 7-bit form only
   when no other flag happens to be set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)
1972
/* Produce at DST the codes that start right-to-left text: a control
   sequence introducer followed by '2' ']'.  Use as a statement.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro that
   expands to a `do ... while (0)' statement, so it is invoked here
   without an argument list and as a statement of its own rather than
   as an operand of the comma operator.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

/* Produce at DST the codes that end the current direction: a control
   sequence introducer followed by '0' ']'.  Use as a statement.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1978
/* Produce at DST the invocation and designation codes that reset the
   graphic planes and registers to their initial state: a shift-in if
   graphic plane 0 does not currently invoke register 0, then, for
   each of the four graphic registers that has an initial designation
   (a value that is not negative) different from its current one, the
   designation of that initial charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reset_reg = 0;                                                      \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reset_reg < 4)                                                   \
      {                                                                     \
        int reset_initial                                                   \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg);        \
                                                                            \
        if (reset_initial >= 0                                              \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg)             \
                != reset_initial))                                          \
          ENCODE_DESIGNATION (reset_initial, reset_reg, coding);            \
        reset_reg++;                                                        \
      }                                                                     \
  } while (0)
1993
1994 /* Produce designation sequences of charsets in the line started from
1995    SRC to a place pointed by DST, and return updated DST.
1996
1997    If the current block ends before any end-of-line, we may fail to
1998    find all the necessary designations.  */
1999
2000 static unsigned char *
2001 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2002      struct coding_system *coding;
2003      Lisp_Object translation_table;
2004      unsigned char *src, *src_end, *dst;
2005 {
2006   int charset, c, found = 0, reg;
2007   /* Table of charsets to be designated to each graphic register.  */
2008   int r[4];
2009
2010   for (reg = 0; reg < 4; reg++)
2011     r[reg] = -1;
2012
2013   while (found < 4)
2014     {
2015       ONE_MORE_CHAR (c);
2016       if (c == '\n')
2017         break;
2018
2019       charset = CHAR_CHARSET (c);
2020       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2021       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2022         {
2023           found++;
2024           r[reg] = charset;
2025         }
2026     }
2027
2028  label_end_of_loop:
2029   if (found)
2030     {
2031       for (reg = 0; reg < 4; reg++)
2032         if (r[reg] >= 0
2033             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2034           ENCODE_DESIGNATION (r[reg], reg, coding);
2035     }
2036
2037   return dst;
2038 }
2039
2040 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode SRC_BYTES bytes of text at SOURCE into ISO 2022 form at
        DESTINATION (DST_BYTES bytes long).  On return,
        CODING->consumed is the source offset of the last loop iteration
        that was started, and CODING->produced (copied to
        CODING->produced_char) is the number of bytes written.
        CODING->result is set to CODING_FINISH_INSUFFICIENT_DST when the
        loop stops for lack of destination space.  CODING->errors counts
        non-ASCII single-byte characters that were copied through
        unencoded.  */
2041
2042 static void
2043 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2044      struct coding_system *coding;
2045      unsigned char *source, *destination;
2046      int src_bytes, dst_bytes;
2047 {
2048   unsigned char *src = source;
2049   unsigned char *src_end = source + src_bytes;
2050   unsigned char *dst = destination;
2051   unsigned char *dst_end = destination + dst_bytes;
2052   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2053      from DST_END to assure overflow checking is necessary only at the
2054      head of loop.  */
2055   unsigned char *adjusted_dst_end = dst_end - 19;
2056   /* SRC_BASE remembers the start position in source in each loop.
2057      The loop will be exited when there's not enough source text to
2058      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2059      there's not enough destination area to produce encoded codes
2060      (within macro EMIT_BYTES).  */
2061   unsigned char *src_base;
2062   int c;
2063   Lisp_Object translation_table;
2064   Lisp_Object safe_chars;
2065
2066   safe_chars = coding_safe_chars (coding);
2067
       /* Pick the translation table: none when character translation is
          disabled, otherwise the coding system's own table, falling back
          to the standard one.  */
2068   if (NILP (Venable_character_translation))
2069     translation_table = Qnil;
2070   else
2071     {
2072       translation_table = coding->translation_table_for_encode;
2073       if (NILP (translation_table))
2074         translation_table = Vstandard_translation_table_for_encode;
2075     }
2076
2077   coding->consumed_char = 0;
2078   coding->errors = 0;
2079   while (1)
2080     {
2081       src_base = src;
2082
           /* Stop when 19 or fewer bytes of destination remain.
              NOTE(review): with DST_BYTES == 0 the limit is derived from
              SRC instead of DST_END; presumably that means source and
              destination share one buffer -- confirm against callers.  */
2083       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2084         {
2085           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2086           break;
2087         }
2088
2089       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2090           && CODING_SPEC_ISO_BOL (coding))
2091         {
2092           /* We have to produce designation sequences if any now.  */
2093           dst = encode_designation_at_bol (coding, translation_table,
2094                                            src, src_end, dst);
2095           CODING_SPEC_ISO_BOL (coding) = 0;
2096         }
2097
2098       /* Check composition start and end.  */
2099       if (coding->composing != COMPOSITION_DISABLED
2100           && coding->cmp_data_start < coding->cmp_data->used)
2101         {
2102           struct composition_data *cmp_data = coding->cmp_data;
2103           int *data = cmp_data->data + coding->cmp_data_start;
               /* Character position of the next source character, in the
                  same units as the positions stored in DATA.  */
2104           int this_pos = cmp_data->char_offset + coding->consumed_char;
2105
2106           if (coding->composing == COMPOSITION_RELATIVE)
2107             {
2108               if (this_pos == data[2])
2109                 {
2110                   ENCODE_COMPOSITION_END (coding, data);
                       /* Reload both pointers after the macro;
                          presumably it can move on to the next
                          composition record.  */
2111                   cmp_data = coding->cmp_data;
2112                   data = cmp_data->data + coding->cmp_data_start;
2113                 }
2114             }
2115           else if (COMPOSING_P (coding))
2116             {
2117               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2118               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2119                 /* We have consumed components of the composition.
2120                    What follows in SRC is the compositions's base
2121                    text.  */
2122                 ENCODE_COMPOSITION_FAKE_START (coding);
2123               else
2124                 {
                       /* This C shadows the outer C: it is the next
                          component (or rule) taken from CMP_DATA, not a
                          character read from SRC.  */
2125                   int c = cmp_data->data[coding->cmp_data_index++];
2126                   if (coding->composition_rule_follows)
2127                     {
2128                       ENCODE_COMPOSITION_RULE (c);
2129                       coding->composition_rule_follows = 0;
2130                     }
2131                   else
2132                     {
2133                       if (coding->flags & CODING_FLAG_ISO_SAFE
2134                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2135                         ENCODE_UNSAFE_CHARACTER (c);
2136                       else
2137                         ENCODE_ISO_CHARACTER (c);
2138                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2139                         coding->composition_rule_follows = 1;
2140                     }
                       /* Nothing was read from SRC in this iteration.  */
2141                   continue;
2142                 }
2143             }
2144           if (!COMPOSING_P (coding))
2145             {
2146               if (this_pos == data[1])
2147                 {
2148                   ENCODE_COMPOSITION_START (coding, data);
2149                   continue;
2150                 }
2151             }
2152         }
2153
2154       ONE_MORE_CHAR (c);
2155
2156       /* Now encode the character C.  */
2157       if (c < 0x20 || c == 0x7F)
2158         {
2159           if (c == '\r')
2160             {
2161               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2162                 {
2163                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2164                     ENCODE_RESET_PLANE_AND_REGISTER;
2165                   *dst++ = c;
                       /* NOTE(review): this `continue' bypasses the
                          `coding->consumed_char++' at the bottom of the
                          loop, so a CR emitted here is not counted --
                          confirm that this is intended.  */
2166                   continue;
2167                 }
2168               /* fall down to treat '\r' as '\n' ...  */
2169               c = '\n';
2170             }
2171           if (c == '\n')
2172             {
2173               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2174                 ENCODE_RESET_PLANE_AND_REGISTER;
                   /* Make the recorded current designations equal to the
                      initial ones without emitting anything.  */
2175               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2176                 bcopy (coding->spec.iso2022.initial_designation,
2177                        coding->spec.iso2022.current_designation,
2178                        sizeof coding->spec.iso2022.initial_designation);
                   /* Emit the end-of-line in the format of the coding
                      system; an undecided format is written as LF.  */
2179               if (coding->eol_type == CODING_EOL_LF
2180                   || coding->eol_type == CODING_EOL_UNDECIDED)
2181                 *dst++ = ISO_CODE_LF;
2182               else if (coding->eol_type == CODING_EOL_CRLF)
2183                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2184               else
2185                 *dst++ = ISO_CODE_CR;
2186               CODING_SPEC_ISO_BOL (coding) = 1;
2187             }
2188           else
2189             {
2190               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2191                 ENCODE_RESET_PLANE_AND_REGISTER;
2192               *dst++ = c;
2193             }
2194         }
2195       else if (ASCII_BYTE_P (c))
2196         ENCODE_ISO_CHARACTER (c);
2197       else if (SINGLE_BYTE_CHAR_P (c))
2198         {
               /* A non-ASCII single-byte character is copied through
                  unchanged and counted as an error.  */
2199           *dst++ = c;
2200           coding->errors++;
2201         }
2202       else if (coding->flags & CODING_FLAG_ISO_SAFE
2203                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2204         ENCODE_UNSAFE_CHARACTER (c);
2205       else
2206         ENCODE_ISO_CHARACTER (c);
2207
2208       coding->consumed_char++;
2209     }
2210
       /* Reached by the `break' above; presumably also the jump target
          of ONE_MORE_CHAR when the source runs out.  */
2211  label_end_of_loop:
       /* SRC_BASE, not SRC: it is the source position at which the last
          loop iteration started.  */
2212   coding->consumed = src_base - source;
2213   coding->produced = coding->produced_char = dst - destination;
2214 }
2215
2216 \f
2217 /*** 4. SJIS and BIG5 handlers ***/
2218
2219 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2220    quite widely.  So, for the moment, Emacs supports them in the bare
2221    C code.  But, in the future, they may be supported only by CCL.  */
2222
2223 /* SJIS is a coding system encoding three character sets: ASCII, right
2224    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2225    as is.  A character of charset katakana-jisx0201 is encoded by
2226    "position-code + 0x80".  A character of charset japanese-jisx0208
2227    is encoded in two bytes, but the two position-codes are divided and
2228    shifted so that they fit in the ranges below.
2229
2230    --- CODE RANGE of SJIS ---
2231    (character set)      (range)
2232    ASCII                0x00 .. 0x7F
2233    KATAKANA-JISX0201    0xA0 .. 0xDF
2234    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2235             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2236    -------------------------------
2237
2238 */
2239
2240 /* BIG5 is a coding system encoding two character sets: ASCII and
2241    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2242    character set and is encoded in two bytes.
2243
2244    --- CODE RANGE of BIG5 ---
2245    (character set)      (range)
2246    ASCII                0x00 .. 0x7F
2247    Big5 (1st byte)      0xA1 .. 0xFE
2248         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2249    --------------------------
2250
2251    Since the number of characters in Big5 is larger than maximum
2252    characters in Emacs' charset (96x96), it can't be handled as one
2253    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2254    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2255    contains frequently used characters and the latter contains less
2256    frequently used characters.  */
2257
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized in the expansion, so arbitrary
   expressions may be passed as inputs.  Input arguments are evaluated
   more than once and therefore must not have side effects; output
   arguments must be lvalues.  All inputs are read before any output is
   stored, so the same variables may serve as input and output, as in
   DECODE_BIG5 (c1, c2, charset, c1, c2).  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 to CHARSET and position-codes C1,
   C2.  The pair is first flattened to a linear index; first bytes
   below 0xC9 belong to `charset_big5_1', the rest to `charset_big5_2'
   (whose index restarts at zero).  The index is then split into two
   position-codes of 94 values each, starting at 0x21.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                              \
  do {                                                                    \
    unsigned int big5_idx_                                                \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                    \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                           \
    if ((b1) < 0xC9)                                                      \
      (charset) = charset_big5_1;                                         \
    else                                                                  \
      {                                                                   \
        (charset) = charset_big5_2;                                       \
        big5_idx_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                       \
      }                                                                   \
    (c1) = big5_idx_ / (0xFF - 0xA1) + 0x21;                              \
    (c2) = big5_idx_ % (0xFF - 0xA1) + 0x21;                              \
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and position-codes C1, C2 to
   the BIG5 byte pair B1, B2.  Second-byte offsets below 0x3F map to
   0x40..0x7E, the others to 0xA1..0xFE.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                              \
  do {                                                                    \
    unsigned int big5_idx_                                                \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                    \
    if ((charset) == charset_big5_2)                                      \
      big5_idx_ += BIG5_SAME_ROW * (0xC9 - 0xA1);                         \
    (b1) = big5_idx_ / BIG5_SAME_ROW + 0xA1;                              \
    (b2) = big5_idx_ % BIG5_SAME_ROW;                                     \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                    \
  } while (0)
2290
2291 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2292    Check if a text is encoded in SJIS.  If it is, return
2293    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2294
2295 int
2296 detect_coding_sjis (src, src_end)
2297      unsigned char *src, *src_end;
2298 {
2299   int c;
2300   /* Dummy for ONE_MORE_BYTE.  */
2301   struct coding_system dummy_coding;
2302   struct coding_system *coding = &dummy_coding;
2303
2304   while (1)
2305     {
2306       ONE_MORE_BYTE (c);
2307       if (c >= 0x81)
2308         {
2309           if (c <= 0x9F || (c >= 0xE0 && c <= 0xEF))
2310             {
2311               ONE_MORE_BYTE (c);
2312               if (c < 0x40 || c == 0x7F || c > 0xFC)
2313                 return 0;
2314             }
2315           else if (c > 0xDF)
2316             return 0;
2317         }
2318     }
2319  label_end_of_loop:
2320   return CODING_CATEGORY_MASK_SJIS;
2321 }
2322
2323 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2324    Check if a text is encoded in BIG5.  If it is, return
2325    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2326
2327 int
2328 detect_coding_big5 (src, src_end)
2329      unsigned char *src, *src_end;
2330 {
2331   int c;
2332   /* Dummy for ONE_MORE_BYTE.  */
2333   struct coding_system dummy_coding;
2334   struct coding_system *coding = &dummy_coding;
2335
2336   while (1)
2337     {
2338       ONE_MORE_BYTE (c);
2339       if (c >= 0xA1)
2340         {
2341           ONE_MORE_BYTE (c);
2342           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2343             return 0;
2344         }
2345     }
2346  label_end_of_loop:
2347   return CODING_CATEGORY_MASK_BIG5;
2348 }
2349
2350 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2351    Check if a text is encoded in UTF-8.  If it is, return
2352    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2353
2354 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2355 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2356 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2357 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2358 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2359 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2360 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2361
2362 int
2363 detect_coding_utf_8 (src, src_end)
2364      unsigned char *src, *src_end;
2365 {
2366   unsigned char c;
2367   int seq_maybe_bytes;
2368   /* Dummy for ONE_MORE_BYTE.  */
2369   struct coding_system dummy_coding;
2370   struct coding_system *coding = &dummy_coding;
2371
2372   while (1)
2373     {
2374       ONE_MORE_BYTE (c);
2375       if (UTF_8_1_OCTET_P (c))
2376         continue;
2377       else if (UTF_8_2_OCTET_LEADING_P (c))
2378         seq_maybe_bytes = 1;
2379       else if (UTF_8_3_OCTET_LEADING_P (c))
2380         seq_maybe_bytes = 2;
2381       else if (UTF_8_4_OCTET_LEADING_P (c))
2382         seq_maybe_bytes = 3;
2383       else if (UTF_8_5_OCTET_LEADING_P (c))
2384         seq_maybe_bytes = 4;
2385       else if (UTF_8_6_OCTET_LEADING_P (c))
2386         seq_maybe_bytes = 5;
2387       else
2388         return 0;
2389
2390       do
2391         {
2392           ONE_MORE_BYTE (c);
2393           if (!UTF_8_EXTRA_OCTET_P (c))
2394             return 0;
2395           seq_maybe_bytes--;
2396         }
2397       while (seq_maybe_bytes > 0);
2398     }
2399
2400  label_end_of_loop:
2401   return CODING_CATEGORY_MASK_UTF_8;
2402 }
2403
2404 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2405    Check if a text is encoded in UTF-16 Big Endian or Little Endian,
2406    judging only from a byte order mark in its first two bytes.  If it
2407    is, return CODING_CATEGORY_MASK_UTF_16_BE or
2408    CODING_CATEGORY_MASK_UTF_16_LE, else return 0.  */
2409
/* Nonzero if VAL is 0xFFFE or 0xFFFF, the two 16-bit values treated
   as invalid here.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit code unit VAL is a high (leading) surrogate,
   i.e. lies in 0xD800..0xDBFF.  The top six bits identify the
   surrogate half, so the mask has to be 0xFC00; masking with 0xD800
   would also accept low surrogates and other values such as 0xF800.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit code unit VAL is a low (trailing) surrogate,
   i.e. lies in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2419
2420 int
2421 detect_coding_utf_16 (src, src_end)
2422      unsigned char *src, *src_end;
2423 {
2424   unsigned char c1, c2;
2425   /* Dummy for TWO_MORE_BYTES.  */
2426   struct coding_system dummy_coding;
2427   struct coding_system *coding = &dummy_coding;
2428
2429   TWO_MORE_BYTES (c1, c2);
2430
2431   if ((c1 == 0xFF) && (c2 == 0xFE))
2432     return CODING_CATEGORY_MASK_UTF_16_LE;
2433   else if ((c1 == 0xFE) && (c2 == 0xFF))
2434     return CODING_CATEGORY_MASK_UTF_16_BE;
2435
2436  label_end_of_loop:
2437   return 0;
2438 }
2439
2440 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2441    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2442
2443 static void
2444 decode_coding_sjis_big5 (coding, source, destination,
2445                          src_bytes, dst_bytes, sjis_p)
2446      struct coding_system *coding;
2447      unsigned char *source, *destination;
2448      int src_bytes, dst_bytes;
2449      int sjis_p;
2450 {
2451   unsigned char *src = source;
2452   unsigned char *src_end = source + src_bytes;
2453   unsigned char *dst = destination;
2454   unsigned char *dst_end = destination + dst_bytes;
2455   /* SRC_BASE remembers the start position in source in each loop.
2456      The loop will be exited when there's not enough source code
2457      (within macro ONE_MORE_BYTE), or when there's not enough
2458      destination area to produce a character (within macro
2459      EMIT_CHAR).  */
2460   unsigned char *src_base;
2461   Lisp_Object translation_table;
2462
2463   if (NILP (Venable_character_translation))
2464     translation_table = Qnil;
2465   else
2466     {
2467       translation_table = coding->translation_table_for_decode;
2468       if (NILP (translation_table))
2469         translation_table = Vstandard_translation_table_for_decode;
2470     }
2471
2472   coding->produced_char = 0;
2473   while (1)
2474     {
2475       int c, charset, c1, c2;
2476
2477       src_base = src;
2478       ONE_MORE_BYTE (c1);
2479
2480       if (c1 < 0x80)
2481         {
2482           charset = CHARSET_ASCII;
2483           if (c1 < 0x20)
2484             {
2485               if (c1 == '\r')
2486                 {
2487                   if (coding->eol_type == CODING_EOL_CRLF)
2488                     {
2489                       ONE_MORE_BYTE (c2);
2490                       if (c2 == '\n')
2491                         c1 = c2;
2492                       else if (coding->mode
2493                                & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2494                         {
2495                           coding->result = CODING_FINISH_INCONSISTENT_EOL;
2496                           goto label_end_of_loop;
2497                         }
2498                       else
2499                         /* To process C2 again, SRC is subtracted by 1.  */
2500                         src--;
2501                     }
2502                   else if (coding->eol_type == CODING_EOL_CR)
2503                     c1 = '\n';
2504                 }
2505               else if (c1 == '\n'
2506                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2507                        && (coding->eol_type == CODING_EOL_CR
2508                            || coding->eol_type == CODING_EOL_CRLF))
2509                 {
2510                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2511                   goto label_end_of_loop;
2512                 }
2513             }
2514         }
2515       else
2516         {
2517           if (sjis_p)
2518             {
2519               if (c1 >= 0xF0)
2520                 goto label_invalid_code;
2521               if (c1 < 0xA0 || c1 >= 0xE0)
2522                 {
2523                   /* SJIS -> JISX0208 */
2524                   ONE_MORE_BYTE (c2);
2525                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2526                     goto label_invalid_code;
2527                   DECODE_SJIS (c1, c2, c1, c2);
2528                   charset = charset_jisx0208;
2529                 }
2530               else
2531                 /* SJIS -> JISX0201-Kana */
2532                 charset = charset_katakana_jisx0201;
2533             }
2534           else
2535             {
2536               /* BIG5 -> Big5 */
2537               if (c1 < 0xA1 || c1 > 0xFE)
2538                 goto label_invalid_code;
2539               ONE_MORE_BYTE (c2);
2540               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2541                 goto label_invalid_code;
2542               DECODE_BIG5 (c1, c2, charset, c1, c2);
2543             }
2544         }
2545
2546       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2547       EMIT_CHAR (c);
2548       continue;
2549
2550     label_invalid_code:
2551       coding->errors++;
2552       src = src_base;
2553       c = *src++;
2554       EMIT_CHAR (c);
2555     }
2556
2557  label_end_of_loop:
2558   coding->consumed = coding->consumed_char = src_base - source;
2559   coding->produced = dst - destination;
2560   return;
2561 }
2562
2563 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2564    This function can encode charsets `ascii', `katakana-jisx0201',
2565    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2566    are sure that all these charsets are registered as official charset
2567    (i.e. do not have extended leading-codes).  Characters of other
2568    charsets are produced without any encoding.  If SJIS_P is 1, encode
2569    SJIS text, else encode BIG5 text.  */
2570
2571 static void
2572 encode_coding_sjis_big5 (coding, source, destination,
2573                          src_bytes, dst_bytes, sjis_p)
2574      struct coding_system *coding;
2575      unsigned char *source, *destination;
2576      int src_bytes, dst_bytes;
2577      int sjis_p;
2578 {
2579   unsigned char *src = source;
2580   unsigned char *src_end = source + src_bytes;
2581   unsigned char *dst = destination;
2582   unsigned char *dst_end = destination + dst_bytes;
2583   /* SRC_BASE remembers the start position in source in each loop.
2584      The loop will be exited when there's not enough source text to
2585      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2586      there's not enough destination area to produce encoded codes
2587      (within macro EMIT_BYTES).  */
2588   unsigned char *src_base;
2589   Lisp_Object translation_table;
2590
2591   if (NILP (Venable_character_translation))
2592     translation_table = Qnil;
2593   else
2594     {
2595       translation_table = coding->translation_table_for_encode;
2596       if (NILP (translation_table))
2597         translation_table = Vstandard_translation_table_for_encode;
2598     }
2599
2600   while (1)
2601     {
2602       int c, charset, c1, c2;
2603
2604       src_base = src;
2605       ONE_MORE_CHAR (c);
2606
2607       /* Now encode the character C.  */
2608       if (SINGLE_BYTE_CHAR_P (c))
2609         {
2610           switch (c)
2611             {
2612             case '\r':
2613               if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2614                 {
2615                   EMIT_ONE_BYTE (c);
2616                   break;
2617                 }
2618               c = '\n';
2619             case '\n':
2620               if (coding->eol_type == CODING_EOL_CRLF)
2621                 {
2622                   EMIT_TWO_BYTES ('\r', c);
2623                   break;
2624                 }
2625               else if (coding->eol_type == CODING_EOL_CR)
2626                 c = '\r';
2627             default:
2628               EMIT_ONE_BYTE (c);
2629             }
2630         }
2631       else
2632         {
2633           SPLIT_CHAR (c, charset, c1, c2);
2634           if (sjis_p)
2635             {
2636               if (charset == charset_jisx0208
2637                   || charset == charset_jisx0208_1978)
2638                 {
2639                   ENCODE_SJIS (c1, c2, c1, c2);
2640                   EMIT_TWO_BYTES (c1, c2);
2641                 }
2642               else if (charset == charset_katakana_jisx0201)
2643                 EMIT_ONE_BYTE (c1 | 0x80);
2644               else if (charset == charset_latin_jisx0201)
2645                 EMIT_ONE_BYTE (c1);
2646               else
2647                 /* There's no way other than producing the internal
2648                    codes as is.  */
2649                 EMIT_BYTES (src_base, src);
2650             }
2651           else
2652             {
2653               if (charset == charset_big5_1 || charset == charset_big5_2)
2654                 {
2655                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
2656                   EMIT_TWO_BYTES (c1, c2);
2657                 }
2658               else
2659                 /* There's no way other than producing the internal
2660                    codes as is.  */
2661                 EMIT_BYTES (src_base, src);
2662             }
2663         }
2664       coding->consumed_char++;
2665     }
2666
2667  label_end_of_loop:
2668   coding->consumed = src_base - source;
2669   coding->produced = coding->produced_char = dst - destination;
2670 }
2671
2672 \f
2673 /*** 5. CCL handlers ***/
2674
2675 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2676    Check if a text is encoded in a coding system of which
2677    encoder/decoder are written in CCL program.  If it is, return
2678    CODING_CATEGORY_MASK_CCL, else return 0.  */
2679
2680 int
2681 detect_coding_ccl (src, src_end)
2682      unsigned char *src, *src_end;
2683 {
2684   unsigned char *valid;
2685   int c;
2686   /* Dummy for ONE_MORE_BYTE.  */
2687   struct coding_system dummy_coding;
2688   struct coding_system *coding = &dummy_coding;
2689
2690   /* No coding system is assigned to coding-category-ccl.  */
2691   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2692     return 0;
2693
2694   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2695   while (1)
2696     {
2697       ONE_MORE_BYTE (c);
2698       if (! valid[c])
2699         return 0;
2700     }
2701  label_end_of_loop:
2702   return CODING_CATEGORY_MASK_CCL;
2703 }
2704
2705 \f
2706 /*** 6. End-of-line handlers ***/
2707
2708 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2709
2710 static void
2711 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2712      struct coding_system *coding;
2713      unsigned char *source, *destination;
2714      int src_bytes, dst_bytes;
2715 {
2716   unsigned char *src = source;
2717   unsigned char *dst = destination;
2718   unsigned char *src_end = src + src_bytes;
2719   unsigned char *dst_end = dst + dst_bytes;
2720   Lisp_Object translation_table;
2721   /* SRC_BASE remembers the start position in source in each loop.
2722      The loop will be exited when there's not enough source code
2723      (within macro ONE_MORE_BYTE), or when there's not enough
2724      destination area to produce a character (within macro
2725      EMIT_CHAR).  */
2726   unsigned char *src_base;
2727   int c;
2728
2729   translation_table = Qnil;
2730   switch (coding->eol_type)
2731     {
2732     case CODING_EOL_CRLF:
2733       while (1)
2734         {
2735           src_base = src;
2736           ONE_MORE_BYTE (c);
2737           if (c == '\r')
2738             {
2739               ONE_MORE_BYTE (c);
2740               if (c != '\n')
2741                 {
2742                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2743                     {
2744                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
2745                       goto label_end_of_loop;
2746                     }
2747                   src--;
2748                   c = '\r';
2749                 }
2750             }
2751           else if (c == '\n'
2752                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2753             {
2754               coding->result = CODING_FINISH_INCONSISTENT_EOL;
2755               goto label_end_of_loop;
2756             }
2757           EMIT_CHAR (c);
2758         }
2759       break;
2760
2761     case CODING_EOL_CR:
2762       while (1)
2763         {
2764           src_base = src;
2765           ONE_MORE_BYTE (c);
2766           if (c == '\n')
2767             {
2768               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2769                 {
2770                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2771                   goto label_end_of_loop;
2772                 }
2773             }
2774           else if (c == '\r')
2775             c = '\n';
2776           EMIT_CHAR (c);
2777         }
2778       break;
2779
2780     default:                    /* no need for EOL handling */
2781       while (1)
2782         {
2783           src_base = src;
2784           ONE_MORE_BYTE (c);
2785           EMIT_CHAR (c);
2786         }
2787     }
2788
2789  label_end_of_loop:
2790   coding->consumed = coding->consumed_char = src_base - source;
2791   coding->produced = dst - destination;
2792   return;
2793 }
2794
2795 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2796    format of end-of-line according to `coding->eol_type'.  It also
2797    convert multibyte form 8-bit characers to unibyte if
2798    CODING->src_multibyte is nonzero.  If `coding->mode &
2799    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2800    also means end-of-line.  */
2801
2802 static void
2803 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2804      struct coding_system *coding;
2805      unsigned char *source, *destination;
2806      int src_bytes, dst_bytes;
2807 {
2808   unsigned char *src = source;
2809   unsigned char *dst = destination;
2810   unsigned char *src_end = src + src_bytes;
2811   unsigned char *dst_end = dst + dst_bytes;
2812   Lisp_Object translation_table;
2813   /* SRC_BASE remembers the start position in source in each loop.
2814      The loop will be exited when there's not enough source text to
2815      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2816      there's not enough destination area to produce encoded codes
2817      (within macro EMIT_BYTES).  */
2818   unsigned char *src_base;
2819   int c;
2820   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2821
2822   translation_table = Qnil;
2823   if (coding->src_multibyte
2824       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2825     {
2826       src_end--;
2827       src_bytes--;
2828       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2829     }
2830
2831   if (coding->eol_type == CODING_EOL_CRLF)
2832     {
2833       while (src < src_end)
2834         {
2835           src_base = src;
2836           c = *src++;
2837           if (c >= 0x20)
2838             EMIT_ONE_BYTE (c);
2839           else if (c == '\n' || (c == '\r' && selective_display))
2840             EMIT_TWO_BYTES ('\r', '\n');
2841           else
2842             EMIT_ONE_BYTE (c);
2843         }
2844       src_base = src;
2845     label_end_of_loop:
2846       ;
2847     }
2848   else
2849     {
2850       if (!dst_bytes || src_bytes <= dst_bytes)
2851         {
2852           safe_bcopy (src, dst, src_bytes);
2853           src_base = src_end;
2854           dst += src_bytes;
2855         }
2856       else
2857         {
2858           if (coding->src_multibyte
2859               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2860             dst_bytes--;
2861           safe_bcopy (src, dst, dst_bytes);
2862           src_base = src + dst_bytes;
2863           dst = destination + dst_bytes;
2864           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2865         }
2866       if (coding->eol_type == CODING_EOL_CR)
2867         {
2868           for (src = destination; src < dst; src++)
2869             if (*src == '\n') *src = '\r';
2870         }
2871       else if (selective_display)
2872         {
2873           for (src = destination; src < dst; src++)
2874             if (*src == '\r') *src = '\n';
2875         }
2876     }
2877   if (coding->src_multibyte)
2878     dst = destination + str_as_unibyte (destination, dst - destination);
2879
2880   coding->consumed = src_base - source;
2881   coding->produced = dst - destination;
2882   coding->produced_char = coding->produced;
2883 }
2884
2885 \f
2886 /*** 7. C library functions ***/
2887
2888 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2889    has a property `coding-system'.  The value of this property is a
2890    vector of length 5 (called as coding-vector).  Among elements of
2891    this vector, the first (element[0]) and the fifth (element[4])
2892    carry important information for decoding/encoding.  Before
2893    decoding/encoding, this information should be set in fields of a
2894    structure of type `coding_system'.
2895
2896    A value of property `coding-system' can be a symbol of another
2897    subsidiary coding-system.  In that case, Emacs gets coding-vector
2898    from that symbol.
2899
2900    `element[0]' contains information to be set in `coding->type'.  The
2901    value and its meaning is as follows:
2902
2903    0 -- coding_type_emacs_mule
2904    1 -- coding_type_sjis
2905    2 -- coding_type_iso2022
2906    3 -- coding_type_big5
2907    4 -- coding_type_ccl encoder/decoder written in CCL
        5 -- coding_type_raw_text
2908    nil -- coding_type_no_conversion
2909    t -- coding_type_undecided (automatic conversion on decoding,
2910                                no-conversion on encoding)
2911
2912    `element[4]' contains information to be set in `coding->flags' and
2913    `coding->spec'.  The meaning varies by `coding->type'.
2914
2915    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2916    of length 32 (of which the first 17 sub-elements are used now).
2917    Meanings of these sub-elements are:
2918
2919    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2920         If the value is an integer of valid charset, the charset is
2921         assumed to be designated to graphic register N initially.
2922
2923         If the value is minus, it is a minus value of charset which
2924         reserves graphic register N, which means that the charset is
2925         not designated initially but should be designated to graphic
2926         register N just before encoding a character in that charset.
2927
2928         If the value is nil, graphic register N is never used on
2929         encoding.
2930
2931    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2932         Each value takes t or nil.  See the section ISO2022 of
2933         `coding.h' for more information.
2934
2935    If `coding->type' is `coding_type_big5', element[4] is t to denote
2936    BIG5-ETen or nil to denote BIG5-HKU.
2937
2938    If `coding->type' takes the other value, element[4] is ignored.
2939
2940    Emacs Lisp's coding system also carries information about format of
2941    end-of-line in a value of property `eol-type'.  If the value is
2942    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2943    means CODING_EOL_CR.  If it is not integer, it should be a vector
2944    of subsidiary coding systems of which property `eol-type' has one
2945    of above values.
2946
2947 */
2948
2949 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2950    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2951    is setup so that no conversion is necessary and return -1, else
2952    return 0.  */
2953
2954 int
2955 setup_coding_system (coding_system, coding)
2956      Lisp_Object coding_system;
2957      struct coding_system *coding;
2958 {
2959   Lisp_Object coding_spec, coding_type, eol_type, plist;
2960   Lisp_Object val;
2961   int i;
2962
2963   /* Initialize some fields required for all kinds of coding systems.  */
2964   coding->symbol = coding_system;
2965   coding->common_flags = 0;
2966   coding->mode = 0;
2967   coding->heading_ascii = -1;
2968   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2969   coding->composing = COMPOSITION_DISABLED;
2970   coding->cmp_data = NULL;
2971
2972   if (NILP (coding_system))
2973     goto label_invalid_coding_system;
2974
2975   coding_spec = Fget (coding_system, Qcoding_system);
2976
2977   if (!VECTORP (coding_spec)
2978       || XVECTOR (coding_spec)->size != 5
2979       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2980     goto label_invalid_coding_system;
2981
2982   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2983   if (VECTORP (eol_type))
2984     {
2985       coding->eol_type = CODING_EOL_UNDECIDED;
2986       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2987     }
2988   else if (XFASTINT (eol_type) == 1)
2989     {
2990       coding->eol_type = CODING_EOL_CRLF;
2991       coding->common_flags
2992         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2993     }
2994   else if (XFASTINT (eol_type) == 2)
2995     {
2996       coding->eol_type = CODING_EOL_CR;
2997       coding->common_flags
2998         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2999     }
3000   else
3001     coding->eol_type = CODING_EOL_LF;
3002
3003   coding_type = XVECTOR (coding_spec)->contents[0];
3004   /* Try short cut.  */
3005   if (SYMBOLP (coding_type))
3006     {
3007       if (EQ (coding_type, Qt))
3008         {
3009           coding->type = coding_type_undecided;
3010           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3011         }
3012       else
3013         coding->type = coding_type_no_conversion;
3014       /* Initialize this member.  Any thing other than
3015          CODING_CATEGORY_IDX_UTF_16_BE and
3016          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3017          special treatment in detect_eol.  */
3018       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3019
3020       return 0;
3021     }
3022
3023   /* Get values of coding system properties:
3024      `post-read-conversion', `pre-write-conversion',
3025      `translation-table-for-decode', `translation-table-for-encode'.  */
3026   plist = XVECTOR (coding_spec)->contents[3];
3027   /* Pre & post conversion functions should be disabled if
3028      inhibit_eol_conversion is nozero.  This is the case that a code
3029      conversion function is called while those functions are running.  */
3030   if (! inhibit_pre_post_conversion)
3031     {
3032       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3033       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3034     }
3035   val = Fplist_get (plist, Qtranslation_table_for_decode);
3036   if (SYMBOLP (val))
3037     val = Fget (val, Qtranslation_table_for_decode);
3038   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3039   val = Fplist_get (plist, Qtranslation_table_for_encode);
3040   if (SYMBOLP (val))
3041     val = Fget (val, Qtranslation_table_for_encode);
3042   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3043   val = Fplist_get (plist, Qcoding_category);
3044   if (!NILP (val))
3045     {
3046       val = Fget (val, Qcoding_category_index);
3047       if (INTEGERP (val))
3048         coding->category_idx = XINT (val);
3049       else
3050         goto label_invalid_coding_system;
3051     }
3052   else
3053     goto label_invalid_coding_system;
3054
3055   /* If the coding system has non-nil `composition' property, enable
3056      composition handling.  */
3057   val = Fplist_get (plist, Qcomposition);
3058   if (!NILP (val))
3059     coding->composing = COMPOSITION_NO;
3060
3061   switch (XFASTINT (coding_type))
3062     {
3063     case 0:
3064       coding->type = coding_type_emacs_mule;
3065       if (!NILP (coding->post_read_conversion))
3066         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3067       if (!NILP (coding->pre_write_conversion))
3068         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3069       break;
3070
3071     case 1:
3072       coding->type = coding_type_sjis;
3073       coding->common_flags
3074         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3075       break;
3076
3077     case 2:
3078       coding->type = coding_type_iso2022;
3079       coding->common_flags
3080         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3081       {
3082         Lisp_Object val, temp;
3083         Lisp_Object *flags;
3084         int i, charset, reg_bits = 0;
3085
3086         val = XVECTOR (coding_spec)->contents[4];
3087
3088         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3089           goto label_invalid_coding_system;
3090
3091         flags = XVECTOR (val)->contents;
3092         coding->flags
3093           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3094              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3095              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3096              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3097              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3098              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3099              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3100              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3101              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3102              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3103              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3104              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3105              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3106              );
3107
3108         /* Invoke graphic register 0 to plane 0.  */
3109         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3110         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3111         CODING_SPEC_ISO_INVOCATION (coding, 1)
3112           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3113         /* Not single shifting at first.  */
3114         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3115         /* Beginning of buffer should also be regarded as bol. */
3116         CODING_SPEC_ISO_BOL (coding) = 1;
3117
3118         for (charset = 0; charset <= MAX_CHARSET; charset++)
3119           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3120         val = Vcharset_revision_alist;
3121         while (CONSP (val))
3122           {
3123             charset = get_charset_id (Fcar_safe (XCAR (val)));
3124             if (charset >= 0
3125                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3126                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3127               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3128             val = XCDR (val);
3129           }
3130
3131         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3132            FLAGS[REG] can be one of below:
3133                 integer CHARSET: CHARSET occupies register I,
3134                 t: designate nothing to REG initially, but can be used
3135                   by any charsets,
3136                 list of integer, nil, or t: designate the first
3137                   element (if integer) to REG initially, the remaining
3138                   elements (if integer) is designated to REG on request,
3139                   if an element is t, REG can be used by any charsets,
3140                 nil: REG is never used.  */
3141         for (charset = 0; charset <= MAX_CHARSET; charset++)
3142           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3143             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3144         for (i = 0; i < 4; i++)
3145           {
3146             if (INTEGERP (flags[i])
3147                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3148                 || (charset = get_charset_id (flags[i])) >= 0)
3149               {
3150                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3151                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3152               }
3153             else if (EQ (flags[i], Qt))
3154               {
3155                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3156                 reg_bits |= 1 << i;
3157                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3158               }
3159             else if (CONSP (flags[i]))
3160               {
3161                 Lisp_Object tail;
3162                 tail = flags[i];
3163
3164                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3165                 if (INTEGERP (XCAR (tail))
3166                     && (charset = XINT (XCAR (tail)),
3167                         CHARSET_VALID_P (charset))
3168                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3169                   {
3170                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3171                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3172                   }
3173                 else
3174                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3175                 tail = XCDR (tail);
3176                 while (CONSP (tail))
3177                   {
3178                     if (INTEGERP (XCAR (tail))
3179                         && (charset = XINT (XCAR (tail)),
3180                             CHARSET_VALID_P (charset))
3181                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3182                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3183                         = i;
3184                     else if (EQ (XCAR (tail), Qt))
3185                       reg_bits |= 1 << i;
3186                     tail = XCDR (tail);
3187                   }
3188               }
3189             else
3190               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3191
3192             CODING_SPEC_ISO_DESIGNATION (coding, i)
3193               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3194           }
3195
3196         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3197           {
3198             /* REG 1 can be used only by locking shift in 7-bit env.  */
3199             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3200               reg_bits &= ~2;
3201             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3202               /* Without any shifting, only REG 0 and 1 can be used.  */
3203               reg_bits &= 3;
3204           }
3205
3206         if (reg_bits)
3207           for (charset = 0; charset <= MAX_CHARSET; charset++)
3208             {
3209               if (CHARSET_VALID_P (charset)
3210                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3211                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3212                 {
3213                   /* There exist some default graphic registers to be
3214                      used by CHARSET.  */
3215
3216                   /* We had better avoid designating a charset of
3217                      CHARS96 to REG 0 as far as possible.  */
3218                   if (CHARSET_CHARS (charset) == 96)
3219                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3220                       = (reg_bits & 2
3221                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3222                   else
3223                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3224                       = (reg_bits & 1
3225                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3226                 }
3227             }
3228       }
3229       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3230       coding->spec.iso2022.last_invalid_designation_register = -1;
3231       break;
3232
3233     case 3:
3234       coding->type = coding_type_big5;
3235       coding->common_flags
3236         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3237       coding->flags
3238         = (NILP (XVECTOR (coding_spec)->contents[4])
3239            ? CODING_FLAG_BIG5_HKU
3240            : CODING_FLAG_BIG5_ETEN);
3241       break;
3242
3243     case 4:
3244       coding->type = coding_type_ccl;
3245       coding->common_flags
3246         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3247       {
3248         val = XVECTOR (coding_spec)->contents[4];
3249         if (! CONSP (val)
3250             || setup_ccl_program (&(coding->spec.ccl.decoder),
3251                                   XCAR (val)) < 0
3252             || setup_ccl_program (&(coding->spec.ccl.encoder),
3253                                   XCDR (val)) < 0)
3254           goto label_invalid_coding_system;
3255
3256         bzero (coding->spec.ccl.valid_codes, 256);
3257         val = Fplist_get (plist, Qvalid_codes);
3258         if (CONSP (val))
3259           {
3260             Lisp_Object this;
3261
3262             for (; CONSP (val); val = XCDR (val))
3263               {
3264                 this = XCAR (val);
3265                 if (INTEGERP (this)
3266                     && XINT (this) >= 0 && XINT (this) < 256)
3267                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3268                 else if (CONSP (this)
3269                          && INTEGERP (XCAR (this))
3270                          && INTEGERP (XCDR (this)))
3271                   {
3272                     int start = XINT (XCAR (this));
3273                     int end = XINT (XCDR (this));
3274
3275                     if (start >= 0 && start <= end && end < 256)
3276                       while (start <= end)
3277                         coding->spec.ccl.valid_codes[start++] = 1;
3278                   }
3279               }
3280           }
3281       }
3282       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3283       coding->spec.ccl.cr_carryover = 0;
3284       break;
3285
3286     case 5:
3287       coding->type = coding_type_raw_text;
3288       break;
3289
3290     default:
3291       goto label_invalid_coding_system;
3292     }
3293   return 0;
3294
3295  label_invalid_coding_system:
3296   coding->type = coding_type_no_conversion;
3297   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3298   coding->common_flags = 0;
3299   coding->eol_type = CODING_EOL_LF;
3300   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3301   return -1;
3302 }
3303
3304 /* Free memory blocks allocated for storing composition information.  */
3305
3306 void
3307 coding_free_composition_data (coding)
3308      struct coding_system *coding;
3309 {
3310   struct composition_data *cmp_data = coding->cmp_data, *next;
3311
3312   if (!cmp_data)
3313     return;
3314   /* Memory blocks are chained.  At first, rewind to the first, then,
3315      free blocks one by one.  */
3316   while (cmp_data->prev)
3317     cmp_data = cmp_data->prev;
3318   while (cmp_data)
3319     {
3320       next = cmp_data->next;
3321       xfree (cmp_data);
3322       cmp_data = next;
3323     }
3324   coding->cmp_data = NULL;
3325 }
3326
3327 /* Set `char_offset' member of all memory blocks pointed by
3328    coding->cmp_data to POS.  */
3329
3330 void
3331 coding_adjust_composition_offset (coding, pos)
3332      struct coding_system *coding;
3333      int pos;
3334 {
3335   struct composition_data *cmp_data;
3336
3337   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3338     cmp_data->char_offset = pos;
3339 }
3340
3341 /* Setup raw-text or one of its subsidiaries in the structure
3342    coding_system CODING according to the already setup value eol_type
3343    in CODING.  CODING should be setup for some coding system in
3344    advance.  */
3345
3346 void
3347 setup_raw_text_coding_system (coding)
3348      struct coding_system *coding;
3349 {
3350   if (coding->type != coding_type_raw_text)
3351     {
3352       coding->symbol = Qraw_text;
3353       coding->type = coding_type_raw_text;
3354       if (coding->eol_type != CODING_EOL_UNDECIDED)
3355         {
3356           Lisp_Object subsidiaries;
3357           subsidiaries = Fget (Qraw_text, Qeol_type);
3358
3359           if (VECTORP (subsidiaries)
3360               && XVECTOR (subsidiaries)->size == 3)
3361             coding->symbol
3362               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3363         }
3364       setup_coding_system (coding->symbol, coding);
3365     }
3366   return;
3367 }
3368
3369 /* Emacs has a mechanism to automatically detect a coding system if it
3370    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3371    it's impossible to distinguish some coding systems accurately
3372    because they use the same range of codes.  So, at first, coding
3373    systems are classified into the following categories:
3374
3375    o coding-category-emacs-mule
3376
3377         The category for a coding system which has the same code range
3378         as Emacs' internal format.  Assigned the coding-system (Lisp
3379         symbol) `emacs-mule' by default.
3380
3381    o coding-category-sjis
3382
3383         The category for a coding system which has the same code range
3384         as SJIS.  Assigned the coding-system (Lisp
3385         symbol) `japanese-shift-jis' by default.
3386
3387    o coding-category-iso-7
3388
3389         The category for a coding system which has the same code range
3390         as ISO2022 of 7-bit environment.  This doesn't use any locking
3391         shift and single shift functions.  This can encode/decode all
3392         charsets.  Assigned the coding-system (Lisp symbol)
3393         `iso-2022-7bit' by default.
3394
3395    o coding-category-iso-7-tight
3396
3397         Same as coding-category-iso-7 except that this can
3398         encode/decode only the specified charsets.
3399
3400    o coding-category-iso-8-1
3401
3402         The category for a coding system which has the same code range
3403         as ISO2022 of 8-bit environment and graphic plane 1 used only
3404         for DIMENSION1 charset.  This doesn't use any locking shift
3405         and single shift functions.  Assigned the coding-system (Lisp
3406         symbol) `iso-latin-1' by default.
3407
3408    o coding-category-iso-8-2
3409
3410         The category for a coding system which has the same code range
3411         as ISO2022 of 8-bit environment and graphic plane 1 used only
3412         for DIMENSION2 charset.  This doesn't use any locking shift
3413         and single shift functions.  Assigned the coding-system (Lisp
3414         symbol) `japanese-iso-8bit' by default.
3415
3416    o coding-category-iso-7-else
3417
3418         The category for a coding system which has the same code range
3419         as ISO2022 of 7-bit environment but uses locking shift or
3420         single shift functions.  Assigned the coding-system (Lisp
3421         symbol) `iso-2022-7bit-lock' by default.
3422
3423    o coding-category-iso-8-else
3424
3425         The category for a coding system which has the same code range
3426         as ISO2022 of 8-bit environment but uses locking shift or
3427         single shift functions.  Assigned the coding-system (Lisp
3428         symbol) `iso-2022-8bit-ss2' by default.
3429
3430    o coding-category-big5
3431
3432         The category for a coding system which has the same code range
3433         as BIG5.  Assigned the coding-system (Lisp symbol)
3434         `cn-big5' by default.
3435
3436    o coding-category-utf-8
3437
3438         The category for a coding system which has the same code range
3439         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3440         symbol) `utf-8' by default.
3441
3442    o coding-category-utf-16-be
3443
3444         The category for a coding system in which a text has an
3445         Unicode signature (cf. Unicode Standard) in the order of BIG
3446         endian at the head.  Assigned the coding-system (Lisp symbol)
3447         `utf-16-be' by default.
3448
3449    o coding-category-utf-16-le
3450
3451         The category for a coding system in which a text has an
3452         Unicode signature (cf. Unicode Standard) in the order of
3453         LITTLE endian at the head.  Assigned the coding-system (Lisp
3454         symbol) `utf-16-le' by default.
3455
3456    o coding-category-ccl
3457
3458         The category for a coding system of which encoder/decoder is
3459         written in CCL programs.  The default value is nil, i.e., no
3460         coding system is assigned.
3461
3462    o coding-category-binary
3463
3464         The category for a coding system not categorized in any of the
3465         above.  Assigned the coding-system (Lisp symbol)
3466         `no-conversion' by default.
3467
3468    Each of them is a Lisp symbol and the value is an actual
3469    `coding-system's (this is also a Lisp symbol) assigned by a user.
3470    What Emacs does actually is to detect a category of coding system.
3471    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3472    decide only one possible category, it selects a category of the
3473    highest priority.  Priorities of categories are also specified by a
3474    user in a Lisp variable `coding-category-list'.
3475
3476 */
3477
3478 static
3479 int ascii_skip_code[256];
3480
3481 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3482    If it detects possible coding systems, return an integer in which
3483    appropriate flag bits are set.  Flag bits are defined by macros
3484    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3485    it should point the table `coding_priorities'.  In that case, only
3486    the flag bit for a coding system of the highest priority is set in
3487    the returned value.
3488
3489    How many ASCII characters are at the head is returned as *SKIP.  */
3490
3491 static int
3492 detect_coding_mask (source, src_bytes, priorities, skip)
3493      unsigned char *source;
3494      int src_bytes, *priorities, *skip;
3495 {
3496   register unsigned char c;
3497   unsigned char *src = source, *src_end = source + src_bytes;
3498   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3499   int i, idx;
3500
3501   /* At first, skip all ASCII characters and control characters except
3502      for three ISO2022 specific control characters.  */
3503   ascii_skip_code[ISO_CODE_SO] = 0;
3504   ascii_skip_code[ISO_CODE_SI] = 0;
3505   ascii_skip_code[ISO_CODE_ESC] = 0;
3506
3507  label_loop_detect_coding:
3508   while (src < src_end && ascii_skip_code[*src]) src++;
3509   *skip = src - source;
3510
3511   if (src >= src_end)
3512     /* We found nothing other than ASCII.  There's nothing to do.  */
3513     return 0;
3514
3515   c = *src;
3516   /* The text seems to be encoded in some multilingual coding system.
3517      Now, try to find in which coding system the text is encoded.  */
3518   if (c < 0x80)
3519     {
3520       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3521       /* C is an ISO2022 specific control code of C0.  */
3522       mask = detect_coding_iso2022 (src, src_end);
3523       if (mask == 0)
3524         {
3525           /* No valid ISO2022 code follows C.  Try again.  */
3526           src++;
3527           if (c == ISO_CODE_ESC)
3528             ascii_skip_code[ISO_CODE_ESC] = 1;
3529           else
3530             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3531           goto label_loop_detect_coding;
3532         }
3533       if (priorities)
3534         {
3535           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3536             {
3537               if (mask & priorities[i])
3538                 return priorities[i];
3539             }
3540           return CODING_CATEGORY_MASK_RAW_TEXT;
3541         }
3542     }
3543   else
3544     {
3545       int try;
3546
3547       if (c < 0xA0)
3548         {
3549           /* C is the first byte of SJIS character code,
3550              or a leading-code of Emacs' internal format (emacs-mule),
3551              or the first byte of UTF-16.  */
3552           try = (CODING_CATEGORY_MASK_SJIS
3553                   | CODING_CATEGORY_MASK_EMACS_MULE
3554                   | CODING_CATEGORY_MASK_UTF_16_BE
3555                   | CODING_CATEGORY_MASK_UTF_16_LE);
3556
3557           /* Or, if C is a special latin extra code,
3558              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3559              or is an ISO2022 control-sequence-introducer (CSI),
3560              we should also consider the possibility of ISO2022 codings.  */
3561           if ((VECTORP (Vlatin_extra_code_table)
3562                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3563               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3564               || (c == ISO_CODE_CSI
3565                   && (src < src_end
3566                       && (*src == ']'
3567                           || ((*src == '0' || *src == '1' || *src == '2')
3568                               && src + 1 < src_end
3569                               && src[1] == ']')))))
3570             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3571                      | CODING_CATEGORY_MASK_ISO_8BIT);
3572         }
3573       else
3574         /* C is a character of ISO2022 in graphic plane right,
3575            or a SJIS's 1-byte character code (i.e. JISX0201),
3576            or the first byte of BIG5's 2-byte code,
3577            or the first byte of UTF-8/16.  */
3578         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3579                 | CODING_CATEGORY_MASK_ISO_8BIT
3580                 | CODING_CATEGORY_MASK_SJIS
3581                 | CODING_CATEGORY_MASK_BIG5
3582                 | CODING_CATEGORY_MASK_UTF_8
3583                 | CODING_CATEGORY_MASK_UTF_16_BE
3584                 | CODING_CATEGORY_MASK_UTF_16_LE);
3585
3586       /* Or, we may have to consider the possibility of CCL.  */
3587       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3588           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3589               ->spec.ccl.valid_codes)[c])
3590         try |= CODING_CATEGORY_MASK_CCL;
3591
3592       mask = 0;
3593       utf16_examined_p = iso2022_examined_p = 0;
3594       if (priorities)
3595         {
3596           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3597             {
3598               if (!iso2022_examined_p
3599                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3600                 {
3601                   mask |= detect_coding_iso2022 (src, src_end);
3602                   iso2022_examined_p = 1;
3603                 }
3604               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3605                 mask |= detect_coding_sjis (src, src_end);
3606               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3607                 mask |= detect_coding_utf_8 (src, src_end);
3608               else if (!utf16_examined_p
3609                        && (priorities[i] & try &
3610                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
3611                 {
3612                   mask |= detect_coding_utf_16 (src, src_end);
3613                   utf16_examined_p = 1;
3614                 }
3615               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3616                 mask |= detect_coding_big5 (src, src_end);
3617               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3618                 mask |= detect_coding_emacs_mule (src, src_end);
3619               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3620                 mask |= detect_coding_ccl (src, src_end);
3621               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3622                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3623               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3624                 mask |= CODING_CATEGORY_MASK_BINARY;
3625               if (mask & priorities[i])
3626                 return priorities[i];
3627             }
3628           return CODING_CATEGORY_MASK_RAW_TEXT;
3629         }
3630       if (try & CODING_CATEGORY_MASK_ISO)
3631         mask |= detect_coding_iso2022 (src, src_end);
3632       if (try & CODING_CATEGORY_MASK_SJIS)
3633         mask |= detect_coding_sjis (src, src_end);
3634       if (try & CODING_CATEGORY_MASK_BIG5)
3635         mask |= detect_coding_big5 (src, src_end);
3636       if (try & CODING_CATEGORY_MASK_UTF_8)
3637         mask |= detect_coding_utf_8 (src, src_end);
3638       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3639         mask |= detect_coding_utf_16 (src, src_end);
3640       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3641         mask |= detect_coding_emacs_mule (src, src_end);
3642       if (try & CODING_CATEGORY_MASK_CCL)
3643         mask |= detect_coding_ccl (src, src_end);
3644     }
3645   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3646 }
3647
3648 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3649    The information of the detected coding system is set in CODING.  */
3650
3651 void
3652 detect_coding (coding, src, src_bytes)
3653      struct coding_system *coding;
3654      unsigned char *src;
3655      int src_bytes;
3656 {
3657   unsigned int idx;
3658   int skip, mask, i;
3659   Lisp_Object val;
3660
3661   val = Vcoding_category_list;
3662   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3663   coding->heading_ascii = skip;
3664
3665   if (!mask) return;
3666
3667   /* We found a single coding system of the highest priority in MASK.  */
3668   idx = 0;
3669   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3670   if (! mask)
3671     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3672
3673   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3674
3675   if (coding->eol_type != CODING_EOL_UNDECIDED)
3676     {
3677       Lisp_Object tmp;
3678
3679       tmp = Fget (val, Qeol_type);
3680       if (VECTORP (tmp))
3681         val = XVECTOR (tmp)->contents[coding->eol_type];
3682     }
3683
3684   /* Setup this new coding system while preserving some slots.  */
3685   {
3686     int src_multibyte = coding->src_multibyte;
3687     int dst_multibyte = coding->dst_multibyte;
3688
3689     setup_coding_system (val, coding);
3690     coding->src_multibyte = src_multibyte;
3691     coding->dst_multibyte = dst_multibyte;
3692     coding->heading_ascii = skip;
3693   }
3694 }
3695
3696 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3697    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3698    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3699
3700    How many non-eol characters are at the head is returned as *SKIP.  */
3701
3702 #define MAX_EOL_CHECK_COUNT 3
3703
3704 static int
3705 detect_eol_type (source, src_bytes, skip)
3706      unsigned char *source;
3707      int src_bytes, *skip;
3708 {
3709   unsigned char *src = source, *src_end = src + src_bytes;
3710   unsigned char c;
3711   int total = 0;                /* How many end-of-lines are found so far.  */
3712   int eol_type = CODING_EOL_UNDECIDED;
3713   int this_eol_type;
3714
3715   *skip = 0;
3716
3717   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3718     {
3719       c = *src++;
3720       if (c == '\n' || c == '\r')
3721         {
3722           if (*skip == 0)
3723             *skip = src - 1 - source;
3724           total++;
3725           if (c == '\n')
3726             this_eol_type = CODING_EOL_LF;
3727           else if (src >= src_end || *src != '\n')
3728             this_eol_type = CODING_EOL_CR;
3729           else
3730             this_eol_type = CODING_EOL_CRLF, src++;
3731
3732           if (eol_type == CODING_EOL_UNDECIDED)
3733             /* This is the first end-of-line.  */
3734             eol_type = this_eol_type;
3735           else if (eol_type != this_eol_type)
3736             {
3737               /* The found type is different from what found before.  */
3738               eol_type = CODING_EOL_INCONSISTENT;
3739               break;
3740             }
3741         }
3742     }
3743
3744   if (*skip == 0)
3745     *skip = src_end - source;
3746   return eol_type;
3747 }
3748
3749 /* Like detect_eol_type, but detect EOL type in 2-octet
3750    big-endian/little-endian format for coding systems utf-16-be and
3751    utf-16-le.  */
3752
3753 static int
3754 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3755      unsigned char *source;
3756      int src_bytes, *skip;
3757 {
3758   unsigned char *src = source, *src_end = src + src_bytes;
3759   unsigned int c1, c2;
3760   int total = 0;                /* How many end-of-lines are found so far.  */
3761   int eol_type = CODING_EOL_UNDECIDED;
3762   int this_eol_type;
3763   int msb, lsb;
3764
3765   if (big_endian_p)
3766     msb = 0, lsb = 1;
3767   else
3768     msb = 1, lsb = 0;
3769
3770   *skip = 0;
3771
3772   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3773     {
3774       c1 = (src[msb] << 8) | (src[lsb]);
3775       src += 2;
3776
3777       if (c1 == '\n' || c1 == '\r')
3778         {
3779           if (*skip == 0)
3780             *skip = src - 2 - source;
3781           total++;
3782           if (c1 == '\n')
3783             {
3784               this_eol_type = CODING_EOL_LF;
3785             }
3786           else
3787             {
3788               if ((src + 1) >= src_end)
3789                 {
3790                   this_eol_type = CODING_EOL_CR;
3791                 }
3792               else
3793                 {
3794                   c2 = (src[msb] << 8) | (src[lsb]);
3795                   if (c2 == '\n')
3796                     this_eol_type = CODING_EOL_CRLF, src += 2;
3797                   else
3798                     this_eol_type = CODING_EOL_CR;
3799                 }
3800             }
3801
3802           if (eol_type == CODING_EOL_UNDECIDED)
3803             /* This is the first end-of-line.  */
3804             eol_type = this_eol_type;
3805           else if (eol_type != this_eol_type)
3806             {
3807               /* The found type is different from what found before.  */
3808               eol_type = CODING_EOL_INCONSISTENT;
3809               break;
3810             }
3811         }
3812     }
3813
3814   if (*skip == 0)
3815     *skip = src_end - source;
3816   return eol_type;
3817 }
3818
3819 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3820    is encoded.  If it detects an appropriate format of end-of-line, it
3821    sets the information in *CODING.  */
3822
3823 void
3824 detect_eol (coding, src, src_bytes)
3825      struct coding_system *coding;
3826      unsigned char *src;
3827      int src_bytes;
3828 {
3829   Lisp_Object val;
3830   int skip;
3831   int eol_type;
3832
3833   switch (coding->category_idx)
3834     {
3835     case CODING_CATEGORY_IDX_UTF_16_BE:
3836       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3837       break;
3838     case CODING_CATEGORY_IDX_UTF_16_LE:
3839       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3840       break;
3841     default:
3842       eol_type = detect_eol_type (src, src_bytes, &skip);
3843       break;
3844     }
3845
3846   if (coding->heading_ascii > skip)
3847     coding->heading_ascii = skip;
3848   else
3849     skip = coding->heading_ascii;
3850
3851   if (eol_type == CODING_EOL_UNDECIDED)
3852     return;
3853   if (eol_type == CODING_EOL_INCONSISTENT)
3854     {
3855 #if 0
3856       /* This code is suppressed until we find a better way to
3857          distinguish raw text file and binary file.  */
3858
3859       /* If we have already detected that the coding is raw-text, the
3860          coding should actually be no-conversion.  */
3861       if (coding->type == coding_type_raw_text)
3862         {
3863           setup_coding_system (Qno_conversion, coding);
3864           return;
3865         }
3866       /* Else, let's decode only text code anyway.  */
3867 #endif /* 0 */
3868       eol_type = CODING_EOL_LF;
3869     }
3870
3871   val = Fget (coding->symbol, Qeol_type);
3872   if (VECTORP (val) && XVECTOR (val)->size == 3)
3873     {
3874       int src_multibyte = coding->src_multibyte;
3875       int dst_multibyte = coding->dst_multibyte;
3876
3877       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3878       coding->src_multibyte = src_multibyte;
3879       coding->dst_multibyte = dst_multibyte;
3880       coding->heading_ascii = skip;
3881     }
3882 }
3883
3884 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3885
3886 #define DECODING_BUFFER_MAG(coding)                     \
3887   (coding->type == coding_type_iso2022                  \
3888    ? 3                                                  \
3889    : (coding->type == coding_type_ccl                   \
3890       ? coding->spec.ccl.decoder.buf_magnification      \
3891       : 2))
3892
3893 /* Return maximum size (bytes) of a buffer enough for decoding
3894    SRC_BYTES of text encoded in CODING.  */
3895
3896 int
3897 decoding_buffer_size (coding, src_bytes)
3898      struct coding_system *coding;
3899      int src_bytes;
3900 {
3901   return (src_bytes * DECODING_BUFFER_MAG (coding)
3902           + CONVERSION_BUFFER_EXTRA_ROOM);
3903 }
3904
3905 /* Return maximum size (bytes) of a buffer enough for encoding
3906    SRC_BYTES of text to CODING.  */
3907
3908 int
3909 encoding_buffer_size (coding, src_bytes)
3910      struct coding_system *coding;
3911      int src_bytes;
3912 {
3913   int magnification;
3914
3915   if (coding->type == coding_type_ccl)
3916     magnification = coding->spec.ccl.encoder.buf_magnification;
3917   else if (CODING_REQUIRE_ENCODING (coding))
3918     magnification = 3;
3919   else
3920     magnification = 1;
3921
3922   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3923 }
3924
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* Size in bytes of the memory at DATA.  */
  int on_stack;                 /* 1 if allocated by alloca.  */
  unsigned char *data;
};

/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  Parenthesized so that the value survives
   being used inside a larger expression.  */
#define MAX_ALLOCA (16*1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests below MAX_ALLOCA are served by alloca, the others by
   xmalloc; BUF.on_stack records which.  This has to stay a macro
   because alloca memory must belong to the caller's frame.  LEN is
   evaluated more than once, so it must have no side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
3952
3953 /* Double the allocated memory for *BUF.  */
3954 static void
3955 extend_conversion_buffer (buf)
3956      struct conversion_buffer *buf;
3957 {
3958   if (buf->on_stack)
3959     {
3960       unsigned char *save = buf->data;
3961       buf->data = (unsigned char *) xmalloc (buf->size * 2);
3962       bcopy (save, buf->data, buf->size);
3963       buf->on_stack = 0;
3964     }
3965   else
3966     {
3967       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
3968     }
3969   buf->size *= 2;
3970 }
3971
3972 /* Free the allocated memory for BUF if it is not on stack.  */
3973 static void
3974 free_conversion_buffer (buf)
3975      struct conversion_buffer *buf;
3976 {
3977   if (!buf->on_stack)
3978     xfree (buf->data);
3979 }
3980
3981 int
3982 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3983      struct coding_system *coding;
3984      unsigned char *source, *destination;
3985      int src_bytes, dst_bytes, encodep;
3986 {
3987   struct ccl_program *ccl
3988     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3989   int result;
3990
3991   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3992   if (encodep)
3993     ccl->eol_type = coding->eol_type;
3994   ccl->multibyte = coding->src_multibyte;
3995   coding->produced = ccl_driver (ccl, source, destination,
3996                                  src_bytes, dst_bytes, &(coding->consumed));
3997   if (encodep)
3998     coding->produced_char = coding->produced;
3999   else
4000     {
4001       int bytes
4002         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4003       coding->produced = str_as_multibyte (destination, bytes,
4004                                            coding->produced,
4005                                            &(coding->produced_char));
4006     }
4007
4008   switch (ccl->status)
4009     {
4010     case CCL_STAT_SUSPEND_BY_SRC:
4011       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4012       break;
4013     case CCL_STAT_SUSPEND_BY_DST:
4014       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4015       break;
4016     case CCL_STAT_QUIT:
4017     case CCL_STAT_INVALID_CMD:
4018       coding->result = CODING_FINISH_INTERRUPT;
4019       break;
4020     default:
4021       coding->result = CODING_FINISH_NORMAL;
4022       break;
4023     }
4024   return coding->result;
4025 }
4026
4027 /* Decode EOL format of the text at PTR of BYTES length destructively
4028    according to CODING->eol_type.  This is called after the CCL
4029    program produced a decoded text at PTR.  If we do CRLF->LF
4030    conversion, update CODING->produced and CODING->produced_char.  */
4031
4032 static void
4033 decode_eol_post_ccl (coding, ptr, bytes)
4034      struct coding_system *coding;
4035      unsigned char *ptr;
4036      int bytes;
4037 {
4038   Lisp_Object val, saved_coding_symbol;
4039   unsigned char *pend = ptr + bytes;
4040   int dummy;
4041
4042   /* Remember the current coding system symbol.  We set it back when
4043      an inconsistent EOL is found so that `last-coding-system-used' is
4044      set to the coding system that doesn't specify EOL conversion.  */
4045   saved_coding_symbol = coding->symbol;
4046
4047   coding->spec.ccl.cr_carryover = 0;
4048   if (coding->eol_type == CODING_EOL_UNDECIDED)
4049     {
4050       /* Here, to avoid the call of setup_coding_system, we directly
4051          call detect_eol_type.  */
4052       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4053       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4054         coding->eol_type = CODING_EOL_LF;
4055       if (coding->eol_type != CODING_EOL_UNDECIDED)
4056         {
4057           val = Fget (coding->symbol, Qeol_type);
4058           if (VECTORP (val) && XVECTOR (val)->size == 3)
4059             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4060         }
4061       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4062     }
4063
4064   if (coding->eol_type == CODING_EOL_LF
4065       || coding->eol_type == CODING_EOL_UNDECIDED)
4066     {
4067       /* We have nothing to do.  */
4068       ptr = pend;
4069     }
4070   else if (coding->eol_type == CODING_EOL_CRLF)
4071     {
4072       unsigned char *pstart = ptr, *p = ptr;
4073
4074       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4075           && *(pend - 1) == '\r')
4076         {
4077           /* If the last character is CR, we can't handle it here
4078              because LF will be in the not-yet-decoded source text.
4079              Recorded that the CR is not yet processed.  */
4080           coding->spec.ccl.cr_carryover = 1;
4081           coding->produced--;
4082           coding->produced_char--;
4083           pend--;
4084         }
4085       while (ptr < pend)
4086         {
4087           if (*ptr == '\r')
4088             {
4089               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4090                 {
4091                   *p++ = '\n';
4092                   ptr += 2;
4093                 }
4094               else
4095                 {
4096                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4097                     goto undo_eol_conversion;
4098                   *p++ = *ptr++;
4099                 }
4100             }
4101           else if (*ptr == '\n'
4102                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4103             goto undo_eol_conversion;
4104           else
4105             *p++ = *ptr++;
4106           continue;
4107
4108         undo_eol_conversion:
4109           /* We have faced with inconsistent EOL format at PTR.
4110              Convert all LFs before PTR back to CRLFs.  */
4111           for (p--, ptr--; p >= pstart; p--)
4112             {
4113               if (*p == '\n')
4114                 *ptr-- = '\n', *ptr-- = '\r';
4115               else
4116                 *ptr-- = *p;
4117             }
4118           /*  If carryover is recorded, cancel it because we don't
4119               convert CRLF anymore.  */
4120           if (coding->spec.ccl.cr_carryover)
4121             {
4122               coding->spec.ccl.cr_carryover = 0;
4123               coding->produced++;
4124               coding->produced_char++;
4125               pend++;
4126             }
4127           p = ptr = pend;
4128           coding->eol_type = CODING_EOL_LF;
4129           coding->symbol = saved_coding_symbol;
4130         }
4131       if (p < pend)
4132         {
4133           /* As each two-byte sequence CRLF was converted to LF, (PEND
4134              - P) is the number of deleted characters.  */
4135           coding->produced -= pend - p;
4136           coding->produced_char -= pend - p;
4137         }
4138     }
4139   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4140     {
4141       unsigned char *p = ptr;
4142
4143       for (; ptr < pend; ptr++)
4144         {
4145           if (*ptr == '\r')
4146             *ptr = '\n';
4147           else if (*ptr == '\n'
4148                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4149             {
4150               for (; p < ptr; p++)
4151                 {
4152                   if (*p == '\n')
4153                     *p = '\r';
4154                 }
4155               ptr = pend;
4156               coding->eol_type = CODING_EOL_LF;
4157               coding->symbol = saved_coding_symbol;
4158             }
4159         }
4160     }
4161 }
4162
4163 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4164    decoding, it may detect coding system and format of end-of-line if
4165    those are not yet decided.  The source should be unibyte, the
4166    result is multibyte if CODING->dst_multibyte is nonzero, else
4167    unibyte.  */
4168
4169 int
4170 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4171      struct coding_system *coding;
4172      unsigned char *source, *destination;
4173      int src_bytes, dst_bytes;
4174 {
4175   if (coding->type == coding_type_undecided)
4176     detect_coding (coding, source, src_bytes);
4177
4178   if (coding->eol_type == CODING_EOL_UNDECIDED
4179       && coding->type != coding_type_ccl)
4180     detect_eol (coding, source, src_bytes);
4181
4182   coding->produced = coding->produced_char = 0;
4183   coding->consumed = coding->consumed_char = 0;
4184   coding->errors = 0;
4185   coding->result = CODING_FINISH_NORMAL;
4186
4187   switch (coding->type)
4188     {
4189     case coding_type_sjis:
4190       decode_coding_sjis_big5 (coding, source, destination,
4191                                src_bytes, dst_bytes, 1);
4192       break;
4193
4194     case coding_type_iso2022:
4195       decode_coding_iso2022 (coding, source, destination,
4196                              src_bytes, dst_bytes);
4197       break;
4198
4199     case coding_type_big5:
4200       decode_coding_sjis_big5 (coding, source, destination,
4201                                src_bytes, dst_bytes, 0);
4202       break;
4203
4204     case coding_type_emacs_mule:
4205       decode_coding_emacs_mule (coding, source, destination,
4206                                 src_bytes, dst_bytes);
4207       break;
4208
4209     case coding_type_ccl:
4210       if (coding->spec.ccl.cr_carryover)
4211         {
4212           /* Set the CR which is not processed by the previous call of
4213              decode_eol_post_ccl in DESTINATION.  */
4214           *destination = '\r';
4215           coding->produced++;
4216           coding->produced_char++;
4217           dst_bytes--;
4218         }
4219       ccl_coding_driver (coding, source,
4220                          destination + coding->spec.ccl.cr_carryover,
4221                          src_bytes, dst_bytes, 0);
4222       if (coding->eol_type != CODING_EOL_LF)
4223         decode_eol_post_ccl (coding, destination, coding->produced);
4224       break;
4225
4226     default:
4227       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4228     }
4229
4230   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4231       && coding->mode & CODING_MODE_LAST_BLOCK
4232       && coding->consumed == src_bytes)
4233     coding->result = CODING_FINISH_NORMAL;
4234
4235   if (coding->mode & CODING_MODE_LAST_BLOCK
4236       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4237     {
4238       unsigned char *src = source + coding->consumed;
4239       unsigned char *dst = destination + coding->produced;
4240
4241       src_bytes -= coding->consumed;
4242       coding->errors++;
4243       if (COMPOSING_P (coding))
4244         DECODE_COMPOSITION_END ('1');
4245       while (src_bytes--)
4246         {
4247           int c = *src++;
4248           dst += CHAR_STRING (c, dst);
4249           coding->produced_char++;
4250         }
4251       coding->consumed = coding->consumed_char = src - source;
4252       coding->produced = dst - destination;
4253       coding->result = CODING_FINISH_NORMAL;
4254     }
4255
4256   if (!coding->dst_multibyte)
4257     {
4258       coding->produced = str_as_unibyte (destination, coding->produced);
4259       coding->produced_char = coding->produced;
4260     }
4261
4262   return coding->result;
4263 }
4264
4265 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4266    multibyteness of the source is CODING->src_multibyte, the
4267    multibyteness of the result is always unibyte.  */
4268
4269 int
4270 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4271      struct coding_system *coding;
4272      unsigned char *source, *destination;
4273      int src_bytes, dst_bytes;
4274 {
4275   coding->produced = coding->produced_char = 0;
4276   coding->consumed = coding->consumed_char = 0;
4277   coding->errors = 0;
4278   coding->result = CODING_FINISH_NORMAL;
4279
4280   switch (coding->type)
4281     {
4282     case coding_type_sjis:
4283       encode_coding_sjis_big5 (coding, source, destination,
4284                                src_bytes, dst_bytes, 1);
4285       break;
4286
4287     case coding_type_iso2022:
4288       encode_coding_iso2022 (coding, source, destination,
4289                              src_bytes, dst_bytes);
4290       break;
4291
4292     case coding_type_big5:
4293       encode_coding_sjis_big5 (coding, source, destination,
4294                                src_bytes, dst_bytes, 0);
4295       break;
4296
4297     case coding_type_emacs_mule:
4298       encode_coding_emacs_mule (coding, source, destination,
4299                                 src_bytes, dst_bytes);
4300       break;
4301
4302     case coding_type_ccl:
4303       ccl_coding_driver (coding, source, destination,
4304                          src_bytes, dst_bytes, 1);
4305       break;
4306
4307     default:
4308       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4309     }
4310
4311   if (coding->mode & CODING_MODE_LAST_BLOCK
4312       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4313     {
4314       unsigned char *src = source + coding->consumed;
4315       unsigned char *src_end = src + src_bytes;
4316       unsigned char *dst = destination + coding->produced;
4317
4318       if (coding->type == coding_type_iso2022)
4319         ENCODE_RESET_PLANE_AND_REGISTER;
4320       if (COMPOSING_P (coding))
4321         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4322       if (coding->consumed < src_bytes)
4323         {
4324           int len = src_bytes - coding->consumed;
4325
4326           BCOPY_SHORT (source + coding->consumed, dst, len);
4327           if (coding->src_multibyte)
4328             len = str_as_unibyte (dst, len);
4329           dst += len;
4330           coding->consumed = src_bytes;
4331         }
4332       coding->produced = coding->produced_char = dst - destination;
4333       coding->result = CODING_FINISH_NORMAL;
4334     }
4335
4336   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4337       && coding->consumed == src_bytes)
4338     coding->result = CODING_FINISH_NORMAL;
4339
4340   return coding->result;
4341 }
4342
4343 /* Scan text in the region between *BEG and *END (byte positions),
4344    skip characters which we don't have to decode by coding system
4345    CODING at the head and tail, then set *BEG and *END to the region
4346    of the text we actually have to convert.  The caller should move
4347    the gap out of the region in advance if the region is from a
4348    buffer.
4349
4350    If STR is not NULL, *BEG and *END are indices into STR.  */
4351
4352 static void
4353 shrink_decoding_region (beg, end, coding, str)
4354      int *beg, *end;
4355      struct coding_system *coding;
4356      unsigned char *str;
4357 {
4358   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4359   int eol_conversion;
4360   Lisp_Object translation_table;
4361
4362   if (coding->type == coding_type_ccl
4363       || coding->type == coding_type_undecided
4364       || coding->eol_type != CODING_EOL_LF
4365       || !NILP (coding->post_read_conversion)
4366       || coding->composing != COMPOSITION_DISABLED)
4367     {
4368       /* We can't skip any data.  */
4369       return;
4370     }
4371   if (coding->type == coding_type_no_conversion
4372       || coding->type == coding_type_raw_text
4373       || coding->type == coding_type_emacs_mule)
4374     {
4375       /* We need no conversion, but don't have to skip any data here.
4376          Decoding routine handles them effectively anyway.  */
4377       return;
4378     }
4379
4380   translation_table = coding->translation_table_for_decode;
4381   if (NILP (translation_table) && !NILP (Venable_character_translation))
4382     translation_table = Vstandard_translation_table_for_decode;
4383   if (CHAR_TABLE_P (translation_table))
4384     {
4385       int i;
4386       for (i = 0; i < 128; i++)
4387         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4388           break;
4389       if (i < 128)
4390         /* Some ASCII character should be translated.  We give up
4391            shrinking.  */
4392         return;
4393     }
4394
4395   if (coding->heading_ascii >= 0)
4396     /* Detection routine has already found how much we can skip at the
4397        head.  */
4398     *beg += coding->heading_ascii;
4399
4400   if (str)
4401     {
4402       begp_orig = begp = str + *beg;
4403       endp_orig = endp = str + *end;
4404     }
4405   else
4406     {
4407       begp_orig = begp = BYTE_POS_ADDR (*beg);
4408       endp_orig = endp = begp + *end - *beg;
4409     }
4410
4411   eol_conversion = (coding->eol_type == CODING_EOL_CR
4412                     || coding->eol_type == CODING_EOL_CRLF);
4413
4414   switch (coding->type)
4415     {
4416     case coding_type_sjis:
4417     case coding_type_big5:
4418       /* We can skip all ASCII characters at the head.  */
4419       if (coding->heading_ascii < 0)
4420         {
4421           if (eol_conversion)
4422             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4423           else
4424             while (begp < endp && *begp < 0x80) begp++;
4425         }
4426       /* We can skip all ASCII characters at the tail except for the
4427          second byte of SJIS or BIG5 code.  */
4428       if (eol_conversion)
4429         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4430       else
4431         while (begp < endp && endp[-1] < 0x80) endp--;
4432       /* Do not consider LF as ascii if preceded by CR, since that
4433          confuses eol decoding. */
4434       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4435         endp++;
4436       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4437         endp++;
4438       break;
4439
4440     case coding_type_iso2022:
4441       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4442         /* We can't skip any data.  */
4443         break;
4444       if (coding->heading_ascii < 0)
4445         {
4446           /* We can skip all ASCII characters at the head except for a
4447              few control codes.  */
4448           while (begp < endp && (c = *begp) < 0x80
4449                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4450                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4451                  && (!eol_conversion || c != ISO_CODE_LF))
4452             begp++;
4453         }
4454       switch (coding->category_idx)
4455         {
4456         case CODING_CATEGORY_IDX_ISO_8_1:
4457         case CODING_CATEGORY_IDX_ISO_8_2:
4458           /* We can skip all ASCII characters at the tail.  */
4459           if (eol_conversion)
4460             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4461           else
4462             while (begp < endp && endp[-1] < 0x80) endp--;
4463           /* Do not consider LF as ascii if preceded by CR, since that
4464              confuses eol decoding. */
4465           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4466             endp++;
4467           break;
4468
4469         case CODING_CATEGORY_IDX_ISO_7:
4470         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4471           {
4472             /* We can skip all charactes at the tail except for 8-bit
4473                codes and ESC and the following 2-byte at the tail.  */
4474             unsigned char *eight_bit = NULL;
4475
4476             if (eol_conversion)
4477               while (begp < endp
4478                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4479                 {
4480                   if (!eight_bit && c & 0x80) eight_bit = endp;
4481                   endp--;
4482                 }
4483             else
4484               while (begp < endp
4485                      && (c = endp[-1]) != ISO_CODE_ESC)
4486                 {
4487                   if (!eight_bit && c & 0x80) eight_bit = endp;
4488                   endp--;
4489                 }
4490             /* Do not consider LF as ascii if preceded by CR, since that
4491                confuses eol decoding. */
4492             if (begp < endp && endp < endp_orig
4493                 && endp[-1] == '\r' && endp[0] == '\n')
4494               endp++;
4495             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4496               {
4497                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4498                   /* This is an ASCII designation sequence.  We can
4499                      surely skip the tail.  But, if we have
4500                      encountered an 8-bit code, skip only the codes
4501                      after that.  */
4502                   endp = eight_bit ? eight_bit : endp + 2;
4503                 else
4504                   /* Hmmm, we can't skip the tail.  */
4505                   endp = endp_orig;
4506               }
4507             else if (eight_bit)
4508               endp = eight_bit;
4509           }
4510         }
4511       break;
4512
4513     default:
4514       abort ();
4515     }
4516   *beg += begp - begp_orig;
4517   *end += endp - endp_orig;
4518   return;
4519 }
4520
4521 /* Like shrink_decoding_region but for encoding.  */
4522
4523 static void
4524 shrink_encoding_region (beg, end, coding, str)
4525      int *beg, *end;
4526      struct coding_system *coding;
4527      unsigned char *str;
4528 {
4529   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4530   int eol_conversion;
4531   Lisp_Object translation_table;
4532
4533   if (coding->type == coding_type_ccl
4534       || coding->eol_type == CODING_EOL_CRLF
4535       || coding->eol_type == CODING_EOL_CR
4536       || coding->cmp_data && coding->cmp_data->used > 0)
4537     {
4538       /* We can't skip any data.  */
4539       return;
4540     }
4541   if (coding->type == coding_type_no_conversion
4542       || coding->type == coding_type_raw_text
4543       || coding->type == coding_type_emacs_mule
4544       || coding->type == coding_type_undecided)
4545     {
4546       /* We need no conversion, but don't have to skip any data here.
4547          Encoding routine handles them effectively anyway.  */
4548       return;
4549     }
4550
4551   translation_table = coding->translation_table_for_encode;
4552   if (NILP (translation_table) && !NILP (Venable_character_translation))
4553     translation_table = Vstandard_translation_table_for_encode;
4554   if (CHAR_TABLE_P (translation_table))
4555     {
4556       int i;
4557       for (i = 0; i < 128; i++)
4558         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4559           break;
4560       if (i < 128)
4561         /* Some ASCII character should be tranlsated.  We give up
4562            shrinking.  */
4563         return;
4564     }
4565
4566   if (str)
4567     {
4568       begp_orig = begp = str + *beg;
4569       endp_orig = endp = str + *end;
4570     }
4571   else
4572     {
4573       begp_orig = begp = BYTE_POS_ADDR (*beg);
4574       endp_orig = endp = begp + *end - *beg;
4575     }
4576
4577   eol_conversion = (coding->eol_type == CODING_EOL_CR
4578                     || coding->eol_type == CODING_EOL_CRLF);
4579
4580   /* Here, we don't have to check coding->pre_write_conversion because
4581      the caller is expected to have handled it already.  */
4582   switch (coding->type)
4583     {
4584     case coding_type_iso2022:
4585       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4586         /* We can't skip any data.  */
4587         break;
4588       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4589         {
4590           unsigned char *bol = begp;
4591           while (begp < endp && *begp < 0x80)
4592             {
4593               begp++;
4594               if (begp[-1] == '\n')
4595                 bol = begp;
4596             }
4597           begp = bol;
4598           goto label_skip_tail;
4599         }
4600       /* fall down ... */
4601
4602     case coding_type_sjis:
4603     case coding_type_big5:
4604       /* We can skip all ASCII characters at the head and tail.  */
4605       if (eol_conversion)
4606         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4607       else
4608         while (begp < endp && *begp < 0x80) begp++;
4609     label_skip_tail:
4610       if (eol_conversion)
4611         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4612       else
4613         while (begp < endp && *(endp - 1) < 0x80) endp--;
4614       break;
4615
4616     default:
4617       abort ();
4618     }
4619
4620   *beg += begp - begp_orig;
4621   *end += endp - endp_orig;
4622   return;
4623 }
4624
/* Shrinking the conversion region has a cost of its own, so it is
   attempted only when the region is longer than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* If the span *BEG .. *END is longer than the threshold above, try to
   narrow it: with shrink_encoding_region when ENCODEP is nonzero,
   with shrink_decoding_region otherwise.  BEG and END are pointers to
   ints that the shrinking functions update in place.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (!(encodep))                                                 \
          shrink_decoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_encoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
4638
4639 static Lisp_Object
4640 code_convert_region_unwind (dummy)
4641      Lisp_Object dummy;
4642 {
4643   inhibit_pre_post_conversion = 0;
4644   return Qnil;
4645 }
4646
4647 /* Store information about all compositions in the range FROM and TO
4648    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
4649    buffer or a string, defaults to the current buffer.  */
4650
4651 void
4652 coding_save_composition (coding, from, to, obj)
4653      struct coding_system *coding;
4654      int from, to;
4655      Lisp_Object obj;
4656 {
4657   Lisp_Object prop;
4658   int start, end;
4659
4660   if (coding->composing == COMPOSITION_DISABLED)
4661     return;
4662   if (!coding->cmp_data)
4663     coding_allocate_composition_data (coding, from);
4664   if (!find_composition (from, to, &start, &end, &prop, obj)
4665       || end > to)
4666     return;
4667   if (start < from
4668       && (!find_composition (end, to, &start, &end, &prop, obj)
4669           || end > to))
4670     return;
4671   coding->composing = COMPOSITION_NO;
4672   do
4673     {
4674       if (COMPOSITION_VALID_P (start, end, prop))
4675         {
4676           enum composition_method method = COMPOSITION_METHOD (prop);
4677           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4678               >= COMPOSITION_DATA_SIZE)
4679             coding_allocate_composition_data (coding, from);
4680           /* For relative composition, we remember start and end
4681              positions, for the other compositions, we also remember
4682              components.  */
4683           CODING_ADD_COMPOSITION_START (coding, start - from, method);
4684           if (method != COMPOSITION_RELATIVE)
4685             {
4686               /* We must store a*/
4687               Lisp_Object val, ch;
4688
4689               val = COMPOSITION_COMPONENTS (prop);
4690               if (CONSP (val))
4691                 while (CONSP (val))
4692                   {
4693                     ch = XCAR (val), val = XCDR (val);
4694                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4695                   }
4696               else if (VECTORP (val) || STRINGP (val))
4697                 {
4698                   int len = (VECTORP (val)
4699                              ? XVECTOR (val)->size : XSTRING (val)->size);
4700                   int i;
4701                   for (i = 0; i < len; i++)
4702                     {
4703                       ch = (STRINGP (val)
4704                             ? Faref (val, make_number (i))
4705                             : XVECTOR (val)->contents[i]);
4706                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4707                     }
4708                 }
4709               else              /* INTEGERP (val) */
4710                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4711             }
4712           CODING_ADD_COMPOSITION_END (coding, end - from);
4713         }
4714       start = end;
4715     }
4716   while (start < to
4717          && find_composition (start, to, &start, &end, &prop, obj)
4718          && end <= to);
4719
4720   /* Make coding->cmp_data point to the first memory block.  */
4721   while (coding->cmp_data->prev)
4722     coding->cmp_data = coding->cmp_data->prev;
4723   coding->cmp_data_start = 0;
4724 }
4725
4726 /* Reflect the saved information about compositions to OBJ.
4727    CODING->cmp_data points to a memory block for the informaiton.  OBJ
4728    is a buffer or a string, defaults to the current buffer.  */
4729
4730 void
4731 coding_restore_composition (coding, obj)
4732      struct coding_system *coding;
4733      Lisp_Object obj;
4734 {
4735   struct composition_data *cmp_data = coding->cmp_data;
4736
4737   if (!cmp_data)
4738     return;
4739
4740   while (cmp_data->prev)
4741     cmp_data = cmp_data->prev;
4742
4743   while (cmp_data)
4744     {
4745       int i;
4746
4747       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
4748            i += cmp_data->data[i])
4749         {
4750           int *data = cmp_data->data + i;
4751           enum composition_method method = (enum composition_method) data[3];
4752           Lisp_Object components;
4753
4754           if (method == COMPOSITION_RELATIVE)
4755             components = Qnil;
4756           else
4757             {
4758               int len = data[0] - 4, j;
4759               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4760
4761               for (j = 0; j < len; j++)
4762                 args[j] = make_number (data[4 + j]);
4763               components = (method == COMPOSITION_WITH_ALTCHARS
4764                             ? Fstring (len, args) : Fvector (len, args));
4765             }
4766           compose_text (data[1], data[2], components, Qnil, obj);
4767         }
4768       cmp_data = cmp_data->next;
4769     }
4770 }
4771
4772 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4773    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4774    coding system CODING, and return the status code of code conversion
4775    (currently, this value has no meaning).
4776
4777    How many characters (and bytes) are converted to how many
4778    characters (and bytes) are recorded in members of the structure
4779    CODING.
4780
4781    If REPLACE is nonzero, we do various things as if the original text
4782    is deleted and a new text is inserted.  See the comments in
4783    replace_range (insdel.c) to know what we are doing.
4784
4785    If REPLACE is zero, it is assumed that the source text is unibyte.
4786    Otherwize, it is assumed that the source text is multibyte.  */
4787
4788 int
4789 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4790      int from, from_byte, to, to_byte, encodep, replace;
4791      struct coding_system *coding;
4792 {
4793   int len = to - from, len_byte = to_byte - from_byte;
4794   int require, inserted, inserted_byte;
4795   int head_skip, tail_skip, total_skip = 0;
4796   Lisp_Object saved_coding_symbol;
4797   int first = 1;
4798   unsigned char *src, *dst;
4799   Lisp_Object deletion;
4800   int orig_point = PT, orig_len = len;
4801   int prev_Z;
4802   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4803
4804   coding->src_multibyte = replace && multibyte_p;
4805   coding->dst_multibyte = multibyte_p;
4806
4807   deletion = Qnil;
4808   saved_coding_symbol = Qnil;
4809
4810   if (from < PT && PT < to)
4811     {
4812       TEMP_SET_PT_BOTH (from, from_byte);
4813       orig_point = from;
4814     }
4815
4816   if (replace)
4817     {
4818       int saved_from = from;
4819       int saved_inhibit_modification_hooks;
4820
4821       prepare_to_modify_buffer (from, to, &from);
4822       if (saved_from != from)
4823         {
4824           to = from + len;
4825           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4826           len_byte = to_byte - from_byte;
4827         }
4828
4829       /* The code conversion routine can not preserve text properties
4830          for now.  So, we must remove all text properties in the
4831          region.  Here, we must suppress all modification hooks.  */
4832       saved_inhibit_modification_hooks = inhibit_modification_hooks;
4833       inhibit_modification_hooks = 1;
4834       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4835       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4836     }
4837
4838   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4839     {
4840       /* We must detect encoding of text and eol format.  */
4841
4842       if (from < GPT && to > GPT)
4843         move_gap_both (from, from_byte);
4844       if (coding->type == coding_type_undecided)
4845         {
4846           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4847           if (coding->type == coding_type_undecided)
4848             {
4849               /* It seems that the text contains only ASCII, but we
4850                  should not leave it undecided because the deeper
4851                  decoding routine (decode_coding) tries to detect the
4852                  encodings again in vain.  */
4853               coding->type = coding_type_emacs_mule;
4854               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
4855             }
4856         }
4857       if (coding->eol_type == CODING_EOL_UNDECIDED
4858           && coding->type != coding_type_ccl)
4859         {
4860           saved_coding_symbol = coding->symbol;
4861           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4862           if (coding->eol_type == CODING_EOL_UNDECIDED)
4863             coding->eol_type = CODING_EOL_LF;
4864           /* We had better recover the original eol format if we
4865              encounter an inconsitent eol format while decoding.  */
4866           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4867         }
4868     }
4869
4870   /* Now we convert the text.  */
4871
4872   /* For encoding, we must process pre-write-conversion in advance.  */
4873   if (! inhibit_pre_post_conversion
4874       && encodep
4875       && SYMBOLP (coding->pre_write_conversion)
4876       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4877     {
4878       /* The function in pre-write-conversion may put a new text in a
4879          new buffer.  */
4880       struct buffer *prev = current_buffer;
4881       Lisp_Object new;
4882       int count = specpdl_ptr - specpdl;
4883
4884       record_unwind_protect (code_convert_region_unwind, Qnil);
4885       /* We should not call any more pre-write/post-read-conversion
4886          functions while this pre-write-conversion is running.  */
4887       inhibit_pre_post_conversion = 1;
4888       call2 (coding->pre_write_conversion,
4889              make_number (from), make_number (to));
4890       inhibit_pre_post_conversion = 0;
4891       /* Discard the unwind protect.  */
4892       specpdl_ptr--;
4893
4894       if (current_buffer != prev)
4895         {
4896           len = ZV - BEGV;
4897           new = Fcurrent_buffer ();
4898           set_buffer_internal_1 (prev);
4899           del_range_2 (from, from_byte, to, to_byte, 0);
4900           TEMP_SET_PT_BOTH (from, from_byte);
4901           insert_from_buffer (XBUFFER (new), 1, len, 0);
4902           Fkill_buffer (new);
4903           if (orig_point >= to)
4904             orig_point += len - orig_len;
4905           else if (orig_point > from)
4906             orig_point = from;
4907           orig_len = len;
4908           to = from + len;
4909           from_byte = CHAR_TO_BYTE (from);
4910           to_byte = CHAR_TO_BYTE (to);
4911           len_byte = to_byte - from_byte;
4912           TEMP_SET_PT_BOTH (from, from_byte);
4913         }
4914     }
4915
4916   if (replace)
4917     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4918
4919   if (coding->composing != COMPOSITION_DISABLED)
4920     {
4921       if (encodep)
4922         coding_save_composition (coding, from, to, Fcurrent_buffer ());
4923       else
4924         coding_allocate_composition_data (coding, from);
4925     }
4926
4927   /* Try to skip the heading and tailing ASCIIs.  */
4928   if (coding->type != coding_type_ccl)
4929     {
4930       int from_byte_orig = from_byte, to_byte_orig = to_byte;
4931
4932       if (from < GPT && GPT < to)
4933         move_gap_both (from, from_byte);
4934       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4935       if (from_byte == to_byte
4936           && (encodep || NILP (coding->post_read_conversion))
4937           && ! CODING_REQUIRE_FLUSHING (coding))
4938         {
4939           coding->produced = len_byte;
4940           coding->produced_char = len;
4941           if (!replace)
4942             /* We must record and adjust for this new text now.  */
4943             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4944           return 0;
4945         }
4946
4947       head_skip = from_byte - from_byte_orig;
4948       tail_skip = to_byte_orig - to_byte;
4949       total_skip = head_skip + tail_skip;
4950       from += head_skip;
4951       to -= tail_skip;
4952       len -= total_skip; len_byte -= total_skip;
4953     }
4954
4955   /* For converion, we must put the gap before the text in addition to
4956      making the gap larger for efficient decoding.  The required gap
4957      size starts from 2000 which is the magic number used in make_gap.
4958      But, after one batch of conversion, it will be incremented if we
4959      find that it is not enough .  */
4960   require = 2000;
4961
4962   if (GAP_SIZE  < require)
4963     make_gap (require - GAP_SIZE);
4964   move_gap_both (from, from_byte);
4965
4966   inserted = inserted_byte = 0;
4967
4968   GAP_SIZE += len_byte;
4969   ZV -= len;
4970   Z -= len;
4971   ZV_BYTE -= len_byte;
4972   Z_BYTE -= len_byte;
4973
4974   if (GPT - BEG < BEG_UNCHANGED)
4975     BEG_UNCHANGED = GPT - BEG;
4976   if (Z - GPT < END_UNCHANGED)
4977     END_UNCHANGED = Z - GPT;
4978
4979   if (!encodep && coding->src_multibyte)
4980     {
4981       /* Decoding routines expects that the source text is unibyte.
4982          We must convert 8-bit characters of multibyte form to
4983          unibyte.  */
4984       int len_byte_orig = len_byte;
4985       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4986       if (len_byte < len_byte_orig)
4987         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4988                     len_byte);
4989       coding->src_multibyte = 0;
4990     }
4991
4992   for (;;)
4993     {
4994       int result;
4995
4996       /* The buffer memory is now:
4997          +--------+converted-text+---------+-------original-text-------+---+
4998          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4999                   |<---------------------- GAP ----------------------->|  */
5000       src = GAP_END_ADDR - len_byte;
5001       dst = GPT_ADDR + inserted_byte;
5002
5003       if (encodep)
5004         result = encode_coding (coding, src, dst, len_byte, 0);
5005       else
5006         result = decode_coding (coding, src, dst, len_byte, 0);
5007
5008       /* The buffer memory is now:
5009          +--------+-------converted-text----+--+------original-text----+---+
5010          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5011                   |<---------------------- GAP ----------------------->|  */
5012
5013       inserted += coding->produced_char;
5014       inserted_byte += coding->produced;
5015       len_byte -= coding->consumed;
5016
5017       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5018         {
5019           coding_allocate_composition_data (coding, from + inserted);
5020           continue;
5021         }
5022
5023       src += coding->consumed;
5024       dst += coding->produced;
5025
5026       if (result == CODING_FINISH_NORMAL)
5027         {
5028           src += len_byte;
5029           break;
5030         }
5031       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5032         {
5033           unsigned char *pend = dst, *p = pend - inserted_byte;
5034           Lisp_Object eol_type;
5035
5036           /* Encode LFs back to the original eol format (CR or CRLF).  */
5037           if (coding->eol_type == CODING_EOL_CR)
5038             {
5039               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5040             }
5041           else
5042             {
5043               int count = 0;
5044
5045               while (p < pend) if (*p++ == '\n') count++;
5046               if (src - dst < count)
5047                 {
5048                   /* We don't have sufficient room for encoding LFs
5049                      back to CRLF.  We must record converted and
5050                      not-yet-converted text back to the buffer
5051                      content, enlarge the gap, then record them out of
5052                      the buffer contents again.  */
5053                   int add = len_byte + inserted_byte;
5054
5055                   GAP_SIZE -= add;
5056                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5057                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5058                   make_gap (count - GAP_SIZE);
5059                   GAP_SIZE += add;
5060                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5061                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5062                   /* Don't forget to update SRC, DST, and PEND.  */
5063                   src = GAP_END_ADDR - len_byte;
5064                   dst = GPT_ADDR + inserted_byte;
5065                   pend = dst;
5066                 }
5067               inserted += count;
5068               inserted_byte += count;
5069               coding->produced += count;
5070               p = dst = pend + count;
5071               while (count)
5072                 {
5073                   *--p = *--pend;
5074                   if (*p == '\n') count--, *--p = '\r';
5075                 }
5076             }
5077
5078           /* Suppress eol-format conversion in the further conversion.  */
5079           coding->eol_type = CODING_EOL_LF;
5080
5081           /* Set the coding system symbol to that for Unix-like EOL.  */
5082           eol_type = Fget (saved_coding_symbol, Qeol_type);
5083           if (VECTORP (eol_type)
5084               && XVECTOR (eol_type)->size == 3
5085               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5086             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5087           else
5088             coding->symbol = saved_coding_symbol;
5089
5090           continue;
5091         }
5092       if (len_byte <= 0)
5093         {
5094           if (coding->type != coding_type_ccl
5095               || coding->mode & CODING_MODE_LAST_BLOCK)
5096             break;
5097           coding->mode |= CODING_MODE_LAST_BLOCK;
5098           continue;
5099         }
5100       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5101         {
5102           /* The source text ends in invalid codes.  Let's just
5103              make them valid buffer contents, and finish conversion.  */
5104           inserted += len_byte;
5105           inserted_byte += len_byte;
5106           while (len_byte--)
5107             *dst++ = *src++;
5108           break;
5109         }
5110       if (result == CODING_FINISH_INTERRUPT)
5111         {
5112           /* The conversion procedure was interrupted by a user.  */
5113           break;
5114         }
5115       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5116       if (coding->consumed < 1)
5117         {
5118           /* It's quite strange to require more memory without
5119              consuming any bytes.  Perhaps CCL program bug.  */
5120           break;
5121         }
5122       if (first)
5123         {
5124           /* We have just done the first batch of conversion which was
5125              stoped because of insufficient gap.  Let's reconsider the
5126              required gap size (i.e. SRT - DST) now.
5127
5128              We have converted ORIG bytes (== coding->consumed) into
5129              NEW bytes (coding->produced).  To convert the remaining
5130              LEN bytes, we may need REQUIRE bytes of gap, where:
5131                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5132                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5133              Here, we are sure that NEW >= ORIG.  */
5134           float ratio = coding->produced - coding->consumed;
5135           ratio /= coding->consumed;
5136           require = len_byte * ratio;
5137           first = 0;
5138         }
5139       if ((src - dst) < (require + 2000))
5140         {
5141           /* See the comment above the previous call of make_gap.  */
5142           int add = len_byte + inserted_byte;
5143
5144           GAP_SIZE -= add;
5145           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5146           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5147           make_gap (require + 2000);
5148           GAP_SIZE += add;
5149           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5150           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5151         }
5152     }
5153   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5154
5155   if (encodep && coding->dst_multibyte)
5156     {
5157       /* The output is unibyte.  We must convert 8-bit characters to
5158          multibyte form.  */
5159       if (inserted_byte * 2 > GAP_SIZE)
5160         {
5161           GAP_SIZE -= inserted_byte;
5162           ZV += inserted_byte; Z += inserted_byte;
5163           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5164           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5165           make_gap (inserted_byte - GAP_SIZE);
5166           GAP_SIZE += inserted_byte;
5167           ZV -= inserted_byte; Z -= inserted_byte;
5168           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5169           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5170         }
5171       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5172     }
5173
5174   /* If we have shrinked the conversion area, adjust it now.  */
5175   if (total_skip > 0)
5176     {
5177       if (tail_skip > 0)
5178         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5179       inserted += total_skip; inserted_byte += total_skip;
5180       GAP_SIZE += total_skip;
5181       GPT -= head_skip; GPT_BYTE -= head_skip;
5182       ZV -= total_skip; ZV_BYTE -= total_skip;
5183       Z -= total_skip; Z_BYTE -= total_skip;
5184       from -= head_skip; from_byte -= head_skip;
5185       to += tail_skip; to_byte += tail_skip;
5186     }
5187
5188   prev_Z = Z;
5189   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5190   inserted = Z - prev_Z;
5191
5192   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5193     coding_restore_composition (coding, Fcurrent_buffer ());
5194   coding_free_composition_data (coding);
5195
5196   if (! inhibit_pre_post_conversion
5197       && ! encodep && ! NILP (coding->post_read_conversion))
5198     {
5199       Lisp_Object val;
5200       int count = specpdl_ptr - specpdl;
5201
5202       if (from != PT)
5203         TEMP_SET_PT_BOTH (from, from_byte);
5204       prev_Z = Z;
5205       record_unwind_protect (code_convert_region_unwind, Qnil);
5206       /* We should not call any more pre-write/post-read-conversion
5207          functions while this post-read-conversion is running.  */
5208       inhibit_pre_post_conversion = 1;
5209       val = call1 (coding->post_read_conversion, make_number (inserted));
5210       inhibit_pre_post_conversion = 0;
5211       /* Discard the unwind protect.  */
5212       specpdl_ptr--;
5213       CHECK_NUMBER (val, 0);
5214       inserted += Z - prev_Z;
5215     }
5216
5217   if (orig_point >= from)
5218     {
5219       if (orig_point >= from + orig_len)
5220         orig_point += inserted - orig_len;
5221       else
5222         orig_point = from;
5223       TEMP_SET_PT (orig_point);
5224     }
5225
5226   if (replace)
5227     {
5228       signal_after_change (from, to - from, inserted);
5229       update_compositions (from, from + inserted, CHECK_BORDER);
5230     }
5231
5232   {
5233     coding->consumed = to_byte - from_byte;
5234     coding->consumed_char = to - from;
5235     coding->produced = inserted_byte;
5236     coding->produced_char = inserted;
5237   }
5238
5239   return 0;
5240 }
5241
5242 Lisp_Object
5243 run_pre_post_conversion_on_str (str, coding, encodep)
5244      Lisp_Object str;
5245      struct coding_system *coding;
5246      int encodep;
5247 {
5248   int count = specpdl_ptr - specpdl;
5249   struct gcpro gcpro1;
5250   struct buffer *prev = current_buffer;
5251   int multibyte = STRING_MULTIBYTE (str);
5252
5253   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5254   record_unwind_protect (code_convert_region_unwind, Qnil);
5255   GCPRO1 (str);
5256   temp_output_buffer_setup (" *code-converting-work*");
5257   set_buffer_internal (XBUFFER (Vstandard_output));
5258   /* We must insert the contents of STR as is without
5259      unibyte<->multibyte conversion.  For that, we adjust the
5260      multibyteness of the working buffer to that of STR.  */
5261   Ferase_buffer ();
5262   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5263   insert_from_string (str, 0, 0,
5264                       XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5265   UNGCPRO;
5266   inhibit_pre_post_conversion = 1;
5267   if (encodep)
5268     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5269   else
5270     {
5271       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5272       call1 (coding->post_read_conversion, make_number (Z - BEG));
5273     }
5274   inhibit_pre_post_conversion = 0;
5275   str = make_buffer_string (BEG, Z, 1);
5276   return unbind_to (count, str);
5277 }
5278
5279 Lisp_Object
5280 decode_coding_string (str, coding, nocopy)
5281      Lisp_Object str;
5282      struct coding_system *coding;
5283      int nocopy;
5284 {
5285   int len;
5286   struct conversion_buffer buf;
5287   int from, to, to_byte;
5288   struct gcpro gcpro1;
5289   Lisp_Object saved_coding_symbol;
5290   int result;
5291   int require_decoding;
5292   int shrinked_bytes = 0;
5293   Lisp_Object newstr;
5294   int consumed, consumed_char, produced, produced_char;
5295
5296   from = 0;
5297   to = XSTRING (str)->size;
5298   to_byte = STRING_BYTES (XSTRING (str));
5299
5300   saved_coding_symbol = Qnil;
5301   if (CODING_REQUIRE_DETECTION (coding))
5302     {
5303       /* See the comments in code_convert_region.  */
5304       if (coding->type == coding_type_undecided)
5305         {
5306           detect_coding (coding, XSTRING (str)->data, to_byte);
5307           if (coding->type == coding_type_undecided)
5308             coding->type = coding_type_emacs_mule;
5309         }
5310       if (coding->eol_type == CODING_EOL_UNDECIDED
5311           && coding->type != coding_type_ccl)
5312         {
5313           saved_coding_symbol = coding->symbol;
5314           detect_eol (coding, XSTRING (str)->data, to_byte);
5315           if (coding->eol_type == CODING_EOL_UNDECIDED)
5316             coding->eol_type = CODING_EOL_LF;
5317           /* We had better recover the original eol format if we
5318              encounter an inconsistent eol format while decoding.  */
5319           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5320         }
5321     }
5322
5323   coding->src_multibyte = 0;
5324   coding->dst_multibyte = (coding->type != coding_type_no_conversion
5325                            && coding->type != coding_type_raw_text);
5326   require_decoding = CODING_REQUIRE_DECODING (coding);
5327
5328   if (STRING_MULTIBYTE (str))
5329     {
5330       /* Decoding routines expect the source text to be unibyte.  */
5331       str = Fstring_as_unibyte (str);
5332       to_byte = STRING_BYTES (XSTRING (str));
5333       nocopy = 1;
5334     }
5335
5336   /* Try to skip the heading and tailing ASCIIs.  */
5337   if (require_decoding && coding->type != coding_type_ccl)
5338     {
5339       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5340                                 0);
5341       if (from == to_byte)
5342         require_decoding = 0;
5343       shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte);
5344     }
5345
5346   if (!require_decoding)
5347     {
5348       coding->consumed = STRING_BYTES (XSTRING (str));
5349       coding->consumed_char = XSTRING (str)->size;
5350       if (coding->dst_multibyte)
5351         {
5352           str = Fstring_as_multibyte (str);
5353           nocopy = 1;
5354         }
5355       coding->produced = STRING_BYTES (XSTRING (str));
5356       coding->produced_char = XSTRING (str)->size;
5357       return (nocopy ? str : Fcopy_sequence (str));
5358     }
5359
5360   if (coding->composing != COMPOSITION_DISABLED)
5361     coding_allocate_composition_data (coding, from);
5362   len = decoding_buffer_size (coding, to_byte - from);
5363   allocate_conversion_buffer (buf, len);
5364
5365   consumed = consumed_char = produced = produced_char = 0;
5366   while (1)
5367     {
5368       result = decode_coding (coding, XSTRING (str)->data + from + consumed,
5369                               buf.data + produced, to_byte - from - consumed,
5370                               buf.size - produced);
5371       consumed += coding->consumed;
5372       consumed_char += coding->consumed_char;
5373       produced += coding->produced;
5374       produced_char += coding->produced_char;
5375       if (result == CODING_FINISH_NORMAL
5376           || (result == CODING_FINISH_INSUFFICIENT_SRC
5377               && coding->consumed == 0))
5378         break;
5379       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5380         coding_allocate_composition_data (coding, from + produced_char);
5381       else if (result == CODING_FINISH_INSUFFICIENT_DST)
5382         extend_conversion_buffer (&buf);
5383       else if (result == CODING_FINISH_INCONSISTENT_EOL)
5384         {
5385           /* Recover the original EOL format.  */
5386           if (coding->eol_type == CODING_EOL_CR)
5387             {
5388               unsigned char *p;
5389               for (p = buf.data; p < buf.data + produced; p++)
5390                 if (*p == '\n') *p = '\r';
5391             }
5392           else if (coding->eol_type == CODING_EOL_CRLF)
5393             {
5394               int num_eol = 0;
5395               unsigned char *p0, *p1;
5396               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
5397                 if (*p0 == '\n') num_eol++;
5398               if (produced + num_eol >= buf.size)
5399                 extend_conversion_buffer (&buf);
5400               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
5401                 {
5402                   *--p1 = *--p0;
5403                   if (*p0 == '\n') *--p1 = '\r';
5404                 }
5405               produced += num_eol;
5406               produced_char += num_eol;
5407             }
5408           coding->eol_type = CODING_EOL_LF;
5409           coding->symbol = saved_coding_symbol;
5410         }
5411     }
5412
5413   coding->consumed = consumed;
5414   coding->consumed_char = consumed_char;
5415   coding->produced = produced;
5416   coding->produced_char = produced_char;
5417
5418   if (coding->dst_multibyte)
5419     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
5420                                            produced + shrinked_bytes);
5421   else
5422     newstr = make_uninit_string (produced + shrinked_bytes);
5423   if (from > 0)
5424     bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from);
5425   bcopy (buf.data, XSTRING (newstr)->data + from, produced);
5426   if (shrinked_bytes > from)
5427     bcopy (XSTRING (str)->data + to_byte,
5428            XSTRING (newstr)->data + from + produced,
5429            shrinked_bytes - from);
5430   free_conversion_buffer (&buf);
5431
5432   if (coding->cmp_data && coding->cmp_data->used)
5433     coding_restore_composition (coding, newstr);
5434   coding_free_composition_data (coding);
5435
5436   if (SYMBOLP (coding->post_read_conversion)
5437       && !NILP (Ffboundp (coding->post_read_conversion)))
5438     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
5439
5440   return newstr;
5441 }
5442
5443 Lisp_Object
5444 encode_coding_string (str, coding, nocopy)
5445      Lisp_Object str;
5446      struct coding_system *coding;
5447      int nocopy;
5448 {
5449   int len;
5450   struct conversion_buffer buf;
5451   int from, to, to_byte;
5452   struct gcpro gcpro1;
5453   Lisp_Object saved_coding_symbol;
5454   int result;
5455   int shrinked_bytes = 0;
5456   Lisp_Object newstr;
5457   int consumed, consumed_char, produced, produced_char;
5458
5459   if (SYMBOLP (coding->pre_write_conversion)
5460       && !NILP (Ffboundp (coding->pre_write_conversion)))
5461     str = run_pre_post_conversion_on_str (str, coding, 1);
5462
5463   from = 0;
5464   to = XSTRING (str)->size;
5465   to_byte = STRING_BYTES (XSTRING (str));
5466
5467   saved_coding_symbol = Qnil;
5468
5469   /* Encoding routines determine the multibyteness of the source text
5470      by coding->src_multibyte.  */
5471   coding->src_multibyte = STRING_MULTIBYTE (str);
5472   coding->dst_multibyte = 0;
5473   if (! CODING_REQUIRE_ENCODING (coding))
5474     {
5475       coding->consumed = STRING_BYTES (XSTRING (str));
5476       coding->consumed_char = XSTRING (str)->size;
5477       if (STRING_MULTIBYTE (str))
5478         {
5479           str = Fstring_as_unibyte (str);
5480           nocopy = 1;
5481         }
5482       coding->produced = STRING_BYTES (XSTRING (str));
5483       coding->produced_char = XSTRING (str)->size;
5484       return (nocopy ? str : Fcopy_sequence (str));
5485     }
5486
5487   if (coding->composing != COMPOSITION_DISABLED)
5488     coding_save_composition (coding, from, to, str);
5489
5490   /* Try to skip the heading and tailing ASCIIs.  */
5491   if (coding->type != coding_type_ccl)
5492     {
5493       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5494                                 1);
5495       if (from == to_byte)
5496         return (nocopy ? str : Fcopy_sequence (str));
5497       shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte);
5498     }
5499
5500   len = encoding_buffer_size (coding, to_byte - from);
5501   allocate_conversion_buffer (buf, len);
5502
5503   consumed = consumed_char = produced = produced_char = 0;
5504   while (1)
5505     {
5506       result = encode_coding (coding, XSTRING (str)->data + from + consumed,
5507                               buf.data + produced, to_byte - from - consumed,
5508                               buf.size - produced);
5509       consumed += coding->consumed;
5510       consumed_char += coding->consumed_char;
5511       produced += coding->produced;
5512       produced_char += coding->produced_char;
5513       if (result == CODING_FINISH_NORMAL
5514           || (result == CODING_FINISH_INSUFFICIENT_SRC
5515               && coding->consumed == 0))
5516         break;
5517       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
5518       extend_conversion_buffer (&buf);
5519     }
5520
5521   coding->consumed = consumed;
5522   coding->consumed_char = consumed_char;
5523   coding->produced = produced;
5524   coding->produced_char = produced_char;
5525
5526   newstr = make_uninit_string (produced + shrinked_bytes);
5527   if (from > 0)
5528     bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from);
5529   bcopy (buf.data, XSTRING (newstr)->data + from, produced);
5530   if (shrinked_bytes > from)
5531     bcopy (XSTRING (str)->data + to_byte,
5532            XSTRING (newstr)->data + from + produced,
5533            shrinked_bytes - from);
5534
5535   free_conversion_buffer (&buf);
5536   coding_free_composition_data (coding);
5537
5538   return newstr;
5539 }
5540
5541 \f
5542 #ifdef emacs
5543 /*** 8. Emacs Lisp library functions ***/
5544
5545 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5546   "Return t if OBJECT is nil or a coding-system.\n\
5547 See the documentation of `make-coding-system' for information\n\
5548 about coding-system objects.")
5549   (obj)
5550      Lisp_Object obj;
5551 {
5552   if (NILP (obj))
5553     return Qt;
5554   if (!SYMBOLP (obj))
5555     return Qnil;
5556   /* Get coding-spec vector for OBJ.  */
5557   obj = Fget (obj, Qcoding_system);
5558   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5559           ? Qt : Qnil);
5560 }
5561
5562 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5563        Sread_non_nil_coding_system, 1, 1, 0,
5564   "Read a coding system from the minibuffer, prompting with string PROMPT.")
5565   (prompt)
5566      Lisp_Object prompt;
5567 {
5568   Lisp_Object val;
5569   do
5570     {
5571       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5572                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5573     }
5574   while (XSTRING (val)->size == 0);
5575   return (Fintern (val, Qnil));
5576 }
5577
5578 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5579   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5580 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5581   (prompt, default_coding_system)
5582      Lisp_Object prompt, default_coding_system;
5583 {
5584   Lisp_Object val;
5585   if (SYMBOLP (default_coding_system))
5586     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5587   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5588                           Qt, Qnil, Qcoding_system_history,
5589                           default_coding_system, Qnil);
5590   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5591 }
5592
5593 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5594        1, 1, 0,
5595   "Check validity of CODING-SYSTEM.\n\
5596 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5597 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5598 The value of property should be a vector of length 5.")
5599   (coding_system)
5600      Lisp_Object coding_system;
5601 {
5602   CHECK_SYMBOL (coding_system, 0);
5603   if (!NILP (Fcoding_system_p (coding_system)))
5604     return coding_system;
5605   while (1)
5606     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5607 }
5608 \f
5609 Lisp_Object
5610 detect_coding_system (src, src_bytes, highest)
5611      unsigned char *src;
5612      int src_bytes, highest;
5613 {
5614   int coding_mask, eol_type;
5615   Lisp_Object val, tmp;
5616   int dummy;
5617
5618   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5619   eol_type  = detect_eol_type (src, src_bytes, &dummy);
5620   if (eol_type == CODING_EOL_INCONSISTENT)
5621     eol_type = CODING_EOL_UNDECIDED;
5622
5623   if (!coding_mask)
5624     {
5625       val = Qundecided;
5626       if (eol_type != CODING_EOL_UNDECIDED)
5627         {
5628           Lisp_Object val2;
5629           val2 = Fget (Qundecided, Qeol_type);
5630           if (VECTORP (val2))
5631             val = XVECTOR (val2)->contents[eol_type];
5632         }
5633       return (highest ? val : Fcons (val, Qnil));
5634     }
5635
5636   /* At first, gather possible coding systems in VAL.  */
5637   val = Qnil;
5638   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5639     {
5640       Lisp_Object category_val, category_index;
5641
5642       category_index = Fget (XCAR (tmp), Qcoding_category_index);
5643       category_val = Fsymbol_value (XCAR (tmp));
5644       if (!NILP (category_val)
5645           && NATNUMP (category_index)
5646           && (coding_mask & (1 << XFASTINT (category_index))))
5647         {
5648           val = Fcons (category_val, val);
5649           if (highest)
5650             break;
5651         }
5652     }
5653   if (!highest)
5654     val = Fnreverse (val);
5655
5656   /* Then, replace the elements with subsidiary coding systems.  */
5657   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5658     {
5659       if (eol_type != CODING_EOL_UNDECIDED
5660           && eol_type != CODING_EOL_INCONSISTENT)
5661         {
5662           Lisp_Object eol;
5663           eol = Fget (XCAR (tmp), Qeol_type);
5664           if (VECTORP (eol))
5665             XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5666         }
5667     }
5668   return (highest ? XCAR (val) : val);
5669 }
5670
5671 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5672        2, 3, 0,
5673   "Detect coding system of the text in the region between START and END.\n\
5674 Return a list of possible coding systems ordered by priority.\n\
5675 \n\
5676 If only ASCII characters are found, it returns a list of single element\n\
5677 `undecided' or its subsidiary coding system according to a detected\n\
5678 end-of-line format.\n\
5679 \n\
5680 If optional argument HIGHEST is non-nil, return the coding system of\n\
5681 highest priority.")
5682   (start, end, highest)
5683      Lisp_Object start, end, highest;
5684 {
5685   int from, to;
5686   int from_byte, to_byte;
5687
5688   CHECK_NUMBER_COERCE_MARKER (start, 0);
5689   CHECK_NUMBER_COERCE_MARKER (end, 1);
5690
5691   validate_region (&start, &end);
5692   from = XINT (start), to = XINT (end);
5693   from_byte = CHAR_TO_BYTE (from);
5694   to_byte = CHAR_TO_BYTE (to);
5695
5696   if (from < GPT && to >= GPT)
5697     move_gap_both (to, to_byte);
5698
5699   return detect_coding_system (BYTE_POS_ADDR (from_byte),
5700                                to_byte - from_byte,
5701                                !NILP (highest));
5702 }
5703
5704 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5705        1, 2, 0,
5706   "Detect coding system of the text in STRING.\n\
5707 Return a list of possible coding systems ordered by priority.\n\
5708 \n\
5709 If only ASCII characters are found, it returns a list of single element\n\
5710 `undecided' or its subsidiary coding system according to a detected\n\
5711 end-of-line format.\n\
5712 \n\
5713 If optional argument HIGHEST is non-nil, return the coding system of\n\
5714 highest priority.")
5715   (string, highest)
5716      Lisp_Object string, highest;
5717 {
5718   CHECK_STRING (string, 0);
5719
5720   return detect_coding_system (XSTRING (string)->data,
5721                                STRING_BYTES (XSTRING (string)),
5722                                !NILP (highest));
5723 }
5724
5725 /* Return an intersection of lists L1 and L2.  */
5726
5727 static Lisp_Object
5728 intersection (l1, l2)
5729      Lisp_Object l1, l2;
5730 {
5731   Lisp_Object val;
5732
5733   for (val = Qnil; CONSP (l1); l1 = XCDR (l1))
5734     {
5735       if (!NILP (Fmemq (XCAR (l1), l2)))
5736         val = Fcons (XCAR (l1), val);
5737     }
5738   return val;
5739 }
5740
5741
5742 /*  Subroutine for Fsafe_coding_systems_region_internal.
5743
5744     Return a list of coding systems that safely encode the multibyte
5745     text between P and PEND.  SAFE_CODINGS, if non-nil, is a list of
5746     possible coding systems.  If it is nil, it means that we have not
5747     yet found any coding systems.
5748
5749     WORK_TABLE is a copy of the char-table Vchar_coding_system_table.  An
5750     element of WORK_TABLE is set to t once the element is looked up.
5751
5752     If a non-ASCII single byte char is found, set
5753     *single_byte_char_found to 1.  */
5754
5755 static Lisp_Object
5756 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
5757      unsigned char *p, *pend;
5758      Lisp_Object safe_codings, work_table;
5759      int *single_byte_char_found;
5760 {
5761   int c, len, idx;
5762   Lisp_Object val;
5763
5764   while (p < pend)
5765     {
5766       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
5767       p += len;
5768       if (ASCII_BYTE_P (c))
5769         /* We can ignore ASCII characters here.  */
5770         continue;
5771       if (SINGLE_BYTE_CHAR_P (c))
5772         *single_byte_char_found = 1;
5773       if (NILP (safe_codings))
5774         continue;
5775       /* Check the safe coding systems for C.  */
5776       val = char_table_ref_and_index (work_table, c, &idx);
5777       if (EQ (val, Qt))
5778         /* This element was already checked.  Ignore it.  */
5779         continue;
5780       /* Remember that we checked this element.  */
5781       CHAR_TABLE_SET (work_table, make_number (idx), Qt);
5782
5783       /* If there are some safe coding systems for C and we have
5784          already found the other set of coding systems for the
5785          different characters, get the intersection of them.  */
5786       if (!EQ (safe_codings, Qt) && !NILP (val))
5787         val = intersection (safe_codings, val);
5788       safe_codings = val;
5789     }
5790   return safe_codings;
5791 }
5792
5793
5794 /* Return a list of coding systems that safely encode the text between
5795    START and END.  If the text contains only ASCII or is unibyte,
5796    return t.  */
5797
5798 DEFUN ("find-coding-systems-region-internal",
5799        Ffind_coding_systems_region_internal,
5800        Sfind_coding_systems_region_internal, 2, 2, 0,
5801   "Internal use only.")
5802   (start, end)
5803      Lisp_Object start, end;
5804 {
5805   Lisp_Object work_table, safe_codings;
5806   int non_ascii_p = 0;
5807   int single_byte_char_found = 0;
5808   unsigned char *p1, *p1end, *p2, *p2end, *p;
5809   Lisp_Object args[2];
5810
5811   if (STRINGP (start))
5812     {
5813       if (!STRING_MULTIBYTE (start))
5814         return Qt;
5815       p1 = XSTRING (start)->data, p1end = p1 + STRING_BYTES (XSTRING (start));
5816       p2 = p2end = p1end;
5817       if (XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
5818         non_ascii_p = 1;
5819     }
5820   else
5821     {
5822       int from, to, stop;
5823
5824       CHECK_NUMBER_COERCE_MARKER (start, 0);
5825       CHECK_NUMBER_COERCE_MARKER (end, 1);
5826       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
5827         args_out_of_range (start, end);
5828       if (NILP (current_buffer->enable_multibyte_characters))
5829         return Qt;
5830       from = CHAR_TO_BYTE (XINT (start));
5831       to = CHAR_TO_BYTE (XINT (end));
5832       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
5833       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
5834       if (stop == to)
5835         p2 = p2end = p1end;
5836       else
5837         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
5838       if (XINT (end) - XINT (start) != to - from)
5839         non_ascii_p = 1;
5840     }
5841
5842   if (!non_ascii_p)
5843     {
5844       /* We are sure that the text contains no multibyte character.
5845          Check if it contains eight-bit-graphic.  */
5846       p = p1;
5847       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
5848       if (p == p1end)
5849         {
5850           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
5851           if (p == p2end)
5852             return Qt;
5853         }
5854     }
5855
5856   /* The text contains non-ASCII characters.  */
5857   work_table = Fcopy_sequence (Vchar_coding_system_table);
5858   safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
5859                                     &single_byte_char_found);
5860   if (p2 < p2end)
5861     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
5862                                       &single_byte_char_found);
5863
5864   if (!single_byte_char_found)
5865     {
5866       /* Append generic coding systems.  */
5867       Lisp_Object args[2];
5868       args[0] = safe_codings;
5869       args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
5870                                         make_number (0));
5871       safe_codings = Fappend (2, args);
5872     }
5873   else
5874     safe_codings = Fcons (Qraw_text,
5875                           Fcons (Qemacs_mule,
5876                                  Fcons (Qno_conversion, safe_codings)));
5877   return safe_codings;
5878 }
5879
5880
5881 Lisp_Object
5882 code_convert_region1 (start, end, coding_system, encodep)
5883      Lisp_Object start, end, coding_system;
5884      int encodep;
5885 {
5886   struct coding_system coding;
5887   int from, to, len;
5888
5889   CHECK_NUMBER_COERCE_MARKER (start, 0);
5890   CHECK_NUMBER_COERCE_MARKER (end, 1);
5891   CHECK_SYMBOL (coding_system, 2);
5892
5893   validate_region (&start, &end);
5894   from = XFASTINT (start);
5895   to = XFASTINT (end);
5896
5897   if (NILP (coding_system))
5898     return make_number (to - from);
5899
5900   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5901     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5902
5903   coding.mode |= CODING_MODE_LAST_BLOCK;
5904   coding.src_multibyte = coding.dst_multibyte
5905     = !NILP (current_buffer->enable_multibyte_characters);
5906   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5907                        &coding, encodep, 1);
5908   Vlast_coding_system_used = coding.symbol;
5909   return make_number (coding.produced_char);
5910 }
5911
5912 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5913        3, 3, "r\nzCoding system: ",
5914   "Decode the current region by specified coding system.\n\
5915 When called from a program, takes three arguments:\n\
5916 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5917 This function sets `last-coding-system-used' to the precise coding system\n\
5918 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5919 not fully specified.)\n\
5920 It returns the length of the decoded text.")
5921   (start, end, coding_system)
5922      Lisp_Object start, end, coding_system;
5923 {
5924   return code_convert_region1 (start, end, coding_system, 0);
5925 }
5926
5927 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5928        3, 3, "r\nzCoding system: ",
5929   "Encode the current region by specified coding system.\n\
5930 When called from a program, takes three arguments:\n\
5931 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5932 This function sets `last-coding-system-used' to the precise coding system\n\
5933 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5934 not fully specified.)\n\
5935 It returns the length of the encoded text.")
5936   (start, end, coding_system)
5937      Lisp_Object start, end, coding_system;
5938 {
5939   return code_convert_region1 (start, end, coding_system, 1);
5940 }
5941
5942 Lisp_Object
5943 code_convert_string1 (string, coding_system, nocopy, encodep)
5944      Lisp_Object string, coding_system, nocopy;
5945      int encodep;
5946 {
5947   struct coding_system coding;
5948
5949   CHECK_STRING (string, 0);
5950   CHECK_SYMBOL (coding_system, 1);
5951
5952   if (NILP (coding_system))
5953     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5954
5955   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5956     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5957
5958   coding.mode |= CODING_MODE_LAST_BLOCK;
5959   string = (encodep
5960             ? encode_coding_string (string, &coding, !NILP (nocopy))
5961             : decode_coding_string (string, &coding, !NILP (nocopy)));
5962   Vlast_coding_system_used = coding.symbol;
5963
5964   return string;
5965 }
5966
5967 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5968        2, 3, 0,
5969   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5970 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5971 if the decoding operation is trivial.\n\
5972 This function sets `last-coding-system-used' to the precise coding system\n\
5973 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5974 not fully specified.)")
5975   (string, coding_system, nocopy)
5976      Lisp_Object string, coding_system, nocopy;
5977 {
5978   return code_convert_string1 (string, coding_system, nocopy, 0);
5979 }
5980
5981 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5982        2, 3, 0,
5983   "Encode STRING to CODING-SYSTEM, and return the result.\n\
5984 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5985 if the encoding operation is trivial.\n\
5986 This function sets `last-coding-system-used' to the precise coding system\n\
5987 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5988 not fully specified.)")
5989   (string, coding_system, nocopy)
5990      Lisp_Object string, coding_system, nocopy;
5991 {
5992   return code_convert_string1 (string, coding_system, nocopy, 1);
5993 }
5994
5995 /* Encode or decode STRING according to CODING_SYSTEM.
5996    Do not set Vlast_coding_system_used.
5997
5998    This function is called only from macros DECODE_FILE and
5999    ENCODE_FILE, thus we ignore character composition.  */
6000
6001 Lisp_Object
6002 code_convert_string_norecord (string, coding_system, encodep)
6003      Lisp_Object string, coding_system;
6004      int encodep;
6005 {
6006   struct coding_system coding;
6007
6008   CHECK_STRING (string, 0);
6009   CHECK_SYMBOL (coding_system, 1);
6010
6011   if (NILP (coding_system))
6012     return string;
6013
6014   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6015     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
6016
6017   coding.composing = COMPOSITION_DISABLED;
6018   coding.mode |= CODING_MODE_LAST_BLOCK;
6019   return (encodep
6020           ? encode_coding_string (string, &coding, 1)
6021           : decode_coding_string (string, &coding, 1));
6022 }
6023 \f
6024 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6025   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
6026 Return the corresponding character.")
6027   (code)
6028      Lisp_Object code;
6029 {
6030   unsigned char c1, c2, s1, s2;
6031   Lisp_Object val;
6032
6033   CHECK_NUMBER (code, 0);
6034   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
6035   if (s1 == 0)
6036     {
6037       if (s2 < 0x80)
6038         XSETFASTINT (val, s2);
6039       else if (s2 >= 0xA0 || s2 <= 0xDF)
6040         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
6041       else
6042         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6043     }
6044   else
6045     {
6046       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
6047           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
6048         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6049       DECODE_SJIS (s1, s2, c1, c2);
6050       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
6051     }
6052   return val;
6053 }
6054
6055 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6056   "Encode a Japanese character CHAR to shift_jis encoding.\n\
6057 Return the corresponding code in SJIS.")
6058   (ch)
6059      Lisp_Object ch;
6060 {
6061   int charset, c1, c2, s1, s2;
6062   Lisp_Object val;
6063
6064   CHECK_NUMBER (ch, 0);
6065   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6066   if (charset == CHARSET_ASCII)
6067     {
6068       val = ch;
6069     }
6070   else if (charset == charset_jisx0208
6071            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
6072     {
6073       ENCODE_SJIS (c1, c2, s1, s2);
6074       XSETFASTINT (val, (s1 << 8) | s2);
6075     }
6076   else if (charset == charset_katakana_jisx0201
6077            && c1 > 0x20 && c2 < 0xE0)
6078     {
6079       XSETFASTINT (val, c1 | 0x80);
6080     }
6081   else
6082     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
6083   return val;
6084 }
6085
6086 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6087   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
6088 Return the corresponding character.")
6089   (code)
6090      Lisp_Object code;
6091 {
6092   int charset;
6093   unsigned char b1, b2, c1, c2;
6094   Lisp_Object val;
6095
6096   CHECK_NUMBER (code, 0);
6097   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
6098   if (b1 == 0)
6099     {
6100       if (b2 >= 0x80)
6101         error ("Invalid BIG5 code: %x", XFASTINT (code));
6102       val = code;
6103     }
6104   else
6105     {
6106       if ((b1 < 0xA1 || b1 > 0xFE)
6107           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
6108         error ("Invalid BIG5 code: %x", XFASTINT (code));
6109       DECODE_BIG5 (b1, b2, charset, c1, c2);
6110       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
6111     }
6112   return val;
6113 }
6114
6115 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6116   "Encode the Big5 character CHAR to BIG5 coding system.\n\
6117 Return the corresponding character code in Big5.")
6118   (ch)
6119      Lisp_Object ch;
6120 {
6121   int charset, c1, c2, b1, b2;
6122   Lisp_Object val;
6123
6124   CHECK_NUMBER (ch, 0);
6125   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6126   if (charset == CHARSET_ASCII)
6127     {
6128       val = ch;
6129     }
6130   else if ((charset == charset_big5_1
6131             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
6132            || (charset == charset_big5_2
6133                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
6134     {
6135       ENCODE_BIG5 (charset, c1, c2, b1, b2);
6136       XSETFASTINT (val, (b1 << 8) | b2);
6137     }
6138   else
6139     error ("Can't encode to Big5: %d", XFASTINT (ch));
6140   return val;
6141 }
6142 \f
6143 DEFUN ("set-terminal-coding-system-internal",
6144        Fset_terminal_coding_system_internal,
6145        Sset_terminal_coding_system_internal, 1, 1, 0, "")
6146   (coding_system)
6147      Lisp_Object coding_system;
6148 {
6149   CHECK_SYMBOL (coding_system, 0);
6150   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6151   /* We had better not send unsafe characters to terminal.  */
6152   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
6153   /* Characer composition should be disabled.  */
6154   terminal_coding.composing = COMPOSITION_DISABLED;
6155   terminal_coding.src_multibyte = 1;
6156   terminal_coding.dst_multibyte = 0;
6157   return Qnil;
6158 }
6159
6160 DEFUN ("set-safe-terminal-coding-system-internal",
6161        Fset_safe_terminal_coding_system_internal,
6162        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
6163   (coding_system)
6164      Lisp_Object coding_system;
6165 {
6166   CHECK_SYMBOL (coding_system, 0);
6167   setup_coding_system (Fcheck_coding_system (coding_system),
6168                        &safe_terminal_coding);
6169   /* Characer composition should be disabled.  */
6170   safe_terminal_coding.composing = COMPOSITION_DISABLED;
6171   safe_terminal_coding.src_multibyte = 1;
6172   safe_terminal_coding.dst_multibyte = 0;
6173   return Qnil;
6174 }
6175
6176 DEFUN ("terminal-coding-system",
6177        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
6178   "Return coding system specified for terminal output.")
6179   ()
6180 {
6181   return terminal_coding.symbol;
6182 }
6183
6184 DEFUN ("set-keyboard-coding-system-internal",
6185        Fset_keyboard_coding_system_internal,
6186        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
6187   (coding_system)
6188      Lisp_Object coding_system;
6189 {
6190   CHECK_SYMBOL (coding_system, 0);
6191   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
6192   /* Characer composition should be disabled.  */
6193   keyboard_coding.composing = COMPOSITION_DISABLED;
6194   return Qnil;
6195 }
6196
6197 DEFUN ("keyboard-coding-system",
6198        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
6199   "Return coding system specified for decoding keyboard input.")
6200   ()
6201 {
6202   return keyboard_coding.symbol;
6203 }
6204
6205 \f
6206 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
6207        Sfind_operation_coding_system,  1, MANY, 0,
6208   "Choose a coding system for an operation based on the target name.\n\
6209 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
6210 DECODING-SYSTEM is the coding system to use for decoding\n\
6211 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
6212 for encoding (in case OPERATION does encoding).\n\
6213 \n\
6214 The first argument OPERATION specifies an I/O primitive:\n\
6215   For file I/O, `insert-file-contents' or `write-region'.\n\
6216   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
6217   For network I/O, `open-network-stream'.\n\
6218 \n\
6219 The remaining arguments should be the same arguments that were passed\n\
6220 to the primitive.  Depending on which primitive, one of those arguments\n\
6221 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
6222 whichever argument specifies the file name is TARGET.\n\
6223 \n\
6224 TARGET has a meaning which depends on OPERATION:\n\
6225   For file I/O, TARGET is a file name.\n\
6226   For process I/O, TARGET is a process name.\n\
6227   For network I/O, TARGET is a service name or a port number\n\
6228 \n\
6229 This function looks up what specified for TARGET in,\n\
6230 `file-coding-system-alist', `process-coding-system-alist',\n\
6231 or `network-coding-system-alist' depending on OPERATION.\n\
6232 They may specify a coding system, a cons of coding systems,\n\
6233 or a function symbol to call.\n\
6234 In the last case, we call the function with one argument,\n\
6235 which is a list of all the arguments given to this function.")
6236   (nargs, args)
6237      int nargs;
6238      Lisp_Object *args;
6239 {
6240   Lisp_Object operation, target_idx, target, val;
6241   register Lisp_Object chain;
6242
6243   if (nargs < 2)
6244     error ("Too few arguments");
6245   operation = args[0];
6246   if (!SYMBOLP (operation)
6247       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
6248     error ("Invalid first arguement");
6249   if (nargs < 1 + XINT (target_idx))
6250     error ("Too few arguments for operation: %s",
6251            XSYMBOL (operation)->name->data);
6252   target = args[XINT (target_idx) + 1];
6253   if (!(STRINGP (target)
6254         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
6255     error ("Invalid %dth argument", XINT (target_idx) + 1);
6256
6257   chain = ((EQ (operation, Qinsert_file_contents)
6258             || EQ (operation, Qwrite_region))
6259            ? Vfile_coding_system_alist
6260            : (EQ (operation, Qopen_network_stream)
6261               ? Vnetwork_coding_system_alist
6262               : Vprocess_coding_system_alist));
6263   if (NILP (chain))
6264     return Qnil;
6265
6266   for (; CONSP (chain); chain = XCDR (chain))
6267     {
6268       Lisp_Object elt;
6269       elt = XCAR (chain);
6270
6271       if (CONSP (elt)
6272           && ((STRINGP (target)
6273                && STRINGP (XCAR (elt))
6274                && fast_string_match (XCAR (elt), target) >= 0)
6275               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6276         {
6277           val = XCDR (elt);
6278           /* Here, if VAL is both a valid coding system and a valid
6279              function symbol, we return VAL as a coding system.  */
6280           if (CONSP (val))
6281             return val;
6282           if (! SYMBOLP (val))
6283             return Qnil;
6284           if (! NILP (Fcoding_system_p (val)))
6285             return Fcons (val, val);
6286           if (! NILP (Ffboundp (val)))
6287             {
6288               val = call1 (val, Flist (nargs, args));
6289               if (CONSP (val))
6290                 return val;
6291               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
6292                 return Fcons (val, val);
6293             }
6294           return Qnil;
6295         }
6296     }
6297   return Qnil;
6298 }
6299
6300 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
6301        Supdate_coding_systems_internal, 0, 0, 0,
6302   "Update internal database for ISO2022 and CCL based coding systems.\n\
6303 When values of any coding categories are changed, you must\n\
6304 call this function")
6305   ()
6306 {
6307   int i;
6308
6309   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
6310     {
6311       Lisp_Object val;
6312
6313       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
6314       if (!NILP (val))
6315         {
6316           if (! coding_system_table[i])
6317             coding_system_table[i] = ((struct coding_system *)
6318                                       xmalloc (sizeof (struct coding_system)));
6319           setup_coding_system (val, coding_system_table[i]);
6320         }
6321       else if (coding_system_table[i])
6322         {
6323           xfree (coding_system_table[i]);
6324           coding_system_table[i] = NULL;
6325         }
6326     }
6327
6328   return Qnil;
6329 }
6330
6331 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
6332        Sset_coding_priority_internal, 0, 0, 0,
6333   "Update internal database for the current value of `coding-category-list'.\n\
6334 This function is internal use only.")
6335   ()
6336 {
6337   int i = 0, idx;
6338   Lisp_Object val;
6339
6340   val = Vcoding_category_list;
6341
6342   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
6343     {
6344       if (! SYMBOLP (XCAR (val)))
6345         break;
6346       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
6347       if (idx >= CODING_CATEGORY_IDX_MAX)
6348         break;
6349       coding_priorities[i++] = (1 << idx);
6350       val = XCDR (val);
6351     }
6352   /* If coding-category-list is valid and contains all coding
6353      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
6354      the following code saves Emacs from crashing.  */
6355   while (i < CODING_CATEGORY_IDX_MAX)
6356     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
6357
6358   return Qnil;
6359 }
6360
6361 #endif /* emacs */
6362
6363 \f
6364 /*** 9. Post-amble ***/
6365
6366 void
6367 init_coding_once ()
6368 {
6369   int i;
6370
6371   /* Emacs' internal format specific initialize routine.  */
6372   for (i = 0; i <= 0x20; i++)
6373     emacs_code_class[i] = EMACS_control_code;
6374   emacs_code_class[0x0A] = EMACS_linefeed_code;
6375   emacs_code_class[0x0D] = EMACS_carriage_return_code;
6376   for (i = 0x21 ; i < 0x7F; i++)
6377     emacs_code_class[i] = EMACS_ascii_code;
6378   emacs_code_class[0x7F] = EMACS_control_code;
6379   for (i = 0x80; i < 0xFF; i++)
6380     emacs_code_class[i] = EMACS_invalid_code;
6381   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6382   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6383   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6384   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6385
6386   /* ISO2022 specific initialize routine.  */
6387   for (i = 0; i < 0x20; i++)
6388     iso_code_class[i] = ISO_control_0;
6389   for (i = 0x21; i < 0x7F; i++)
6390     iso_code_class[i] = ISO_graphic_plane_0;
6391   for (i = 0x80; i < 0xA0; i++)
6392     iso_code_class[i] = ISO_control_1;
6393   for (i = 0xA1; i < 0xFF; i++)
6394     iso_code_class[i] = ISO_graphic_plane_1;
6395   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6396   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6397   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6398   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6399   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6400   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6401   iso_code_class[ISO_CODE_ESC] = ISO_escape;
6402   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6403   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6404   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6405
6406   setup_coding_system (Qnil, &keyboard_coding);
6407   setup_coding_system (Qnil, &terminal_coding);
6408   setup_coding_system (Qnil, &safe_terminal_coding);
6409   setup_coding_system (Qnil, &default_buffer_file_coding);
6410
6411   bzero (coding_system_table, sizeof coding_system_table);
6412
6413   bzero (ascii_skip_code, sizeof ascii_skip_code);
6414   for (i = 0; i < 128; i++)
6415     ascii_skip_code[i] = 1;
6416
6417 #if defined (MSDOS) || defined (WINDOWSNT)
6418   system_eol_type = CODING_EOL_CRLF;
6419 #else
6420   system_eol_type = CODING_EOL_LF;
6421 #endif
6422
6423   inhibit_pre_post_conversion = 0;
6424 }
6425
6426 #ifdef emacs
6427
6428 void
6429 syms_of_coding ()
6430 {
6431   Qtarget_idx = intern ("target-idx");
6432   staticpro (&Qtarget_idx);
6433
6434   Qcoding_system_history = intern ("coding-system-history");
6435   staticpro (&Qcoding_system_history);
6436   Fset (Qcoding_system_history, Qnil);
6437
6438   /* Target FILENAME is the first argument.  */
6439   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6440   /* Target FILENAME is the third argument.  */
6441   Fput (Qwrite_region, Qtarget_idx, make_number (2));
6442
6443   Qcall_process = intern ("call-process");
6444   staticpro (&Qcall_process);
6445   /* Target PROGRAM is the first argument.  */
6446   Fput (Qcall_process, Qtarget_idx, make_number (0));
6447
6448   Qcall_process_region = intern ("call-process-region");
6449   staticpro (&Qcall_process_region);
6450   /* Target PROGRAM is the third argument.  */
6451   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6452
6453   Qstart_process = intern ("start-process");
6454   staticpro (&Qstart_process);
6455   /* Target PROGRAM is the third argument.  */
6456   Fput (Qstart_process, Qtarget_idx, make_number (2));
6457
6458   Qopen_network_stream = intern ("open-network-stream");
6459   staticpro (&Qopen_network_stream);
6460   /* Target SERVICE is the fourth argument.  */
6461   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6462
6463   Qcoding_system = intern ("coding-system");
6464   staticpro (&Qcoding_system);
6465
6466   Qeol_type = intern ("eol-type");
6467   staticpro (&Qeol_type);
6468
6469   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6470   staticpro (&Qbuffer_file_coding_system);
6471
6472   Qpost_read_conversion = intern ("post-read-conversion");
6473   staticpro (&Qpost_read_conversion);
6474
6475   Qpre_write_conversion = intern ("pre-write-conversion");
6476   staticpro (&Qpre_write_conversion);
6477
6478   Qno_conversion = intern ("no-conversion");
6479   staticpro (&Qno_conversion);
6480
6481   Qundecided = intern ("undecided");
6482   staticpro (&Qundecided);
6483
6484   Qcoding_system_p = intern ("coding-system-p");
6485   staticpro (&Qcoding_system_p);
6486
6487   Qcoding_system_error = intern ("coding-system-error");
6488   staticpro (&Qcoding_system_error);
6489
6490   Fput (Qcoding_system_error, Qerror_conditions,
6491         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6492   Fput (Qcoding_system_error, Qerror_message,
6493         build_string ("Invalid coding system"));
6494
6495   Qcoding_category = intern ("coding-category");
6496   staticpro (&Qcoding_category);
6497   Qcoding_category_index = intern ("coding-category-index");
6498   staticpro (&Qcoding_category_index);
6499
6500   Vcoding_category_table
6501     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6502   staticpro (&Vcoding_category_table);
6503   {
6504     int i;
6505     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6506       {
6507         XVECTOR (Vcoding_category_table)->contents[i]
6508           = intern (coding_category_name[i]);
6509         Fput (XVECTOR (Vcoding_category_table)->contents[i],
6510               Qcoding_category_index, make_number (i));
6511       }
6512   }
6513
6514   Qtranslation_table = intern ("translation-table");
6515   staticpro (&Qtranslation_table);
6516   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6517
6518   Qtranslation_table_id = intern ("translation-table-id");
6519   staticpro (&Qtranslation_table_id);
6520
6521   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6522   staticpro (&Qtranslation_table_for_decode);
6523
6524   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6525   staticpro (&Qtranslation_table_for_encode);
6526
6527   Qsafe_chars = intern ("safe-chars");
6528   staticpro (&Qsafe_chars);
6529
6530   Qchar_coding_system = intern ("char-coding-system");
6531   staticpro (&Qchar_coding_system);
6532
6533   /* Intern this now in case it isn't already done.
6534      Setting this variable twice is harmless.
6535      But don't staticpro it here--that is done in alloc.c.  */
6536   Qchar_table_extra_slots = intern ("char-table-extra-slots");
6537   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
6538   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (1));
6539
6540   Qvalid_codes = intern ("valid-codes");
6541   staticpro (&Qvalid_codes);
6542
6543   Qemacs_mule = intern ("emacs-mule");
6544   staticpro (&Qemacs_mule);
6545
6546   Qraw_text = intern ("raw-text");
6547   staticpro (&Qraw_text);
6548
6549   defsubr (&Scoding_system_p);
6550   defsubr (&Sread_coding_system);
6551   defsubr (&Sread_non_nil_coding_system);
6552   defsubr (&Scheck_coding_system);
6553   defsubr (&Sdetect_coding_region);
6554   defsubr (&Sdetect_coding_string);
6555   defsubr (&Sfind_coding_systems_region_internal);
6556   defsubr (&Sdecode_coding_region);
6557   defsubr (&Sencode_coding_region);
6558   defsubr (&Sdecode_coding_string);
6559   defsubr (&Sencode_coding_string);
6560   defsubr (&Sdecode_sjis_char);
6561   defsubr (&Sencode_sjis_char);
6562   defsubr (&Sdecode_big5_char);
6563   defsubr (&Sencode_big5_char);
6564   defsubr (&Sset_terminal_coding_system_internal);
6565   defsubr (&Sset_safe_terminal_coding_system_internal);
6566   defsubr (&Sterminal_coding_system);
6567   defsubr (&Sset_keyboard_coding_system_internal);
6568   defsubr (&Skeyboard_coding_system);
6569   defsubr (&Sfind_operation_coding_system);
6570   defsubr (&Supdate_coding_systems_internal);
6571   defsubr (&Sset_coding_priority_internal);
6572
6573   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6574     "List of coding systems.\n\
6575 \n\
6576 Do not alter the value of this variable manually.  This variable should be\n\
6577 updated by the functions `make-coding-system' and\n\
6578 `define-coding-system-alias'.");
6579   Vcoding_system_list = Qnil;
6580
6581   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6582     "Alist of coding system names.\n\
6583 Each element is one element list of coding system name.\n\
6584 This variable is given to `completing-read' as TABLE argument.\n\
6585 \n\
6586 Do not alter the value of this variable manually.  This variable should be\n\
6587 updated by the functions `make-coding-system' and\n\
6588 `define-coding-system-alias'.");
6589   Vcoding_system_alist = Qnil;
6590
6591   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6592     "List of coding-categories (symbols) ordered by priority.");
6593   {
6594     int i;
6595
6596     Vcoding_category_list = Qnil;
6597     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6598       Vcoding_category_list
6599         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6600                  Vcoding_category_list);
6601   }
6602
6603   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6604     "Specify the coding system for read operations.\n\
6605 It is useful to bind this variable with `let', but do not set it globally.\n\
6606 If the value is a coding system, it is used for decoding on read operation.\n\
6607 If not, an appropriate element is used from one of the coding system alists:\n\
6608 There are three such tables, `file-coding-system-alist',\n\
6609 `process-coding-system-alist', and `network-coding-system-alist'.");
6610   Vcoding_system_for_read = Qnil;
6611
6612   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6613     "Specify the coding system for write operations.\n\
6614 Programs bind this variable with `let', but you should not set it globally.\n\
6615 If the value is a coding system, it is used for encoding of output,\n\
6616 when writing it to a file and when sending it to a file or subprocess.\n\
6617 \n\
6618 If this does not specify a coding system, an appropriate element\n\
6619 is used from one of the coding system alists:\n\
6620 There are three such tables, `file-coding-system-alist',\n\
6621 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6622 For output to files, if the above procedure does not specify a coding system,\n\
6623 the value of `buffer-file-coding-system' is used.");
6624   Vcoding_system_for_write = Qnil;
6625
6626   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6627     "Coding system used in the latest file or process I/O.");
6628   Vlast_coding_system_used = Qnil;
6629
6630   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6631     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6632 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6633 such conversion.");
6634   inhibit_eol_conversion = 0;
6635
6636   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6637     "Non-nil means process buffer inherits coding system of process output.\n\
6638 Bind it to t if the process output is to be treated as if it were a file\n\
6639 read from some filesystem.");
6640   inherit_process_coding_system = 0;
6641
6642   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6643     "Alist to decide a coding system to use for a file I/O operation.\n\
6644 The format is ((PATTERN . VAL) ...),\n\
6645 where PATTERN is a regular expression matching a file name,\n\
6646 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6647 If VAL is a coding system, it is used for both decoding and encoding\n\
6648 the file contents.\n\
6649 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6650 and the cdr part is used for encoding.\n\
6651 If VAL is a function symbol, the function must return a coding system\n\
6652 or a cons of coding systems which are used as above.\n\
6653 \n\
6654 See also the function `find-operation-coding-system'\n\
6655 and the variable `auto-coding-alist'.");
6656   Vfile_coding_system_alist = Qnil;
6657
6658   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6659     "Alist to decide a coding system to use for a process I/O operation.\n\
6660 The format is ((PATTERN . VAL) ...),\n\
6661 where PATTERN is a regular expression matching a program name,\n\
6662 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6663 If VAL is a coding system, it is used for both decoding what received\n\
6664 from the program and encoding what sent to the program.\n\
6665 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6666 and the cdr part is used for encoding.\n\
6667 If VAL is a function symbol, the function must return a coding system\n\
6668 or a cons of coding systems which are used as above.\n\
6669 \n\
6670 See also the function `find-operation-coding-system'.");
6671   Vprocess_coding_system_alist = Qnil;
6672
6673   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6674     "Alist to decide a coding system to use for a network I/O operation.\n\
6675 The format is ((PATTERN . VAL) ...),\n\
6676 where PATTERN is a regular expression matching a network service name\n\
6677 or is a port number to connect to,\n\
6678 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6679 If VAL is a coding system, it is used for both decoding what received\n\
6680 from the network stream and encoding what sent to the network stream.\n\
6681 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6682 and the cdr part is used for encoding.\n\
6683 If VAL is a function symbol, the function must return a coding system\n\
6684 or a cons of coding systems which are used as above.\n\
6685 \n\
6686 See also the function `find-operation-coding-system'.");
6687   Vnetwork_coding_system_alist = Qnil;
6688
6689   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6690     "Coding system to use with system messages.");
6691   Vlocale_coding_system = Qnil;
6692
6693   /* The eol mnemonics are reset in startup.el system-dependently.  */
6694   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6695     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6696   eol_mnemonic_unix = build_string (":");
6697
6698   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6699     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6700   eol_mnemonic_dos = build_string ("\\");
6701
6702   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6703     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6704   eol_mnemonic_mac = build_string ("/");
6705
6706   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6707     "*String displayed in mode line when end-of-line format is not yet determined.");
6708   eol_mnemonic_undecided = build_string (":");
6709
6710   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6711     "*Non-nil enables character translation while encoding and decoding.");
6712   Venable_character_translation = Qt;
6713
6714   DEFVAR_LISP ("standard-translation-table-for-decode",
6715     &Vstandard_translation_table_for_decode,
6716     "Table for translating characters while decoding.");
6717   Vstandard_translation_table_for_decode = Qnil;
6718
6719   DEFVAR_LISP ("standard-translation-table-for-encode",
6720     &Vstandard_translation_table_for_encode,
6721     "Table for translationg characters while encoding.");
6722   Vstandard_translation_table_for_encode = Qnil;
6723
6724   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6725     "Alist of charsets vs revision numbers.\n\
6726 While encoding, if a charset (car part of an element) is found,\n\
6727 designate it with the escape sequence identifing revision (cdr part of the element).");
6728   Vcharset_revision_alist = Qnil;
6729
6730   DEFVAR_LISP ("default-process-coding-system",
6731                &Vdefault_process_coding_system,
6732     "Cons of coding systems used for process I/O by default.\n\
6733 The car part is used for decoding a process output,\n\
6734 the cdr part is used for encoding a text to be sent to a process.");
6735   Vdefault_process_coding_system = Qnil;
6736
6737   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6738     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6739 This is a vector of length 256.\n\
6740 If Nth element is non-nil, the existence of code N in a file\n\
6741 \(or output of subprocess) doesn't prevent it to be detected as\n\
6742 a coding system of ISO 2022 variant which has a flag\n\
6743 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6744 or reading output of a subprocess.\n\
6745 Only 128th through 159th elements has a meaning.");
6746   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6747
6748   DEFVAR_LISP ("select-safe-coding-system-function",
6749                &Vselect_safe_coding_system_function,
6750     "Function to call to select safe coding system for encoding a text.\n\
6751 \n\
6752 If set, this function is called to force a user to select a proper\n\
6753 coding system which can encode the text in the case that a default\n\
6754 coding system used in each operation can't encode the text.\n\
6755 \n\
6756 The default value is `select-safe-coding-system' (which see).");
6757   Vselect_safe_coding_system_function = Qnil;
6758
6759   DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
6760     "Char-table containing safe coding systems of each characters.\n\
6761 Each element doesn't include such generic coding systems that can\n\
6762 encode any characters.   They are in the first extra slot.");
6763   Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
6764
6765   DEFVAR_BOOL ("inhibit-iso-escape-detection",
6766                &inhibit_iso_escape_detection,
6767     "If non-nil, Emacs ignores ISO2022's escape sequence on code detection.\n\
6768 \n\
6769 By default, on reading a file, Emacs tries to detect how the text is\n\
6770 encoded.  This code detection is sensitive to escape sequences.  If\n\
6771 the sequence is valid as ISO2022, the code is determined as one of\n\
6772 the ISO2022 encodings, and the file is decoded by the corresponding\n\
6773 coding system (e.g. `iso-2022-7bit').\n\
6774 \n\
6775 However, there may be a case that you want to read escape sequences in\n\
6776 a file as is.  In such a case, you can set this variable to non-nil.\n\
6777 Then, as the code detection ignores any escape sequences, no file is\n\
6778 detected as encoded in some ISO2022 encoding.  The result is that all\n\
6779 escape sequences become visible in a buffer.\n\
6780 \n\
6781 The default value is nil, and it is strongly recommended not to change\n\
6782 it.  That is because many Emacs Lisp source files that contain\n\
6783 non-ASCII characters are encoded by the coding system `iso-2022-7bit'\n\
6784 in Emacs's distribution, and they won't be decoded correctly on\n\
6785 reading if you suppress escape sequence detection.\n\
6786 \n\
6787 The other way to read escape sequences in a file without decoding is\n\
6788 to explicitly specify some coding system that doesn't use ISO2022's\n\
6789 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].");
6790   inhibit_iso_escape_detection = 0;
6791 }
6792
6793 char *
6794 emacs_strerror (error_number)
6795      int error_number;
6796 {
6797   char *str;
6798
6799   synchronize_system_messages_locale ();
6800   str = strerror (error_number);
6801
6802   if (! NILP (Vlocale_coding_system))
6803     {
6804       Lisp_Object dec = code_convert_string_norecord (build_string (str),
6805                                                       Vlocale_coding_system,
6806                                                       0);
6807       str = (char *) XSTRING (dec)->data;
6808     }
6809
6810   return str;
6811 }
6812
6813 #endif /* emacs */
6814