src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   0. General comments
  25   1. Preamble
  26   2. Emacs' internal format (emacs-mule) handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. CCL handlers
  30   6. End-of-line handlers
  31   7. C library functions
  32   8. Emacs Lisp library functions
  33   9. Post-amble
  34
  35 */
  36
  37 /*** 0. General comments ***/
  38
  39
  40 /*** GENERAL NOTE on CODING SYSTEM ***
  41
  42   Coding system is an encoding mechanism of one or more character
  43   sets.  Here's a list of coding systems which Emacs can handle.  When
  44   we say "decode", it means converting some other coding system to
  45   Emacs' internal format (emacs-mule), and when we say "encode",
  46   it means converting the coding system emacs-mule to some other
  47   coding system.
  48
  49   0. Emacs' internal format (emacs-mule)
  50
  51   Emacs itself holds a multi-lingual character in a buffer and a string
  52   in a special format.  Details are described in section 2.
  53
  54   1. ISO2022
  55
  56   The most famous coding system for multiple character sets.  X's
  57   Compound Text, various EUCs (Extended Unix Code), and coding
  58   systems used in Internet communication such as ISO-2022-JP are
  59   all variants of ISO2022.  Details are described in section 3.
  60
  61   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  62
  63   A coding system to encode character sets: ASCII, JISX0201, and
  64   JISX0208.  Widely used for PC's in Japan.  Details are described in
  65   section 4.
  66
  67   3. BIG5
  68
  69   A coding system to encode character sets: ASCII and Big5.  Widely
  70   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  71   described in section 4.  In this file, when we write "BIG5"
  72   (all uppercase), we mean the coding system, and when we write
  73   "Big5" (capitalized), we mean the character set.
  74
  75   4. Raw text
  76
  77   A coding system for a text containing random 8-bit code.  Emacs does
  78   no code conversion on such a text except for end-of-line format.
  79
  80   5. Other
  81
  82   If a user wants to read/write a text encoded in a coding system not
  83   listed above, he can supply a decoder and an encoder for it in CCL
  84   (Code Conversion Language) programs.  Emacs executes the CCL program
  85   while reading/writing.
  86
  87   Emacs represents a coding system by a Lisp symbol that has a property
  88   `coding-system'.  But, before actually using the coding system, the
  89   information about it is set in a structure of type `struct
  90   coding_system' for rapid processing.  See section 6 for more details.
  91
  92 */
  93
  94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  95
  96   How end-of-line of a text is encoded depends on a system.  For
  97   instance, Unix's format is just one byte of `line-feed' code,
  98   whereas DOS's format is two-byte sequence of `carriage-return' and
  99   `line-feed' codes.  MacOS's format is usually one byte of
 100   `carriage-return'.
 101
 102   Since text characters encoding and end-of-line encoding are
 103   independent, any coding system described above can take
 104   any format of end-of-line.  So, Emacs has information of format of
 105   end-of-line in each coding-system.  See section 6 for more details.
 106
 107 */
 108
 109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 110
 111   These functions check if a text between SRC and SRC_END is encoded
 112   in the coding system category XXX.  Each returns an integer value in
 113   which appropriate flag bits for the category XXX are set.  The flag
 114   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 115   template of these functions.  */
 116 #if 0
 117 int
 118 detect_coding_emacs_mule (src, src_end)
 119      unsigned char *src, *src_end;
 120 {
 121   ...
 122 }
 123 #endif
 124
 125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 126
 127   These functions decode SRC_BYTES length of unibyte text at SOURCE
 128   encoded in CODING to Emacs' internal format.  The resulting
 129   multibyte text goes to a place pointed to by DESTINATION, the length
 130   of which should not exceed DST_BYTES.
 131
 132   These functions set the information of original and decoded texts in
 133   the members produced, produced_char, consumed, and consumed_char of
 134   the structure *CODING.  They also set the member result to one of
 135   CODING_FINISH_XXX indicating how the decoding finished.
 136
 137   DST_BYTES zero means that source area and destination area are
 138   overlapped, which means that we can produce a decoded text until it
 139   reaches the head of not-yet-decoded source text.
 140
 141   Below is a template of these functions.  */
 142 #if 0
 143 static void
 144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148 {
 149   ...
 150 }
 151 #endif
 152
 153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 154
 155   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 156   internal multibyte format to CODING.  The resulting unibyte text
 157   goes to a place pointed to by DESTINATION, the length of which
 158   should not exceed DST_BYTES.
 159
 160   These functions set the information of original and encoded texts in
 161   the members produced, produced_char, consumed, and consumed_char of
 162   the structure *CODING.  They also set the member result to one of
 163   CODING_FINISH_XXX indicating how the encoding finished.
 164
 165   DST_BYTES zero means that source area and destination area are
 166   overlapped, which means that we can produce an encoded text until it
 167   reaches the head of not-yet-encoded source text.
 168
 169   Below is a template of these functions.  */
 170 #if 0
 171 static void
 172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 173      struct coding_system *coding;
 174      unsigned char *source, *destination;
 175      int src_bytes, dst_bytes;
 176 {
 177   ...
 178 }
 179 #endif
 180
 181 /*** COMMONLY USED MACROS ***/
 182
 183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 184    get one, two, and three bytes from the source text respectively.
 185    If there are not enough bytes in the source, they jump to
 186    `label_end_of_loop'.  The caller should set variables `coding',
 187    `src' and `src_end' to appropriate pointer in advance.  These
 188    macros are called from decoding routines `decode_coding_XXX', thus
 189    it is assumed that the source text is unibyte.  */
 190
 191 #define ONE_MORE_BYTE(c1)                                       \
 192   do {                                                          \
 193     if (src >= src_end)                                         \
 194       {                                                         \
 195         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 196         goto label_end_of_loop;                                 \
 197       }                                                         \
 198     c1 = *src++;                                                \
 199   } while (0)
 200
 201 #define TWO_MORE_BYTES(c1, c2)                                  \
 202   do {                                                          \
 203     if (src + 1 >= src_end)                                     \
 204       {                                                         \
 205         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 206         goto label_end_of_loop;                                 \
 207       }                                                         \
 208     c1 = *src++;                                                \
 209     c2 = *src++;                                                \
 210   } while (0)
 211
 212
 213 /* Set C to the next character at the source text pointed by `src'.
 214    If there are not enough characters in the source, jump to
 215    `label_end_of_loop'.  The caller should set variables `coding'
 216    `src', `src_end', and `translation_table' to appropriate pointers
 217    in advance.  This macro is used in encoding routines
 218    `encode_coding_XXX', thus it assumes that the source text is in
 219    multibyte form except for 8-bit characters.  8-bit characters are
 220    in multibyte form if coding->src_multibyte is nonzero, else they
 221    are represented by a single byte.  */
 222
 223 #define ONE_MORE_CHAR(c)                                        \
 224   do {                                                          \
 225     int len = src_end - src;                                    \
 226     int bytes;                                                  \
 227     if (len <= 0)                                               \
 228       {                                                         \
 229         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 230         goto label_end_of_loop;                                 \
 231       }                                                         \
 232     if (coding->src_multibyte                                   \
 233         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 234       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 235     else                                                        \
 236       c = *src, bytes = 1;                                      \
 237     if (!NILP (translation_table))                              \
 238       c = translate_char (translation_table, c, 0, 0, 0);       \
 239     src += bytes;                                               \
 240   } while (0)
 241
 242
 243 /* Produce a multibyte form of characater C to `dst'.  Jump to
 244    `label_end_of_loop' if there's not enough space at `dst'.
 245
 246    If we are now in the middle of composition sequence, the decoded
 247    character may be ALTCHAR (for the current composition).  In that
 248    case, the character goes to coding->cmp_data->data instead of
 249    `dst'.
 250
 251    This macro is used in decoding routines.  */
 252
 253 #define EMIT_CHAR(c)                                                    \
 254   do {                                                                  \
 255     if (! COMPOSING_P (coding)                                          \
 256         || coding->composing == COMPOSITION_RELATIVE                    \
 257         || coding->composing == COMPOSITION_WITH_RULE)                  \
 258       {                                                                 \
 259         int bytes = CHAR_BYTES (c);                                     \
 260         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 261           {                                                             \
 262             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 263             goto label_end_of_loop;                                     \
 264           }                                                             \
 265         dst += CHAR_STRING (c, dst);                                    \
 266         coding->produced_char++;                                        \
 267       }                                                                 \
 268                                                                         \
 269     if (COMPOSING_P (coding)                                            \
 270         && coding->composing != COMPOSITION_RELATIVE)                   \
 271       {                                                                 \
 272         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 273         coding->composition_rule_follows                                \
 274           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 275       }                                                                 \
 276   } while (0)
 277
 278
 279 #define EMIT_ONE_BYTE(c)                                        \
 280   do {                                                          \
 281     if (dst >= (dst_bytes ? dst_end : src))                     \
 282       {                                                         \
 283         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 284         goto label_end_of_loop;                                 \
 285       }                                                         \
 286     *dst++ = c;                                                 \
 287   } while (0)
 288
 289 #define EMIT_TWO_BYTES(c1, c2)                                  \
 290   do {                                                          \
 291     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 292       {                                                         \
 293         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 294         goto label_end_of_loop;                                 \
 295       }                                                         \
 296     *dst++ = c1, *dst++ = c2;                                   \
 297   } while (0)
 298
 299 #define EMIT_BYTES(from, to)                                    \
 300   do {                                                          \
 301     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     while (from < to)                                           \
 307       *dst++ = *from++;                                         \
 308   } while (0)
 309
 310 \f
 311 /*** 1. Preamble ***/
 312
 313 #ifdef emacs
 314 #include <config.h>
 315 #endif
 316
 317 #include <stdio.h>
 318
 319 #ifdef emacs
 320
 321 #include "lisp.h"
 322 #include "buffer.h"
 323 #include "charset.h"
 324 #include "composite.h"
 325 #include "ccl.h"
 326 #include "coding.h"
 327 #include "window.h"
 328
 329 #else  /* not emacs */
 330
 331 #include "mulelib.h"
 332
 333 #endif /* not emacs */
 334
 335 Lisp_Object Qcoding_system, Qeol_type;
 336 Lisp_Object Qbuffer_file_coding_system;
 337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 338 Lisp_Object Qno_conversion, Qundecided;
 339 Lisp_Object Qcoding_system_history;
 340 Lisp_Object Qsafe_charsets;
 341 Lisp_Object Qvalid_codes;
 342
 343 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 345 Lisp_Object Qstart_process, Qopen_network_stream;
 346 Lisp_Object Qtarget_idx;
 347
 348 Lisp_Object Vselect_safe_coding_system_function;
 349
 350 /* Mnemonic string for each format of end-of-line.  */
 351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 352 /* Mnemonic string to indicate format of end-of-line is not yet
 353    decided.  */
 354 Lisp_Object eol_mnemonic_undecided;
 355
 356 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 357    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 358 int system_eol_type;
 359
 360 #ifdef emacs
 361
 362 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 363
 364 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 365
 366 /* Coding system emacs-mule and raw-text are for converting only
 367    end-of-line format.  */
 368 Lisp_Object Qemacs_mule, Qraw_text;
 369
 370 /* Coding-systems are handed between Emacs Lisp programs and C internal
 371    routines by the following three variables.  */
 372 /* Coding-system for reading files and receiving data from process.  */
 373 Lisp_Object Vcoding_system_for_read;
 374 /* Coding-system for writing files and sending data to process.  */
 375 Lisp_Object Vcoding_system_for_write;
 376 /* Coding-system actually used in the latest I/O.  */
 377 Lisp_Object Vlast_coding_system_used;
 378
 379 /* A vector of length 256 which contains information about special
 380    Latin codes (especially for dealing with Microsoft codes).  */
 381 Lisp_Object Vlatin_extra_code_table;
 382
 383 /* Flag to inhibit code conversion of end-of-line format.  */
 384 int inhibit_eol_conversion;
 385
 386 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 387 int inherit_process_coding_system;
 388
 389 /* Coding system to be used to encode text for terminal display.  */
 390 struct coding_system terminal_coding;
 391
 392 /* Coding system to be used to encode text for terminal display when
 393    terminal coding system is nil.  */
 394 struct coding_system safe_terminal_coding;
 395
 396 /* Coding system of what is sent from terminal keyboard.  */
 397 struct coding_system keyboard_coding;
 398
 399 /* Default coding system to be used to write a file.  */
 400 struct coding_system default_buffer_file_coding;
 401
 402 Lisp_Object Vfile_coding_system_alist;
 403 Lisp_Object Vprocess_coding_system_alist;
 404 Lisp_Object Vnetwork_coding_system_alist;
 405
 406 Lisp_Object Vlocale_coding_system;
 407
 408 #endif /* emacs */
 409
 410 Lisp_Object Qcoding_category, Qcoding_category_index;
 411
 412 /* List of symbols `coding-category-xxx' ordered by priority.  */
 413 Lisp_Object Vcoding_category_list;
 414
 415 /* Table of coding categories (Lisp symbols).  */
 416 Lisp_Object Vcoding_category_table;
 417
 418 /* Table of names of symbol for each coding-category.  */
 419 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 420   "coding-category-emacs-mule",
 421   "coding-category-sjis",
 422   "coding-category-iso-7",
 423   "coding-category-iso-7-tight",
 424   "coding-category-iso-8-1",
 425   "coding-category-iso-8-2",
 426   "coding-category-iso-7-else",
 427   "coding-category-iso-8-else",
 428   "coding-category-ccl",
 429   "coding-category-big5",
 430   "coding-category-utf-8",
 431   "coding-category-utf-16-be",
 432   "coding-category-utf-16-le",
 433   "coding-category-raw-text",
 434   "coding-category-binary"
 435 };
 436
 437 /* Table of pointers to coding systems corresponding to each coding
 438    categories.  */
 439 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 440
 441 /* Table of coding category masks.  Nth element is a mask for the
 442    coding category whose priority is Nth.  */
 443 static
 444 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 445
 446 /* Flag to tell if we look up translation table on character code
 447    conversion.  */
 448 Lisp_Object Venable_character_translation;
 449 /* Standard translation table to look up on decoding (reading).  */
 450 Lisp_Object Vstandard_translation_table_for_decode;
 451 /* Standard translation table to look up on encoding (writing).  */
 452 Lisp_Object Vstandard_translation_table_for_encode;
 453
 454 Lisp_Object Qtranslation_table;
 455 Lisp_Object Qtranslation_table_id;
 456 Lisp_Object Qtranslation_table_for_decode;
 457 Lisp_Object Qtranslation_table_for_encode;
 458
 459 /* Alist of charsets vs revision number.  */
 460 Lisp_Object Vcharset_revision_alist;
 461
 462 /* Default coding systems used for process I/O.  */
 463 Lisp_Object Vdefault_process_coding_system;
 464
 465 /* Global flag to tell that we can't call post-read-conversion and
 466    pre-write-conversion functions.  Usually the value is zero, but it
 467    is set to 1 temporarily while such functions are running.  This is
 468    to avoid infinite recursive call.  */
 469 static int inhibit_pre_post_conversion;
 470
 471 \f
 472 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 473
 474 /* Emacs' internal format for encoding multiple character sets is a
 475    kind of multi-byte encoding, i.e. characters are encoded by
 476    variable-length sequences of one-byte codes.
 477
 478    ASCII characters and control characters (e.g. `tab', `newline') are
 479    represented by one-byte sequences which are their ASCII codes, in
 480    the range 0x00 through 0x7F.
 481
 482    8-bit characters of the range 0x80..0x9F are represented by
 483    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 484    code + 0x20).
 485
 486    8-bit characters of the range 0xA0..0xFF are represented by
 487    one-byte sequences which are their 8-bit code.
 488
 489    The other characters are represented by a sequence of `base
 490    leading-code', optional `extended leading-code', and one or two
 491    `position-code's.  The length of the sequence is determined by the
 492    base leading-code.  Leading-code takes the range 0x80 through 0x9F,
 493    whereas extended leading-code and position-code take the range 0xA0
 494    through 0xFF.  See `charset.h' for more details about leading-code
 495    and position-code.
 496
 497    --- CODE RANGE of Emacs' internal format ---
 498    character set        range
 499    -------------        -----
 500    ascii                0x00..0x7F
 501    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 502    eight-bit-graphic    0xA0..0xFF
 503    ELSE                 0x81..0x9F + [0xA0..0xFF]+
 504    ---------------------------------------------
 505
 506   */
 507
 508 enum emacs_code_class_type emacs_code_class[256];
 509
 510 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 511    Check if a text is encoded in Emacs' internal format.  If it is,
 512    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 513
 514 int
 515 detect_coding_emacs_mule (src, src_end)
 516       unsigned char *src, *src_end;
 517 {
 518   unsigned char c;
 519   int composing = 0;
 520   /* Dummy for ONE_MORE_BYTE.  */
 521   struct coding_system dummy_coding;
 522   struct coding_system *coding = &dummy_coding;
 523
 524   while (1)
 525     {
 526       ONE_MORE_BYTE (c);
 527
 528       if (composing)
 529         {
 530           if (c < 0xA0)
 531             composing = 0;
 532           else if (c == 0xA0)
 533             {
 534               ONE_MORE_BYTE (c);
 535               c &= 0x7F;
 536             }
 537           else
 538             c -= 0x20;
 539         }
 540
 541       if (c < 0x20)
 542         {
 543           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 544             return 0;
 545         }
 546       else if (c >= 0x80 && c < 0xA0)
 547         {
 548           if (c == 0x80)
 549             /* Old leading code for a composite character.  */
 550             composing = 1;
 551           else
 552             {
 553               unsigned char *src_base = src - 1;
 554               int bytes;
 555
 556               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 557                                                bytes))
 558                 return 0;
 559               src = src_base + bytes;
 560             }
 561         }
 562     }
 563  label_end_of_loop:
 564   return CODING_CATEGORY_MASK_EMACS_MULE;
 565 }
 566
 567
 568 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 569
 570 static void
 571 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 572      struct coding_system *coding;
 573      unsigned char *source, *destination;
 574      int src_bytes, dst_bytes;
 575 {
 576   unsigned char *src = source;
 577   unsigned char *src_end = source + src_bytes;
 578   unsigned char *dst = destination;
 579   unsigned char *dst_end = destination + dst_bytes;
 580   /* SRC_BASE remembers the start position in source in each loop.
 581      The loop will be exited when there's not enough source code, or
 582      when there's not enough destination area to produce a
 583      character.  */
 584   unsigned char *src_base;
 585
 586   coding->produced_char = 0;
 587   while (src < src_end)
 588     {
 589       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 590       int bytes;
 591
 592       src_base = src;
 593       if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 594         {
 595           p = src;
 596           src += bytes;
 597         }
 598       else
 599         {
 600           bytes = CHAR_STRING (*src, tmp);
 601           p = tmp;
 602           src++;
 603         }
 604       if (dst + bytes >= (dst_bytes ? dst_end : src))
 605         {
 606           coding->result = CODING_FINISH_INSUFFICIENT_DST;
 607           break;
 608         }
 609       while (bytes--) *dst++ = *p++;
 610       coding->produced_char++;
 611     }
 612   coding->consumed = coding->consumed_char = src_base - source;
 613   coding->produced = dst - destination;
 614 }
 615
 616 #define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
 617   encode_eol (coding, source, destination, src_bytes, dst_bytes)
 618
 619
 620 \f
 621 /*** 3. ISO2022 handlers ***/
 622
 623 /* The following note describes the coding system ISO2022 briefly.
 624    Since the intention of this note is to help understand the
 625    functions in this file, some parts are NOT ACCURATE or OVERLY
 626    SIMPLIFIED.  For thorough understanding, please refer to the
 627    original document of ISO2022.
 628
 629    ISO2022 provides many mechanisms to encode several character sets
 630    in 7-bit and 8-bit environments.  For 7-bit environments, all text
 631    is encoded using bytes less than 128.  This may make the encoded
 632    text a little bit longer, but the text passes more easily through
 633    several gateways, some of which strip off MSB (Most Significant Bit).
 634
 635    There are two kinds of character sets: control character set and
 636    graphic character set.  The former contains control characters such
 637    as `newline' and `escape' to provide control functions (control
 638    functions are also provided by escape sequences).  The latter
 639    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 640    two control character sets and many graphic character sets.
 641
 642    Graphic character sets are classified into one of the following
 643    four classes, according to the number of bytes (DIMENSION) and
 644    number of characters in one dimension (CHARS) of the set:
 645    - DIMENSION1_CHARS94
 646    - DIMENSION1_CHARS96
 647    - DIMENSION2_CHARS94
 648    - DIMENSION2_CHARS96
 649
 650    In addition, each character set is assigned an identification tag,
 651    unique for each set, called "final character" (denoted as <F>
 652    hereafter).  The <F> of each character set is decided by ECMA(*)
 653    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 654    (0x30..0x3F are for private use only).
 655
 656    Note (*): ECMA = European Computer Manufacturers Association
 657
 658    Here are examples of graphic character set [NAME(<F>)]:
 659         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 660         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 661         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 662         o DIMENSION2_CHARS96 -- none for the moment
 663
 664    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 665         C0 [0x00..0x1F] -- control character plane 0
 666         GL [0x20..0x7F] -- graphic character plane 0
 667         C1 [0x80..0x9F] -- control character plane 1
 668         GR [0xA0..0xFF] -- graphic character plane 1
 669
 670    A control character set is directly designated and invoked to C0 or
 671    C1 by an escape sequence.  The most common case is that:
 672    - ISO646's  control character set is designated/invoked to C0, and
 673    - ISO6429's control character set is designated/invoked to C1,
 674    and usually these designations/invocations are omitted in encoded
 675    text.  In a 7-bit environment, only C0 can be used, and a control
 676    character for C1 is encoded by an appropriate escape sequence to
 677    fit into the environment.  All control characters for C1 are
 678    defined to have corresponding escape sequences.
 679
 680    A graphic character set is at first designated to one of four
 681    graphic registers (G0 through G3), then these graphic registers are
 682    invoked to GL or GR.  These designations and invocations can be
 683    done independently.  The most common case is that G0 is invoked to
 684    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 685    these invocations and designations are omitted in encoded text.
 686    In a 7-bit environment, only GL can be used.
 687
 688    When a graphic character set of CHARS94 is invoked to GL, codes
 689    0x20 and 0x7F of the GL area work as control characters SPACE and
 690    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 691    be used.
 692
 693    There are two ways of invocation: locking-shift and single-shift.
 694    With locking-shift, the invocation lasts until the next different
 695    invocation, whereas with single-shift, the invocation affects the
 696    following character only and doesn't affect the locking-shift
 697    state.  Invocations are done by the following control characters or
 698    escape sequences:
 699
 700    ----------------------------------------------------------------------
 701    abbrev  function                  cntrl escape seq   description
 702    ----------------------------------------------------------------------
 703    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 704    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 705    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 706    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 707    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 708    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 709    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 710    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 711    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 712    ----------------------------------------------------------------------
 713    (*) These are not used by any known coding system.
 714
 715    Control characters for these functions are defined by macros
 716    ISO_CODE_XXX in `coding.h'.
 717
 718    Designations are done by the following escape sequences:
 719    ----------------------------------------------------------------------
 720    escape sequence      description
 721    ----------------------------------------------------------------------
 722    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 723    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 724    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 725    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 726    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 727    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 728    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 729    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 730    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 731    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 732    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 733    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 734    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 735    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 736    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 737    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 738    ----------------------------------------------------------------------
 739
 740    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 741    of dimension 1, chars 94, and final character <F>, etc...
 742
 743    Note (*): Although these designations are not allowed in ISO2022,
 744    Emacs accepts them on decoding, and produces them on encoding
 745    CHARS96 character sets in a coding system which is characterized as
 746    7-bit environment, non-locking-shift, and non-single-shift.
 747
 748    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 749    '(' can be omitted.  We refer to this as "short-form" hereafter.
 750
 751    Now you may notice that there are a lot of ways for encoding the
 752    same multilingual text in ISO2022.  Actually, there exist many
 753    coding systems such as Compound Text (used in X11's inter-client
 754    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 755    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
 756    localized platforms), and all of these are variants of ISO2022.
 757
 758    In addition to the above, Emacs handles two more kinds of escape
 759    sequences: ISO6429's direction specification and Emacs' private
 760    sequence for specifying character composition.
 761
 762    ISO6429's direction specification takes the following form:
 763         o CSI ']'      -- end of the current direction
 764         o CSI '0' ']'  -- end of the current direction
 765         o CSI '1' ']'  -- start of left-to-right text
 766         o CSI '2' ']'  -- start of right-to-left text
 767    The control character CSI (0x9B: control sequence introducer) is
 768    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 769
 770    Character composition specification takes the following form:
 771         o ESC '0' -- start relative composition
 772         o ESC '1' -- end composition
 773         o ESC '2' -- start rule-base composition (*)
 774         o ESC '3' -- start relative composition with alternate chars  (**)
 775         o ESC '4' -- start rule-base composition with alternate chars  (**)
 776   Since these are not standard escape sequences of any ISO standard,
 777   the use of them with these meanings is restricted to Emacs only.
 778
 779   (*) This form is used only in Emacs 20.5 and the older versions,
 780   but the newer versions can safely decode it.
 781   (**) This form is used only in Emacs 21.1 and the newer versions,
 782   and the older versions can't decode it.
 783
 784   Here's a list of example usages of these composition escape
 785   sequences (categorized by `enum composition_method').
 786
 787   COMPOSITION_RELATIVE:
 788         ESC 0 CHAR [ CHAR ] ESC 1
 789   COMPOSITION_WITH_RULE:
 790         ESC 2 CHAR [ RULE CHAR ] ESC 1
 791   COMPOSITION_WITH_ALTCHARS:
 792         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 793   COMPOSITION_WITH_RULE_ALTCHARS:
 794         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 795
/* Table with one entry per byte value (0..255) giving the ISO2022
   code class of that byte.  decode_coding_iso2022 dispatches on
   iso_code_class[BYTE] for every byte it reads.  The table is filled
   in elsewhere (not visible here).  */
enum iso_code_class_type iso_code_class[256];
 797
 798 #define CHARSET_OK(idx, charset)                                \
 799   (coding_system_table[idx]                                     \
 800    && (coding_system_table[idx]->safe_charsets[charset]         \
 801        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
 802             (coding_system_table[idx], charset)                 \
 803            != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
 804
 805 #define SHIFT_OUT_OK(idx) \
 806   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 807
 808 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 809    Check if a text is encoded in ISO2022.  If it is, returns an
 810    integer in which appropriate flag bits any of:
 811         CODING_CATEGORY_MASK_ISO_7
 812         CODING_CATEGORY_MASK_ISO_7_TIGHT
 813         CODING_CATEGORY_MASK_ISO_8_1
 814         CODING_CATEGORY_MASK_ISO_8_2
 815         CODING_CATEGORY_MASK_ISO_7_ELSE
 816         CODING_CATEGORY_MASK_ISO_8_ELSE
 817    are set.  If a code which should never appear in ISO2022 is found,
 818    returns 0.  */
 819
 820 int
 821 detect_coding_iso2022 (src, src_end)
 822      unsigned char *src, *src_end;
 823 {
 824   int mask = CODING_CATEGORY_MASK_ISO;
 825   int mask_found = 0;
 826   int reg[4], shift_out = 0, single_shifting = 0;
 827   int c, c1, i, charset;
 828   /* Dummy for ONE_MORE_BYTE.  */
 829   struct coding_system dummy_coding;
 830   struct coding_system *coding = &dummy_coding;
 831
 832   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 833   while (mask && src < src_end)
 834     {
 835       ONE_MORE_BYTE (c);
 836       switch (c)
 837         {
 838         case ISO_CODE_ESC:
 839           single_shifting = 0;
 840           ONE_MORE_BYTE (c);
 841           if (c >= '(' && c <= '/')
 842             {
 843               /* Designation sequence for a charset of dimension 1.  */
 844               ONE_MORE_BYTE (c1);
 845               if (c1 < ' ' || c1 >= 0x80
 846                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 847                 /* Invalid designation sequence.  Just ignore.  */
 848                 break;
 849               reg[(c - '(') % 4] = charset;
 850             }
 851           else if (c == '$')
 852             {
 853               /* Designation sequence for a charset of dimension 2.  */
 854               ONE_MORE_BYTE (c);
 855               if (c >= '@' && c <= 'B')
 856                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 857                 reg[0] = charset = iso_charset_table[1][0][c];
 858               else if (c >= '(' && c <= '/')
 859                 {
 860                   ONE_MORE_BYTE (c1);
 861                   if (c1 < ' ' || c1 >= 0x80
 862                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 863                     /* Invalid designation sequence.  Just ignore.  */
 864                     break;
 865                   reg[(c - '(') % 4] = charset;
 866                 }
 867               else
 868                 /* Invalid designation sequence.  Just ignore.  */
 869                 break;
 870             }
 871           else if (c == 'N' || c == 'O')
 872             {
 873               /* ESC <Fe> for SS2 or SS3.  */
 874               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 875               break;
 876             }
 877           else if (c >= '0' && c <= '4')
 878             {
 879               /* ESC <Fp> for start/end composition.  */
 880               mask_found |= CODING_CATEGORY_MASK_ISO;
 881               break;
 882             }
 883           else
 884             /* Invalid escape sequence.  Just ignore.  */
 885             break;
 886
 887           /* We found a valid designation sequence for CHARSET.  */
 888           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 889           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 890             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 891           else
 892             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 893           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 894             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 895           else
 896             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 897           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 898             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 899           else
 900             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 901           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 902             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 903           else
 904             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 905           break;
 906
 907         case ISO_CODE_SO:
 908           single_shifting = 0;
 909           if (shift_out == 0
 910               && (reg[1] >= 0
 911                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 912                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 913             {
 914               /* Locking shift out.  */
 915               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 916               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 917             }
 918           break;
 919
 920         case ISO_CODE_SI:
 921           single_shifting = 0;
 922           if (shift_out == 1)
 923             {
 924               /* Locking shift in.  */
 925               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 926               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 927             }
 928           break;
 929
 930         case ISO_CODE_CSI:
 931           single_shifting = 0;
 932         case ISO_CODE_SS2:
 933         case ISO_CODE_SS3:
 934           {
 935             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 936
 937             if (c != ISO_CODE_CSI)
 938               {
 939                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 940                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 941                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 942                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 943                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 944                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 945                 single_shifting = 1;
 946               }
 947             if (VECTORP (Vlatin_extra_code_table)
 948                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 949               {
 950                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 951                     & CODING_FLAG_ISO_LATIN_EXTRA)
 952                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 953                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 954                     & CODING_FLAG_ISO_LATIN_EXTRA)
 955                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 956               }
 957             mask &= newmask;
 958             mask_found |= newmask;
 959           }
 960           break;
 961
 962         default:
 963           if (c < 0x80)
 964             {
 965               single_shifting = 0;
 966               break;
 967             }
 968           else if (c < 0xA0)
 969             {
 970               single_shifting = 0;
 971               if (VECTORP (Vlatin_extra_code_table)
 972                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 973                 {
 974                   int newmask = 0;
 975
 976                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 977                       & CODING_FLAG_ISO_LATIN_EXTRA)
 978                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 979                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 980                       & CODING_FLAG_ISO_LATIN_EXTRA)
 981                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 982                   mask &= newmask;
 983                   mask_found |= newmask;
 984                 }
 985               else
 986                 return 0;
 987             }
 988           else
 989             {
 990               unsigned char *src_begin = src;
 991
 992               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
 993                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 994               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
 995               /* Check the length of succeeding codes of the range
 996                  0xA0..0FF.  If the byte length is odd, we exclude
 997                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
 998                  when we are not single shifting.  */
 999               if (!single_shifting
1000                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1001                 {
1002                   int i = 0;
1003                   while (src < src_end)
1004                     {
1005                       ONE_MORE_BYTE (c);
1006                       if (c < 0xA0)
1007                         break;
1008                       i++;
1009                     }
1010
1011                   if (i & 1 && src < src_end)
1012                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1013                   else
1014                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1015                 }
1016             }
1017           break;
1018         }
1019     }
1020  label_end_of_loop:
1021   return (mask & mask_found);
1022 }
1023
/* Decode a character whose charset is CHARSET, whose 1st position
   code is C1, and whose 2nd position code is C2, and return the
   decoded character code.  If the variable `translation_table' (taken
   from the scope where the macro is expanded) is non-nil, return the
   code translated through that table instead.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (!NILP (translation_table)                                    \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1033
/* Set designation state into CODING: designate the charset identified
   by DIMENSION, CHARS and FINAL_CHAR to graphic register REG.

   The macro uses the variable `coding' and the label
   `label_invalid_code' from the scope where it is expanded.  It jumps
   to label_invalid_code when FINAL_CHAR is out of range, when no
   usable charset is found for this coding system, or when the
   sequence must be passed through as is (see below).  REG and
   FINAL_CHAR are evaluated more than once, so pass side-effect-free
   expressions.  The local `charset' shadows any variable of the same
   name in the expanding scope.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset;                                                           \
                                                                           \
    if ((final_char) < '0' || (final_char) >= 128)                         \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    if (charset >= 0                                                       \
        && ((CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)       \
             == (reg))                                                     \
            || coding->safe_charsets[charset]))                            \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && (reg) == 0                                                  \
            && charset == CHARSET_ASCII)                                   \
          {                                                                \
            /* We should insert this designation sequence as is so         \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        /* Under CODING_MODE_DIRECTION, use the reverse charset when       \
           one exists.  */                                                 \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset;             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* Remember which register received the unusable designation.  */  \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
1069
1070 /* Allocate a memory block for storing information about compositions.
1071    The block is chained to the already allocated blocks.  */
1072
1073 static void
1074 coding_allocate_composition_data (coding, char_offset)
1075      struct coding_system *coding;
1076      int char_offset;
1077 {
1078   struct composition_data *cmp_data
1079     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1080
1081   cmp_data->char_offset = char_offset;
1082   cmp_data->used = 0;
1083   cmp_data->prev = coding->cmp_data;
1084   cmp_data->next = NULL;
1085   if (coding->cmp_data)
1086     coding->cmp_data->next = cmp_data;
1087   coding->cmp_data = cmp_data;
1088   coding->cmp_data_start = 0;
1089 }
1090
/* Record the starting position START and METHOD of one composition in
   the current composition data block of CODING.

   Four slots are reserved at the current end of the block: slot 0 is
   set to -1, slot 1 receives START plus the block's char_offset, slot
   3 receives METHOD, and slot 2 is left unset here.  The index of
   slot 0 is remembered in CODING->cmp_data_start.  CODING is
   evaluated twice.  The locals `cmp_data' and `data' shadow variables
   of the same names in the expanding scope.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  do {                                                          \
    struct composition_data *cmp_data = (coding)->cmp_data;     \
    int *data = cmp_data->data + cmp_data->used;                \
    (coding)->cmp_data_start = cmp_data->used;                  \
    data[0] = -1;                                               \
    data[1] = cmp_data->char_offset + (start);                  \
    data[3] = (int) (method);                                   \
    cmp_data->used += 4;                                        \
  } while (0)
1103
/* Record the ending position END of the current composition, i.e. the
   one whose record starts at CODING->cmp_data_start in the current
   composition data block.  Slot 0 of that record receives the number
   of slots used since the record started, and slot 2 receives END
   plus the block's char_offset.  CODING is evaluated more than once.
   The locals `cmp_data' and `data' shadow variables of the same names
   in the expanding scope.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                 \
  do {                                                          \
    struct composition_data *cmp_data = (coding)->cmp_data;     \
    int *data = cmp_data->data + (coding)->cmp_data_start;      \
    data[0] = cmp_data->used - (coding)->cmp_data_start;        \
    data[2] = cmp_data->char_offset + (end);                    \
  } while (0)
1113
/* Record one COMPONENT (alternate character or composition rule) by
   appending it to the current composition data block of CODING and
   advancing the block's `used' count by one.  Evaluates to the stored
   value.  CODING is evaluated twice, so it must not have side
   effects.  No bound check is made here.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
1118
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the character following ESC.

   The macro uses the variables `coding' and `dst' and the label
   `label_end_of_loop' from the scope where it is expanded.  C1 is
   evaluated more than once.  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (coding->composing == COMPOSITION_DISABLED)                      \
      {                                                                 \
        /* Composition handling is disabled: pass the escape sequence   \
           through to the destination as two characters.  */            \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = (c1) & 0x7f;                                           \
        coding->produced_char += 2;                                     \
      }                                                                 \
    else if (!COMPOSING_P (coding))                                     \
      {                                                                 \
        /* This is surely the start of a composition.  We must be sure  \
           that coding->cmp_data exists and has enough space to store   \
           the information about the composition.  If not, terminate    \
           the current decoding loop, allocate one more memory block    \
           for coding->cmp_data in the caller, then start the decoding  \
           loop again.  We can't allocate memory here directly because  \
           it may cause buffer/string relocation.  */                   \
        if (coding->cmp_data == NULL                                    \
            || (coding->cmp_data->used                                  \
                + COMPOSITION_DATA_MAX_BUNCH_LENGTH                     \
                >= COMPOSITION_DATA_SIZE))                              \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE         \
                             : (c1) == '2' ? COMPOSITION_WITH_RULE      \
                             : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS  \
                             : COMPOSITION_WITH_RULE_ALTCHARS);         \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,    \
                                      coding->composing);               \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* We are already handling a composition.  If the method is     \
           one of the following two, the codes following the current    \
           escape sequence are actual characters stored in a buffer.  */ \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS              \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)     \
          {                                                             \
            coding->composing = COMPOSITION_RELATIVE;                   \
            coding->composition_rule_follows = 0;                       \
          }                                                             \
      }                                                                 \
  } while (0)
1165
/* Handle composition end sequence ESC 1.  C1 is the character that
   follows ESC.

   Unless composition handling is disabled, close the record of the
   current composition at the current produced-character position and
   leave the composing state.  When it is disabled, pass ESC and C1
   through to the destination as two characters.  The macro uses the
   variables `coding' and `dst' from the scope where it is expanded.

   NOTE(review): the enabled branch does not itself check that a
   composition is in progress; the call sites visible in
   decode_coding_iso2022 test COMPOSING_P first -- confirm that every
   caller does.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (coding->composing != COMPOSITION_DISABLED)                      \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = c1;                                                    \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1182
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC) and store one encoded composition rule in
   coding->cmp_data.

   C1 must be an lvalue; it is decremented by 32 in place.  A value
   below 81 afterwards is the old one-byte format; a value in 81..92
   is the new format, whose second byte is read into the variable
   `c2' of the expanding scope with ONE_MORE_BYTE.  Any other value
   stores a rule of 0.  In every case
   coding->composition_rule_follows is cleared.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
    (c1) -= 32;                                                         \
    if ((c1) >= 81)                                                     \
      {                                                                 \
        if ((c1) < 93)          /* new format (after ver.21) */         \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            rule = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);        \
          }                                                             \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
        /* In the old format, reference point 4 maps to 10.  */         \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1207
1208
1209 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1210
1211 static void
1212 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1213      struct coding_system *coding;
1214      unsigned char *source, *destination;
1215      int src_bytes, dst_bytes;
1216 {
1217   unsigned char *src = source;
1218   unsigned char *src_end = source + src_bytes;
1219   unsigned char *dst = destination;
1220   unsigned char *dst_end = destination + dst_bytes;
1221   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1222   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1223   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1224   /* SRC_BASE remembers the start position in source in each loop.
1225      The loop will be exited when there's not enough source code
1226      (within macro ONE_MORE_BYTE), or when there's not enough
1227      destination area to produce a character (within macro
1228      EMIT_CHAR).  */
1229   unsigned char *src_base;
1230   int c, charset;
1231   Lisp_Object translation_table;
1232
1233   if (NILP (Venable_character_translation))
1234     translation_table = Qnil;
1235   else
1236     {
1237       translation_table = coding->translation_table_for_decode;
1238       if (NILP (translation_table))
1239         translation_table = Vstandard_translation_table_for_decode;
1240     }
1241
1242   coding->result = CODING_FINISH_NORMAL;
1243
1244   while (1)
1245     {
1246       int c1, c2;
1247
1248       src_base = src;
1249       ONE_MORE_BYTE (c1);
1250
1251       /* We produce no character or one character.  */
1252       switch (iso_code_class [c1])
1253         {
1254         case ISO_0x20_or_0x7F:
1255           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1256             {
1257               DECODE_COMPOSITION_RULE (c1);
1258               continue;
1259             }
1260           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1261             {
1262               /* This is SPACE or DEL.  */
1263               charset = CHARSET_ASCII;
1264               break;
1265             }
1266           /* This is a graphic character, we fall down ...  */
1267
1268         case ISO_graphic_plane_0:
1269           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1270             {
1271               DECODE_COMPOSITION_RULE (c1);
1272               continue;
1273             }
1274           charset = charset0;
1275           break;
1276
1277         case ISO_0xA0_or_0xFF:
1278           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1279               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1280             goto label_invalid_code;
1281           /* This is a graphic character, we fall down ... */
1282
1283         case ISO_graphic_plane_1:
1284           if (charset1 < 0)
1285             goto label_invalid_code;
1286           charset = charset1;
1287           break;
1288
1289         case ISO_control_0:
1290           if (COMPOSING_P (coding))
1291             DECODE_COMPOSITION_END ('1');
1292
1293           /* All ISO2022 control characters in this class have the
1294              same representation in Emacs internal format.  */
1295           if (c1 == '\n'
1296               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1297               && (coding->eol_type == CODING_EOL_CR
1298                   || coding->eol_type == CODING_EOL_CRLF))
1299             {
1300               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1301               goto label_end_of_loop;
1302             }
1303           charset = CHARSET_ASCII;
1304           break;
1305
1306         case ISO_control_1:
1307           if (COMPOSING_P (coding))
1308             DECODE_COMPOSITION_END ('1');
1309           goto label_invalid_code;
1310
1311         case ISO_carriage_return:
1312           if (COMPOSING_P (coding))
1313             DECODE_COMPOSITION_END ('1');
1314
1315           if (coding->eol_type == CODING_EOL_CR)
1316             c1 = '\n';
1317           else if (coding->eol_type == CODING_EOL_CRLF)
1318             {
1319               ONE_MORE_BYTE (c1);
1320               if (c1 != ISO_CODE_LF)
1321                 {
1322                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1323                     {
1324                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
1325                       goto label_end_of_loop;
1326                     }
1327                   src--;
1328                   c1 = '\r';
1329                 }
1330             }
1331           charset = CHARSET_ASCII;
1332           break;
1333
1334         case ISO_shift_out:
1335           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1336               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1337             goto label_invalid_code;
1338           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1339           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1340           continue;
1341
1342         case ISO_shift_in:
1343           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1344             goto label_invalid_code;
1345           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1346           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1347           continue;
1348
1349         case ISO_single_shift_2_7:
1350         case ISO_single_shift_2:
1351           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1352             goto label_invalid_code;
1353           /* SS2 is handled as an escape sequence of ESC 'N' */
1354           c1 = 'N';
1355           goto label_escape_sequence;
1356
1357         case ISO_single_shift_3:
1358           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1359             goto label_invalid_code;
1360           /* SS3 is handled as an escape sequence of ESC 'O' */
1361           c1 = 'O';
1362           goto label_escape_sequence;
1363
1364         case ISO_control_sequence_introducer:
1365           /* CSI is handled as an escape sequence of ESC '[' ...  */
1366           c1 = '[';
1367           goto label_escape_sequence;
1368
1369         case ISO_escape:
1370           ONE_MORE_BYTE (c1);
1371         label_escape_sequence:
1372           /* Escape sequences handled by Emacs are invocation,
1373              designation, direction specification, and character
1374              composition specification.  */
1375           switch (c1)
1376             {
1377             case '&':           /* revision of following character set */
1378               ONE_MORE_BYTE (c1);
1379               if (!(c1 >= '@' && c1 <= '~'))
1380                 goto label_invalid_code;
1381               ONE_MORE_BYTE (c1);
1382               if (c1 != ISO_CODE_ESC)
1383                 goto label_invalid_code;
1384               ONE_MORE_BYTE (c1);
1385               goto label_escape_sequence;
1386
1387             case '$':           /* designation of 2-byte character set */
1388               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1389                 goto label_invalid_code;
1390               ONE_MORE_BYTE (c1);
1391               if (c1 >= '@' && c1 <= 'B')
1392                 {       /* designation of JISX0208.1978, GB2312.1980,
1393                            or JISX0208.1980 */
1394                   DECODE_DESIGNATION (0, 2, 94, c1);
1395                 }
1396               else if (c1 >= 0x28 && c1 <= 0x2B)
1397                 {       /* designation of DIMENSION2_CHARS94 character set */
1398                   ONE_MORE_BYTE (c2);
1399                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1400                 }
1401               else if (c1 >= 0x2C && c1 <= 0x2F)
1402                 {       /* designation of DIMENSION2_CHARS96 character set */
1403                   ONE_MORE_BYTE (c2);
1404                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1405                 }
1406               else
1407                 goto label_invalid_code;
1408               /* We must update these variables now.  */
1409               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1410               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1411               continue;
1412
1413             case 'n':           /* invocation of locking-shift-2 */
1414               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1415                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1416                 goto label_invalid_code;
1417               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1418               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1419               continue;
1420
1421             case 'o':           /* invocation of locking-shift-3 */
1422               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1423                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1424                 goto label_invalid_code;
1425               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1426               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1427               continue;
1428
1429             case 'N':           /* invocation of single-shift-2 */
1430               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1431                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1432                 goto label_invalid_code;
1433               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1434               ONE_MORE_BYTE (c1);
1435               break;
1436
1437             case 'O':           /* invocation of single-shift-3 */
1438               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1439                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1440                 goto label_invalid_code;
1441               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1442               ONE_MORE_BYTE (c1);
1443               break;
1444
1445             case '0': case '2': case '3': case '4': /* start composition */
1446               DECODE_COMPOSITION_START (c1);
1447               continue;
1448
1449             case '1':           /* end composition */
1450               DECODE_COMPOSITION_END (c1);
1451               continue;
1452
1453             case '[':           /* specification of direction */
1454               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1455                 goto label_invalid_code;
1456               /* For the moment, nested direction is not supported.
1457                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1458                  left-to-right, and nonzero means right-to-left.  */
1459               ONE_MORE_BYTE (c1);
1460               switch (c1)
1461                 {
1462                 case ']':       /* end of the current direction */
1463                   coding->mode &= ~CODING_MODE_DIRECTION;
1464
1465                 case '0':       /* end of the current direction */
1466                 case '1':       /* start of left-to-right direction */
1467                   ONE_MORE_BYTE (c1);
1468                   if (c1 == ']')
1469                     coding->mode &= ~CODING_MODE_DIRECTION;
1470                   else
1471                     goto label_invalid_code;
1472                   break;
1473
1474                 case '2':       /* start of right-to-left direction */
1475                   ONE_MORE_BYTE (c1);
1476                   if (c1 == ']')
1477                     coding->mode |= CODING_MODE_DIRECTION;
1478                   else
1479                     goto label_invalid_code;
1480                   break;
1481
1482                 default:
1483                   goto label_invalid_code;
1484                 }
1485               continue;
1486
1487             default:
1488               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1489                 goto label_invalid_code;
1490               if (c1 >= 0x28 && c1 <= 0x2B)
1491                 {       /* designation of DIMENSION1_CHARS94 character set */
1492                   ONE_MORE_BYTE (c2);
1493                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1494                 }
1495               else if (c1 >= 0x2C && c1 <= 0x2F)
1496                 {       /* designation of DIMENSION1_CHARS96 character set */
1497                   ONE_MORE_BYTE (c2);
1498                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1499                 }
1500               else
1501                 goto label_invalid_code;
1502               /* We must update these variables now.  */
1503               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1504               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1505               continue;
1506             }
1507         }
1508
1509       /* Now we know CHARSET and 1st position code C1 of a character.
1510          Produce a multibyte sequence for that character while getting
1511          2nd position code C2 if necessary.  */
1512       if (CHARSET_DIMENSION (charset) == 2)
1513         {
1514           ONE_MORE_BYTE (c2);
1515           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1516             /* C2 is not in a valid range.  */
1517             goto label_invalid_code;
1518         }
1519       c = DECODE_ISO_CHARACTER (charset, c1, c2);
1520       EMIT_CHAR (c);
1521       continue;
1522
1523     label_invalid_code:
1524       coding->errors++;
1525       if (COMPOSING_P (coding))
1526         DECODE_COMPOSITION_END ('1');
1527       src = src_base;
1528       c = *src++;
1529       EMIT_CHAR (c);
1530     }
1531
1532  label_end_of_loop:
1533   coding->consumed = coding->consumed_char = src_base - source;
1534   coding->produced = dst - destination;
1535   return;
1536 }
1537
1538
1539 /* ISO2022 encoding stuff.  */
1540
1541 /*
1542    It is not enough to say just "ISO2022" on encoding, we have to
1543    specify more details.  In Emacs, each coding system of ISO2022
1544    variant has the following specifications:
1545         1. Initial designation to G0 thru G3.
1546         2. Allows short-form designation?
1547         3. ASCII should be designated to G0 before control characters?
1548         4. ASCII should be designated to G0 at end of line?
1549         5. 7-bit environment or 8-bit environment?
1550         6. Use locking-shift?
1551         7. Use Single-shift?
1552    And the following two are only for Japanese:
1553         8. Use ASCII in place of JIS0201-1976-Roman?
1554         9. Use JISX0208-1983 in place of JISX0208-1978?
1555    These specifications are encoded in `coding->flags' as flag bits
1556    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1557    details.
1558 */
1559
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past the bytes written, and record the
   new designation in CODING.

   The sequence is ESC, then '$' when the dimension of CHARSET is not
   1, then an intermediate byte picked by REG from "()*+" (94-char
   sets) or ",-./" (96-char sets), then the final byte of CHARSET.
   The intermediate byte is left out (short form) only for a
   multi-dimension 94-char set designated to register 0 whose final
   byte is '@', 'A', or 'B', and only when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.  When the revision number of
   CHARSET is below 255, ESC '&' ('@' + revision) is emitted first.

   CHARSET and REG are evaluated more than once.  DST is taken from
   the scope of the caller.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);        \
    int rev_number = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset); \
    int multi_dim = (CHARSET_DIMENSION (charset) != 1);                 \
    int is_94 = (CHARSET_CHARS (charset) == 94);                        \
                                                                        \
    if (rev_number < 255)                                               \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + rev_number;                                      \
        dst += 3;                                                       \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (multi_dim)                                                      \
      *dst++ = '$';                                                     \
    /* Only the short form drops the intermediate byte.  */             \
    if (! (multi_dim                                                    \
           && is_94                                                     \
           && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)              \
           && reg == 0                                                  \
           && final_byte >= '@' && final_byte <= 'B'))                  \
      *dst++ = (unsigned char) ((is_94 ? "()*+" : ",-./")[reg]);        \
    *dst++ = final_byte;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1602
1603 /* The following two macros produce codes (control character or escape
1604    sequence) for ISO2022 single-shift functions (single-shift-2 and
1605    single-shift-3).  */
1606
/* Emit single-shift-2 at DST: ESC 'N' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, the single byte ISO_CODE_SS2
   otherwise.  Then mark CODING as single-shifting.  */
#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2;                            \
    else                                                \
      {                                                 \
        dst[0] = ISO_CODE_ESC;                          \
        dst[1] = 'N';                                   \
        dst += 2;                                       \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
1615
/* Emit single-shift-3 at DST: ESC 'O' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, the single byte ISO_CODE_SS3
   otherwise.  Then mark CODING as single-shifting.  */
#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3;                            \
    else                                                \
      {                                                 \
        dst[0] = ISO_CODE_ESC;                          \
        dst[1] = 'O';                                   \
        dst += 2;                                       \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
1624
1625 /* The following four macros produce codes (control character or
1626    escape sequence) for ISO2022 locking-shift functions (shift-in,
1627    shift-out, locking-shift-2, and locking-shift-3).  */
1628
/* Emit the shift-in code ISO_CODE_SI at DST and record in CODING that
   graphic register 0 is now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst = ISO_CODE_SI;                         \
    dst++;                                      \
  } while (0)
1634
/* Emit the shift-out code ISO_CODE_SO at DST and record in CODING that
   graphic register 1 is now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst = ISO_CODE_SO;                         \
    dst++;                                      \
  } while (0)
1640
/* Emit locking-shift-2 (ESC 'n') at DST and record in CODING that
   graphic register 2 is now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = 'n';                               \
    dst += 2;                                   \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
  } while (0)
1646
/* Emit locking-shift-3 (ESC 'o') at DST and record in CODING that
   graphic register 3 is now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = 'o';                               \
    dst += 2;                                   \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
  } while (0)
1652
/* Produce the code for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The macro loops until one of these cases emits the character:
     - a single-shift is pending in CODING: emit C1 with the high bit
       cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set (otherwise), and
       clear the pending single-shift;
     - CHARSET is on graphic plane 0: emit C1 with the high bit cleared;
     - CHARSET is on graphic plane 1: emit C1 with the high bit set;
     - CODING_FLAG_ISO_SAFE is set and CHARSET is not marked in
       CODING->safe_charsets: emit the substitution byte instead.
   Otherwise encode_invocation_designation is called and the cases are
   tried again.

   CHARSET and C1 are parenthesized in the expansion, so compound
   expressions may be passed; both may be evaluated more than once.
   CODING and DST are taken from the scope of the caller.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F;                                         \
        else                                                            \
          *dst++ = (c1) | 0x80;                                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
             && !coding->safe_charsets[(charset)])                      \
      {                                                                 \
        /* We should not encode this character; instead produce         \
           CODING_INHIBIT_CHARACTER_SUBSTITUTION once, or twice when    \
           the width of CHARSET is 2.  */                               \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
1695
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The macro loops until one of these cases emits the character:
     - a single-shift is pending in CODING: emit C1 and C2 with the
       high bit cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set
       (otherwise), and clear the pending single-shift;
     - CHARSET is on graphic plane 0: emit C1 and C2 with the high bit
       cleared;
     - CHARSET is on graphic plane 1: emit C1 and C2 with the high bit
       set;
     - CODING_FLAG_ISO_SAFE is set and CHARSET is not marked in
       CODING->safe_charsets: emit the substitution byte instead.
   Otherwise encode_invocation_designation is called and the cases are
   tried again.

   CHARSET, C1 and C2 are parenthesized in the expansion, so compound
   expressions may be passed; they may be evaluated more than once.
   CODING and DST are taken from the scope of the caller.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                   \
        else                                                            \
          *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                   \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                     \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                     \
        break;                                                          \
      }                                                                 \
    else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
             && !coding->safe_charsets[(charset)])                      \
      {                                                                 \
        /* We should not encode this character; instead produce         \
           CODING_INHIBIT_CHARACTER_SUBSTITUTION once, or twice when    \
           the width of CHARSET is 2.  */                               \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
1738
/* Produce codes for the character of CHARSET whose position-codes are
   C1 and C2.

   If CHARSET is not a defined charset, C1 is emitted as is, followed
   by C2 when C2 is not negative.  Otherwise the work is handed to
   ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
   according to the dimension of CHARSET, after these substitutions:
     - CHARSET_ASCII becomes charset_latin_jisx0201 when CODING has
       CODING_FLAG_ISO_USE_ROMAN set;
     - charset_jisx0208 becomes charset_jisx0208_1978 when CODING has
       CODING_FLAG_ISO_USE_OLDJIS set.

   CODING and DST are taken from the scope of the caller.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        int substitute = charset;                                       \
                                                                        \
        if (charset == CHARSET_ASCII                                    \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))             \
          substitute = charset_latin_jisx0201;                          \
        ENCODE_ISO_CHARACTER_DIMENSION1 (substitute, c1);               \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int substitute = charset;                                       \
                                                                        \
        if (charset == charset_jisx0208                                 \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))            \
          substitute = charset_jisx0208_1978;                           \
        ENCODE_ISO_CHARACTER_DIMENSION2 (substitute, c1, c2);           \
      }                                                                 \
  } while (0)
1767
1768 /* Produce designation and invocation codes at a place pointed by DST
1769    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1770    Return new DST.  */
1771
1772 unsigned char *
1773 encode_invocation_designation (charset, coding, dst)
1774      int charset;
1775      struct coding_system *coding;
1776      unsigned char *dst;
1777 {
1778   int reg;                      /* graphic register number */
1779
1780   /* At first, check designations.  */
1781   for (reg = 0; reg < 4; reg++)
1782     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1783       break;
1784
1785   if (reg >= 4)
1786     {
1787       /* CHARSET is not yet designated to any graphic registers.  */
1788       /* At first check the requested designation.  */
1789       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1790       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1791         /* Since CHARSET requests no special designation, designate it
1792            to graphic register 0.  */
1793         reg = 0;
1794
1795       ENCODE_DESIGNATION (charset, reg, coding);
1796     }
1797
1798   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1799       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1800     {
1801       /* Since the graphic register REG is not invoked to any graphic
1802          planes, invoke it to graphic plane 0.  */
1803       switch (reg)
1804         {
1805         case 0:                 /* graphic register 0 */
1806           ENCODE_SHIFT_IN;
1807           break;
1808
1809         case 1:                 /* graphic register 1 */
1810           ENCODE_SHIFT_OUT;
1811           break;
1812
1813         case 2:                 /* graphic register 2 */
1814           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1815             ENCODE_SINGLE_SHIFT_2;
1816           else
1817             ENCODE_LOCKING_SHIFT_2;
1818           break;
1819
1820         case 3:                 /* graphic register 3 */
1821           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1822             ENCODE_SINGLE_SHIFT_3;
1823           else
1824             ENCODE_LOCKING_SHIFT_3;
1825           break;
1826         }
1827     }
1828
1829   return dst;
1830 }
1831
/* Produce at DST the 2-byte code for the encoded composition rule
   RULE.  RULE is split into two values by COMPOSITION_DECODE_RULE;
   the first byte is 32 + 81 plus the first value, the second byte is
   32 plus the second value.  DST is taken from the scope of the
   caller and advanced by 2.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int ref_g, ref_n;                                   \
                                                        \
    COMPOSITION_DECODE_RULE (rule, ref_g, ref_n);       \
    dst[0] = 32 + 81 + ref_g;                           \
    dst[1] = 32 + ref_n;                                \
    dst += 2;                                           \
  } while (0)
1841
/* Produce codes for indicating the start of a composition sequence
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   which specify information about the composition; see the comment
   in coding.h for the format of DATA.

   DATA[3] is stored in CODING->composing and selects the code:
   ESC 0 for COMPOSITION_RELATIVE, ESC 3 for
   COMPOSITION_WITH_ALTCHARS, ESC 4 for anything else.  For the latter
   two, CODING->cmp_data_index is set to CODING->cmp_data_start + 4
   and CODING->composition_rule_follows is cleared.  DST is taken from
   the scope of the caller.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    dst[0] = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          dst[1] = '3';                                                 \
        else                                                            \
          dst[1] = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      dst[1] = '0';                                                     \
    dst += 2;                                                           \
  } while (0)
1861
/* Produce codes for indicating the end of the current composition
   (ESC 1), advance CODING->cmp_data_start by DATA[0], and set
   CODING->composing to COMPOSITION_NO.  If that advance lands exactly
   on CODING->cmp_data->used and a next composition data block exists,
   move on to that block and restart at its beginning.  DST is taken
   from the scope of the caller.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    dst[0] = ISO_CODE_ESC;                                      \
    dst[1] = '1';                                               \
    dst += 2;                                                   \
    coding->composing = COMPOSITION_NO;                         \
    coding->cmp_data_start += data[0];                          \
    if (coding->cmp_data->next                                  \
        && coding->cmp_data_start == coding->cmp_data->used)    \
      {                                                         \
        coding->cmp_data_start = 0;                             \
        coding->cmp_data = coding->cmp_data->next;              \
      }                                                         \
  } while (0)
1877
/* Produce the composition start sequence ESC 0 and set
   CODING->composing to COMPOSITION_RELATIVE.  Here the sequence does
   not open a new composition; it says that the components (alternate
   chars and composition rules) of the composition have just been
   produced and that the actual text follows in SRC.  DST is taken
   from the scope of the caller.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
1889
1890 /* The following three macros produce codes for indicating direction
1891    of text.  */
/* Produce the control sequence introducer at DST: ESC '[' when CODING
   has CODING_FLAG_ISO_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  CODING->flags is a set of flag bits, so the bit is
   tested with `&'; comparing the whole word with `==' would take the
   8-bit form whenever any other flag is set too.  CODING and DST are
   taken from the scope of the caller.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)
1899
/* Produce the code that starts right-to-left text: the control
   sequence introducer followed by '2' and ']'.  Expands to a single
   statement.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like
   statement macro, so it is used as a statement here rather than
   being given an argument list and chained with commas.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)
1902
/* Produce the code that returns to left-to-right text: the control
   sequence introducer followed by '0' and ']'.  Expands to a single
   statement.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like
   statement macro, so it is used as a statement here rather than
   being given an argument list and chained with commas.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1905
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: a shift-in when graphic
   plane 0 does not have register 0 invoked, then, for each of the
   four graphic registers that has a non-negative initial designation
   different from its current designation, a designation of the
   initial charset.  CODING and DST are taken from the scope of the
   caller.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                   \
  do {                                                                    \
    int reg_no = 0;                                                       \
                                                                          \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                      \
      ENCODE_SHIFT_IN;                                                    \
    while (reg_no < 4)                                                    \
      {                                                                   \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg_no) >= 0     \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reg_no)              \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg_no))) \
          ENCODE_DESIGNATION                                              \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg_no),        \
             reg_no, coding);                                             \
        reg_no++;                                                         \
      }                                                                   \
  } while (0)
1920
1921 /* Produce designation sequences of charsets in the line started from
1922    SRC to a place pointed by DST, and return updated DST.
1923
1924    If the current block ends before any end-of-line, we may fail to
1925    find all the necessary designations.  */
1926
1927 static unsigned char *
1928 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1929      struct coding_system *coding;
1930      Lisp_Object translation_table;
1931      unsigned char *src, *src_end, *dst;
1932 {
1933   int charset, c, found = 0, reg;
1934   /* Table of charsets to be designated to each graphic register.  */
1935   int r[4];
1936
1937   for (reg = 0; reg < 4; reg++)
1938     r[reg] = -1;
1939
1940   while (found < 4)
1941     {
1942       ONE_MORE_CHAR (c);
1943       if (c == '\n')
1944         break;
1945
1946       charset = CHAR_CHARSET (c);
1947       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1948       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1949         {
1950           found++;
1951           r[reg] = charset;
1952         }
1953     }
1954
1955  label_end_of_loop:
1956   if (found)
1957     {
1958       for (reg = 0; reg < 4; reg++)
1959         if (r[reg] >= 0
1960             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1961           ENCODE_DESIGNATION (r[reg], reg, coding);
1962     }
1963
1964   return dst;
1965 }
1966
1967 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the SRC_BYTES bytes at SOURCE and store the result at
        DESTINATION.  Before returning, the function records its progress
        in CODING->consumed, CODING->consumed_char, CODING->produced and
        CODING->produced_char.  CODING->result is set to
        CODING_FINISH_INSUFFICIENT_DST when the loop stops because too
        little destination space is left, and CODING->errors is
        incremented for each non-ASCII single-byte character, which is
        emitted unchanged.  */
1968
1969 static void
1970 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1971      struct coding_system *coding;
1972      unsigned char *source, *destination;
1973      int src_bytes, dst_bytes;
1974 {
1975   unsigned char *src = source;
1976   unsigned char *src_end = source + src_bytes;
1977   unsigned char *dst = destination;
1978   unsigned char *dst_end = destination + dst_bytes;
1979   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1980      from DST_END to assure overflow checking is necessary only at the
1981      head of loop.  */
1982   unsigned char *adjusted_dst_end = dst_end - 19;
1983   /* SRC_BASE remembers the start position in source in each loop.
1984      The loop will be exited when there's not enough source text to
1985      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1986      there's not enough destination area to produce encoded codes
1987      (within macro EMIT_BYTES).  */
1988   unsigned char *src_base;
1989   int c;
1990   Lisp_Object translation_table;
1991
       /* Choose the translation table: none when character translation
          is disabled, otherwise the coding system's own encode table,
          falling back to the standard one.  */
1992   if (NILP (Venable_character_translation))
1993     translation_table = Qnil;
1994   else
1995     {
1996       translation_table = coding->translation_table_for_encode;
1997       if (NILP (translation_table))
1998         translation_table = Vstandard_translation_table_for_encode;
1999     }
2000
2001   coding->consumed_char = 0;
2002   coding->errors = 0;
2003   while (1)
2004     {
2005       int charset, c1, c2;
2006
2007       src_base = src;
2008
           /* Stop as soon as 19 or fewer bytes of room remain.
              NOTE(review): when DST_BYTES is zero the limit is taken
              relative to SRC instead of DST_END; presumably the
              conversion is then done in place -- confirm against the
              callers.  */
2009       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2010         {
2011           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2012           break;
2013         }
2014
2015       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2016           && CODING_SPEC_ISO_BOL (coding))
2017         {
2018           /* We have to produce designation sequences if any now.  */
2019           dst = encode_designation_at_bol (coding, translation_table,
2020                                            src, src_end, dst);
2021           CODING_SPEC_ISO_BOL (coding) = 0;
2022         }
2023
2024       /* Check composition start and end.  */
2025       if (coding->composing != COMPOSITION_DISABLED
2026           && coding->cmp_data_start < coding->cmp_data->used)
2027         {
2028           struct composition_data *cmp_data = coding->cmp_data;
2029           int *data = cmp_data->data + coding->cmp_data_start;
2030           int this_pos = cmp_data->char_offset + coding->consumed_char;
2031
2032           if (coding->composing == COMPOSITION_RELATIVE)
2033             {
2034               if (this_pos == data[2])
2035                 {
2036                   ENCODE_COMPOSITION_END (coding, data);
                       /* CMP_DATA and DATA are re-read from CODING after
                          the macro above.  */
2037                   cmp_data = coding->cmp_data;
2038                   data = cmp_data->data + coding->cmp_data_start;
2039                 }
2040             }
2041           else if (COMPOSING_P (coding))
2042             {
2043               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS  */
2044               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2045                 /* We have consumed components of the composition.
2046                    What follows in SRC is the composition's base
2047                    text.  */
2048                 ENCODE_COMPOSITION_FAKE_START (coding);
2049               else
2050                 {
                       /* This C shadows the function-level C; it holds the
                          next component (a rule or a character) taken from
                          the composition data, not from SRC.  */
2051                   int c = cmp_data->data[coding->cmp_data_index++];
2052                   if (coding->composition_rule_follows)
2053                     {
2054                       ENCODE_COMPOSITION_RULE (c);
2055                       coding->composition_rule_follows = 0;
2056                     }
2057                   else
2058                     {
2059                       SPLIT_CHAR (c, charset, c1, c2);
2060                       ENCODE_ISO_CHARACTER (charset, c1, c2);
2061                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2062                         coding->composition_rule_follows = 1;
2063                     }
                       /* No source character was read in this iteration.  */
2064                   continue;
2065                 }
2066             }
2067           if (!COMPOSING_P (coding))
2068             {
2069               if (this_pos == data[1])
2070                 {
2071                   ENCODE_COMPOSITION_START (coding, data);
2072                   continue;
2073                 }
2074             }
2075         }
2076
2077       ONE_MORE_CHAR (c);
2078
2079       /* Now encode the character C.  */
2080       if (c < 0x20 || c == 0x7F)
2081         {
2082           if (c == '\r')
2083             {
2084               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2085                 {
2086                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2087                     ENCODE_RESET_PLANE_AND_REGISTER;
2088                   *dst++ = c;
                       /* This path skips the CODING->consumed_char
                          increment at the bottom of the loop.  */
2089                   continue;
2090                 }
2091               /* fall through to treat '\r' as '\n' ...  */
2092               c = '\n';
2093             }
2094           if (c == '\n')
2095             {
2096               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2097                 ENCODE_RESET_PLANE_AND_REGISTER;
2098               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2099                 bcopy (coding->spec.iso2022.initial_designation,
2100                        coding->spec.iso2022.current_designation,
2101                        sizeof coding->spec.iso2022.initial_designation);
                   /* Emit the end-of-line sequence selected by
                      CODING->eol_type; an undecided type is written as
                      LF.  */
2102               if (coding->eol_type == CODING_EOL_LF
2103                   || coding->eol_type == CODING_EOL_UNDECIDED)
2104                 *dst++ = ISO_CODE_LF;
2105               else if (coding->eol_type == CODING_EOL_CRLF)
2106                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2107               else
2108                 *dst++ = ISO_CODE_CR;
                   /* Remember that the next character starts a line.  */
2109               CODING_SPEC_ISO_BOL (coding) = 1;
2110             }
2111           else
2112             {
2113               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2114                 ENCODE_RESET_PLANE_AND_REGISTER;
2115               *dst++ = c;
2116             }
2117         }
2118       else if (ASCII_BYTE_P (c))
           /* C1 is passed only as a placeholder third argument; it has
              not been assigned on this path.  */
2119         ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2120       else if (SINGLE_BYTE_CHAR_P (c))
2121         {
               /* A non-ASCII single-byte character is emitted unchanged
                  and counted as an error.  */
2122           *dst++ = c;
2123           coding->errors++;
2124         }
2125       else
2126         {
2127           SPLIT_CHAR (c, charset, c1, c2);
2128           ENCODE_ISO_CHARACTER (charset, c1, c2);
2129         }
2130
2131       coding->consumed_char++;
2132     }
2133
2134  label_end_of_loop:
       /* SRC_BASE, not SRC, marks the end of the consumed text: a
          character whose processing was interrupted is not counted.  */
2135   coding->consumed = src_base - source;
2136   coding->produced = coding->produced_char = dst - destination;
2137 }
2138
2139 \f
2140 /*** 4. SJIS and BIG5 handlers ***/
2141
2142 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2143    quite widely.  So, for the moment, Emacs supports them in the bare
2144    C code.  But, in the future, they may be supported only by CCL.  */
2145
2146 /* SJIS is a coding system encoding three character sets: ASCII, right
2147    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2148    as is.  A character of charset katakana-jisx0201 is encoded by
2149    "position-code + 0x80".  A character of charset japanese-jisx0208
2150    is encoded in two bytes, but its two position-codes are divided and
2151    shifted so that they fit in the ranges below.
2152
2153    --- CODE RANGE of SJIS ---
2154    (character set)      (range)
2155    ASCII                0x00 .. 0x7F
2156    KATAKANA-JISX0201    0xA0 .. 0xDF
2157    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2158             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2159    -------------------------------
2160
2161 */
2162
2163 /* BIG5 is a coding system encoding two character sets: ASCII and
2164    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2165    character set and each of its characters is encoded in two bytes.
2166
2167    --- CODE RANGE of BIG5 ---
2168    (character set)      (range)
2169    ASCII                0x00 .. 0x7F
2170    Big5 (1st byte)      0xA1 .. 0xFE
2171         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2172    --------------------------
2173
2174    Since the number of characters in Big5 is larger than the maximum
2175    number of characters in an Emacs charset (96x96), it can't be handled
2176    as one charset.  So, in Emacs, Big5 is divided into two:
2177    `charset-big5-1' and `charset-big5-2'.  Both are DIMENSION2 and
2178    CHARS94.  The former contains frequently used characters and the
2179    latter contains less frequently used characters.  */
2180
2181 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2182    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2183    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2184    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.
        Each macro reads its position-code inputs into a temporary before
        it stores its position-code outputs, so the same two variables
        may be passed for both (the decoder and encoder in this file do
        so).  Every argument is parenthesized in the expansion, but input
        arguments are evaluated more than once, so they must not have
        side effects.  */
2185
2186 /* Number of Big5 characters which have the same code in 1st byte.  */
2187 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2188
2189 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
2190   do {                                                                  \
2191     unsigned int temp                                                   \
2192       = ((b1) - 0xA1) * BIG5_SAME_ROW + (b2) - ((b2) < 0x7F ? 0x40 : 0x62); \
2193     if ((b1) < 0xC9)                                                    \
2194       (charset) = charset_big5_1;                                       \
2195     else                                                                \
2196       {                                                                 \
2197         (charset) = charset_big5_2;                                     \
2198         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
2199       }                                                                 \
2200     (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
2201     (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
2202   } while (0)
2203
2204 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
2205   do {                                                                  \
2206     unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
2207     if ((charset) == charset_big5_2)                                    \
2208       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
2209     (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
2210     (b2) = temp % BIG5_SAME_ROW;                                        \
2211     (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
2212   } while (0)
2213
2214 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2215    Check if a text is encoded in SJIS.  If it is, return
2216    CODING_CATEGORY_MASK_SJIS, else return 0.
        SRC and SRC_END delimit the text.  The only thing that rejects
        the text is a byte below 0x40 directly after a byte in the range
        0x80..0x9F or 0xE0 and above; anything else is accepted.  */
2217
2218 int
2219 detect_coding_sjis (src, src_end)
2220      unsigned char *src, *src_end;
2221 {
2222   int c;
2223   /* Dummy for ONE_MORE_BYTE.  */
2224   struct coding_system dummy_coding;
2225   struct coding_system *coding = &dummy_coding;
2226
       /* The loop has no exit of its own.  ONE_MORE_BYTE (defined outside
          this section) is expected to jump to label_end_of_loop once the
          source is exhausted.  */
2227   while (1)
2228     {
2229       ONE_MORE_BYTE (c);
           /* A byte that starts a two-byte character.  */
2230       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2231         {
2232           ONE_MORE_BYTE (c);
               /* The second byte is never below 0x40.  */
2233           if (c < 0x40)
2234             return 0;
2235         }
2236     }
2237  label_end_of_loop:
2238   return CODING_CATEGORY_MASK_SJIS;
2239 }
2240
2241 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2242    Check if a text is encoded in BIG5.  If it is, return
2243    CODING_CATEGORY_MASK_BIG5, else return 0.
        SRC and SRC_END delimit the text.  The text is rejected only when
        a byte of 0xA1 or above is directly followed by a byte below 0x40
        or in the range 0x7F..0xA0.  */
2244
2245 int
2246 detect_coding_big5 (src, src_end)
2247      unsigned char *src, *src_end;
2248 {
2249   int c;
2250   /* Dummy for ONE_MORE_BYTE.  */
2251   struct coding_system dummy_coding;
2252   struct coding_system *coding = &dummy_coding;
2253
       /* The loop has no exit of its own.  ONE_MORE_BYTE (defined outside
          this section) is expected to jump to label_end_of_loop once the
          source is exhausted.  */
2254   while (1)
2255     {
2256       ONE_MORE_BYTE (c);
           /* A byte that starts a two-byte character.  */
2257       if (c >= 0xA1)
2258         {
2259           ONE_MORE_BYTE (c);
               /* Valid second bytes are 0x40..0x7E and 0xA1 and above.  */
2260           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2261             return 0;
2262         }
2263     }
2264  label_end_of_loop:
2265   return CODING_CATEGORY_MASK_BIG5;
2266 }
2267
2268 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2269    Check if a text is encoded in UTF-8.  If it is, return
2270    CODING_CATEGORY_MASK_UTF_8, else return 0.
        Only the structure of each sequence is checked: a leading octet
        must be followed by the number of extra octets it announces (one
        to five).  The decoded values themselves are not examined.  */
2271
     /* Classify an octet C by its high-order bits.  */
2272 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2273 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2274 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2275 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2276 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2277 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2278 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2279
2280 int
2281 detect_coding_utf_8 (src, src_end)
2282      unsigned char *src, *src_end;
2283 {
2284   unsigned char c;
       /* Number of extra octets still expected in the current sequence.  */
2285   int seq_maybe_bytes;
2286   /* Dummy for ONE_MORE_BYTE.  */
2287   struct coding_system dummy_coding;
2288   struct coding_system *coding = &dummy_coding;
2289
       /* The loop has no exit of its own.  ONE_MORE_BYTE (defined outside
          this section) is expected to jump to label_end_of_loop once the
          source is exhausted, even in the middle of a sequence.  */
2290   while (1)
2291     {
2292       ONE_MORE_BYTE (c);
2293       if (UTF_8_1_OCTET_P (c))
2294         continue;
2295       else if (UTF_8_2_OCTET_LEADING_P (c))
2296         seq_maybe_bytes = 1;
2297       else if (UTF_8_3_OCTET_LEADING_P (c))
2298         seq_maybe_bytes = 2;
2299       else if (UTF_8_4_OCTET_LEADING_P (c))
2300         seq_maybe_bytes = 3;
2301       else if (UTF_8_5_OCTET_LEADING_P (c))
2302         seq_maybe_bytes = 4;
2303       else if (UTF_8_6_OCTET_LEADING_P (c))
2304         seq_maybe_bytes = 5;
2305       else
             /* A stray extra octet, or 0xFE / 0xFF.  */
2306         return 0;
2307
2308       do
2309         {
2310           ONE_MORE_BYTE (c);
2311           if (!UTF_8_EXTRA_OCTET_P (c))
2312             return 0;
2313           seq_maybe_bytes--;
2314         }
2315       while (seq_maybe_bytes > 0);
2316     }
2317
2318  label_end_of_loop:
2319   return CODING_CATEGORY_MASK_UTF_8;
2320 }
2321
2322 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2323    Check if a text is encoded in UTF-16 by looking at its first two
2324    bytes.  If they are the byte order mark in Little Endian order (FF FE)
2325    or in Big Endian order (FE FF), return CODING_CATEGORY_MASK_UTF_16_LE
2326    or CODING_CATEGORY_MASK_UTF_16_BE respectively, else return 0.  */
2327
     /* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
2328 #define UTF_16_INVALID_P(val)   \
2329   (((val) == 0xFFFE)            \
2330    || ((val) == 0xFFFF))
2331
     /* High (leading) surrogates are 0xD800..0xDBFF and low (trailing)
        surrogates are 0xDC00..0xDFFF, so the top six bits of VAL must
        match exactly; masking with anything narrower than 0xFC00 also
        accepts values outside those ranges.  */
2332 #define UTF_16_HIGH_SURROGATE_P(val) \
2333   (((val) & 0xFC00) == 0xD800)
2334
2335 #define UTF_16_LOW_SURROGATE_P(val) \
2336   (((val) & 0xFC00) == 0xDC00)
2337
2338 int
2339 detect_coding_utf_16 (src, src_end)
2340      unsigned char *src, *src_end;
2341 {
2342   unsigned char c1, c2;
2343   /* Dummy for TWO_MORE_BYTES.  */
2344   struct coding_system dummy_coding;
2345   struct coding_system *coding = &dummy_coding;
2346
       /* TWO_MORE_BYTES (defined outside this section) is expected to
          jump to label_end_of_loop when fewer than two bytes are
          available, which makes the function return 0.  */
2347   TWO_MORE_BYTES (c1, c2);
2348
2349   if ((c1 == 0xFF) && (c2 == 0xFE))
2350     return CODING_CATEGORY_MASK_UTF_16_LE;
2351   else if ((c1 == 0xFE) && (c2 == 0xFF))
2352     return CODING_CATEGORY_MASK_UTF_16_BE;
2353
2354  label_end_of_loop:
2355   return 0;
2356 }
2357
2358 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2359    If SJIS_P is 1, decode SJIS text, else decode BIG5 text.
        Decode the SRC_BYTES bytes at SOURCE into DESTINATION.  A byte
        sequence that is not valid in the selected coding system is
        counted in CODING->errors and its first byte is emitted as a
        character by itself.  On return CODING->consumed,
        CODING->consumed_char, CODING->produced and CODING->produced_char
        are updated; CODING->result is set to
        CODING_FINISH_INCONSISTENT_EOL when decoding stops at an
        end-of-line that contradicts CODING->eol_type.  */
2360
2361 static void
2362 decode_coding_sjis_big5 (coding, source, destination,
2363                          src_bytes, dst_bytes, sjis_p)
2364      struct coding_system *coding;
2365      unsigned char *source, *destination;
2366      int src_bytes, dst_bytes;
2367      int sjis_p;
2368 {
2369   unsigned char *src = source;
2370   unsigned char *src_end = source + src_bytes;
2371   unsigned char *dst = destination;
2372   unsigned char *dst_end = destination + dst_bytes;
2373   /* SRC_BASE remembers the start position in source in each loop.
2374      The loop will be exited when there's not enough source code
2375      (within macro ONE_MORE_BYTE), or when there's not enough
2376      destination area to produce a character (within macro
2377      EMIT_CHAR).  */
2378   unsigned char *src_base;
2379   Lisp_Object translation_table;
2380
       /* Choose the translation table: none when character translation
          is disabled, otherwise the coding system's own decode table,
          falling back to the standard one.  */
2381   if (NILP (Venable_character_translation))
2382     translation_table = Qnil;
2383   else
2384     {
2385       translation_table = coding->translation_table_for_decode;
2386       if (NILP (translation_table))
2387         translation_table = Vstandard_translation_table_for_decode;
2388     }
2389
2390   coding->produced_char = 0;
2391   while (1)
2392     {
2393       int c, charset, c1, c2;
2394
2395       src_base = src;
2396       ONE_MORE_BYTE (c1);
2397
2398       if (c1 < 0x80)
2399         {
2400           charset = CHARSET_ASCII;
               /* Only control codes need end-of-line processing.  */
2401           if (c1 < 0x20)
2402             {
2403               if (c1 == '\r')
2404                 {
2405                   if (coding->eol_type == CODING_EOL_CRLF)
2406                     {
2407                       ONE_MORE_BYTE (c2);
2408                       if (c2 == '\n')
2409                         c1 = c2;
2410                       else if (coding->mode
2411                                & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2412                         {
2413                           coding->result = CODING_FINISH_INCONSISTENT_EOL;
2414                           goto label_end_of_loop;
2415                         }
2416                       else
2417                         /* To process C2 again, SRC is subtracted by 1.  */
2418                         src--;
2419                     }
2420                   else if (coding->eol_type == CODING_EOL_CR)
2421                     c1 = '\n';
2422                 }
2423               else if (c1 == '\n'
2424                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2425                        && (coding->eol_type == CODING_EOL_CR
2426                            || coding->eol_type == CODING_EOL_CRLF))
2427                 {
                       /* A bare LF where CR or CRLF was expected.  */
2428                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2429                   goto label_end_of_loop;
2430                 }
2431             }
2432         }
2433       else
2434         {
2435           if (sjis_p)
2436             {
2437               if (c1 >= 0xF0)
2438                 goto label_invalid_code;
                   /* NOTE(review): 0x80 passes this test and is treated
                      as a first byte, although the SJIS range table in
                      this file starts at 0x81 -- confirm whether that is
                      intended.  */
2439               if (c1 < 0xA0 || c1 >= 0xE0)
2440                 {
2441                   /* SJIS -> JISX0208 */
2442                   ONE_MORE_BYTE (c2);
2443                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2444                     goto label_invalid_code;
2445                   DECODE_SJIS (c1, c2, c1, c2);
2446                   charset = charset_jisx0208;
2447                 }
2448               else
2449                 /* SJIS -> JISX0201-Kana */
2450                 charset = charset_katakana_jisx0201;
2451             }
2452           else
2453             {
2454               /* BIG5 -> Big5 */
2455               if (c1 < 0xA1 || c1 > 0xFE)
2456                 goto label_invalid_code;
2457               ONE_MORE_BYTE (c2);
2458               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2459                 goto label_invalid_code;
2460               DECODE_BIG5 (c1, c2, charset, c1, c2);
2461             }
2462         }
2463
           /* C2 has not been assigned here for one-byte characters other
              than a CR followed by LF.  */
2464       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2465       EMIT_CHAR (c);
2466       continue;
2467
2468     label_invalid_code:
           /* Rewind to the start of the offending sequence and emit just
              its first byte; the following bytes are decoded again from
              scratch in the next iterations.  */
2469       coding->errors++;
2470       src = src_base;
2471       c = *src++;
2472       EMIT_CHAR (c);
2473     }
2474
2475  label_end_of_loop:
       /* SRC_BASE, not SRC, marks the end of the consumed text: a
          sequence whose processing was interrupted is not counted.  */
2476   coding->consumed = coding->consumed_char = src_base - source;
2477   coding->produced = dst - destination;
2478   return;
2479 }
2480
2481 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2482    This function can encode charsets `ascii', `katakana-jisx0201',
2483    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2484    are sure that all these charsets are registered as official charset
2485    (i.e. do not have extended leading-codes).  Characters of other
2486    charsets are produced without any encoding.  If SJIS_P is 1, encode
2487    SJIS text, else encode BIG5 text.
        Encode the SRC_BYTES bytes at SOURCE into DESTINATION.  A '\r' in
        the source is treated as end-of-line only when
        CODING_MODE_SELECTIVE_DISPLAY is set in CODING->mode.  On return
        CODING->consumed, CODING->produced and CODING->produced_char are
        set, and CODING->consumed_char has been incremented once per
        encoded character.  */
2488
2489 static void
2490 encode_coding_sjis_big5 (coding, source, destination,
2491                          src_bytes, dst_bytes, sjis_p)
2492      struct coding_system *coding;
2493      unsigned char *source, *destination;
2494      int src_bytes, dst_bytes;
2495      int sjis_p;
2496 {
2497   unsigned char *src = source;
2498   unsigned char *src_end = source + src_bytes;
2499   unsigned char *dst = destination;
2500   unsigned char *dst_end = destination + dst_bytes;
2501   /* SRC_BASE remembers the start position in source in each loop.
2502      The loop will be exited when there's not enough source text to
2503      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2504      there's not enough destination area to produce encoded codes
2505      (within macro EMIT_BYTES).  */
2506   unsigned char *src_base;
2507   Lisp_Object translation_table;
2508
       /* This is an encoder, so the tables for encoding must be used,
          exactly as encode_coding_iso2022 does.  */
2509   if (NILP (Venable_character_translation))
2510     translation_table = Qnil;
2511   else
2512     {
2513       translation_table = coding->translation_table_for_encode;
2514       if (NILP (translation_table))
2515         translation_table = Vstandard_translation_table_for_encode;
2516     }
2517
2518   while (1)
2519     {
2520       int c, charset, c1, c2;
2521
2522       src_base = src;
2523       ONE_MORE_CHAR (c);
2524
2525       /* Now encode the character C.  */
2526       if (SINGLE_BYTE_CHAR_P (c))
2527         {
2528           switch (c)
2529             {
2530             case '\r':
                   /* The parentheses matter: `!' binds tighter than `&',
                      so without them the flag would be tested against
                      the negated mode word.  */
2531               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2532                 {
2533                   EMIT_ONE_BYTE (c);
2534                   break;
2535                 }
2536               c = '\n';
                   /* Fall through to treat '\r' as '\n'.  */
2537             case '\n':
2538               if (coding->eol_type == CODING_EOL_CRLF)
2539                 {
2540                   EMIT_TWO_BYTES ('\r', c);
2541                   break;
2542                 }
2543               else if (coding->eol_type == CODING_EOL_CR)
2544                 c = '\r';
                   /* Fall through to emit C.  */
2545             default:
2546               EMIT_ONE_BYTE (c);
2547             }
2548         }
2549       else
2550         {
2551           SPLIT_CHAR (c, charset, c1, c2);
2552           if (sjis_p)
2553             {
2554               if (charset == charset_jisx0208
2555                   || charset == charset_jisx0208_1978)
2556                 {
2557                   ENCODE_SJIS (c1, c2, c1, c2);
2558                   EMIT_TWO_BYTES (c1, c2);
2559                 }
                   /* katakana-jisx0201 is encoded as "position-code +
                      0x80".  */
                   else if (charset == charset_katakana_jisx0201)
                     EMIT_ONE_BYTE (c1 | 0x80);
2560               else if (charset == charset_latin_jisx0201)
2561                 EMIT_ONE_BYTE (c1);
2562               else
2563                 /* There's no way other than producing the internal
2564                    codes as is.  */
2565                 EMIT_BYTES (src_base, src);
2566             }
2567           else
2568             {
2569               if (charset == charset_big5_1 || charset == charset_big5_2)
2570                 {
2571                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
2572                   EMIT_TWO_BYTES (c1, c2);
2573                 }
2574               else
2575                 /* There's no way other than producing the internal
2576                    codes as is.  */
2577                 EMIT_BYTES (src_base, src);
2578             }
2579         }
2580       coding->consumed_char++;
2581     }
2582
2583  label_end_of_loop:
       /* SRC_BASE, not SRC, marks the end of the consumed text: a
          character whose processing was interrupted is not counted.  */
2584   coding->consumed = src_base - source;
2585   coding->produced = coding->produced_char = dst - destination;
2586 }
2587
2588 \f
2589 /*** 5. CCL handlers ***/
2590
2591 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2592    Check if a text is encoded in a coding system of which
2593    encoder/decoder are written in CCL program.  If it is, return
2594    CODING_CATEGORY_MASK_CCL, else return 0.
        SRC and SRC_END delimit the text.  The text qualifies when every
        byte is marked in the `valid_codes' table of the coding system
        currently registered for CODING_CATEGORY_IDX_CCL.  */
2595
2596 int
2597 detect_coding_ccl (src, src_end)
2598      unsigned char *src, *src_end;
2599 {
       /* Table indexed by byte value; a zero entry rejects that byte.  */
2600   unsigned char *valid;
2601   int c;
2602   /* Dummy for ONE_MORE_BYTE.  */
2603   struct coding_system dummy_coding;
2604   struct coding_system *coding = &dummy_coding;
2605
2606   /* No coding system is assigned to coding-category-ccl.  */
2607   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2608     return 0;
2609
2610   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
       /* The loop has no exit of its own.  ONE_MORE_BYTE (defined outside
          this section) is expected to jump to label_end_of_loop once the
          source is exhausted.  */
2611   while (1)
2612     {
2613       ONE_MORE_BYTE (c);
2614       if (! valid[c])
2615         return 0;
2616     }
2617  label_end_of_loop:
2618   return CODING_CATEGORY_MASK_CCL;
2619 }
2620
2621 \f
2622 /*** 6. End-of-line handlers ***/
2623
2624 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Copy the SRC_BYTES bytes at SOURCE to DESTINATION while converting
        the end-of-line format named by CODING->eol_type: a CR LF pair
        (CODING_EOL_CRLF) or a CR (CODING_EOL_CR) is read as one '\n',
        and any other type is copied without conversion.  If
        CODING_MODE_INHIBIT_INCONSISTENT_EOL is set in CODING->mode,
        decoding stops with CODING->result set to
        CODING_FINISH_INCONSISTENT_EOL at the first end-of-line that does
        not match.  On return CODING->consumed, CODING->consumed_char and
        CODING->produced are set.  */
2625
2626 static void
2627 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2628      struct coding_system *coding;
2629      unsigned char *source, *destination;
2630      int src_bytes, dst_bytes;
2631 {
2632   unsigned char *src = source;
2633   unsigned char *dst = destination;
2634   unsigned char *src_end = src + src_bytes;
2635   unsigned char *dst_end = dst + dst_bytes;
2636   Lisp_Object translation_table;
2637   /* SRC_BASE remembers the start position in source in each loop.
2638      The loop will be exited when there's not enough source code
2639      (within macro ONE_MORE_BYTE), or when there's not enough
2640      destination area to produce a character (within macro
2641      EMIT_CHAR).  */
2642   unsigned char *src_base;
2643   int c;
2644
       /* No character translation is done here.  TRANSLATION_TABLE is not
          referenced by name below; NOTE(review): presumably one of the
          macros needs it in scope -- confirm.  */
2645   translation_table = Qnil;
2646   switch (coding->eol_type)
2647     {
2648     case CODING_EOL_CRLF:
2649       while (1)
2650         {
2651           src_base = src;
2652           ONE_MORE_BYTE (c);
2653           if (c == '\r')
2654             {
2655               ONE_MORE_BYTE (c);
2656               if (c != '\n')
2657                 {
2658                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2659                     {
2660                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
2661                       goto label_end_of_loop;
2662                     }
                       /* A lone CR: emit it as is and process the byte
                          after it again in the next iteration.  */
2663                   src--;
2664                   c = '\r';
2665                 }
2666             }
2667           else if (c == '\n'
2668                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2669             {
                   /* A bare LF where CR LF was expected.  */
2670               coding->result = CODING_FINISH_INCONSISTENT_EOL;
2671               goto label_end_of_loop;
2672             }
2673           EMIT_CHAR (c);
2674         }
2675       break;
2676
2677     case CODING_EOL_CR:
2678       while (1)
2679         {
2680           src_base = src;
2681           ONE_MORE_BYTE (c);
2682           if (c == '\n')
2683             {
                   /* An LF where CR was expected; it is copied unchanged
                      unless inconsistency is inhibited.  */
2684               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2685                 {
2686                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2687                   goto label_end_of_loop;
2688                 }
2689             }
2690           else if (c == '\r')
2691             c = '\n';
2692           EMIT_CHAR (c);
2693         }
2694       break;
2695
2696     default:                    /* no need for EOL handling */
2697       while (1)
2698         {
2699           src_base = src;
2700           ONE_MORE_BYTE (c);
2701           EMIT_CHAR (c);
2702         }
2703     }
2704
2705  label_end_of_loop:
       /* SRC_BASE, not SRC, marks the end of the consumed text: a
          sequence whose processing was interrupted is not counted.  */
2706   coding->consumed = coding->consumed_char = src_base - source;
2707   coding->produced = dst - destination;
2708   return;
2709 }
2710
2711 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2712    format of end-of-line according to `coding->eol_type'.  It also
2713    converts the multibyte form of 8-bit characters to unibyte if
2714    CODING->src_multibyte is nonzero.  If `coding->mode &
2715    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2716    also means end-of-line.
        Either SRC_BYTES or DST_BYTES may be zero; the buffers are then
        not examined.  On return CODING->consumed and CODING->produced
        are set, and CODING->result is set to
        CODING_FINISH_INSUFFICIENT_SRC or CODING_FINISH_INSUFFICIENT_DST
        when not all of the source could be processed.  */
2717
2718 static void
2719 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2720      struct coding_system *coding;
2721      unsigned char *source, *destination;
2722      int src_bytes, dst_bytes;
2723 {
2724   unsigned char *src = source;
2725   unsigned char *dst = destination;
2726   unsigned char *src_end = src + src_bytes;
2727   unsigned char *dst_end = dst + dst_bytes;
2728   Lisp_Object translation_table;
2729   /* SRC_BASE remembers the start position in source in each loop.
2730      The loop will be exited when there's not enough source text to
2731      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2732      there's not enough destination area to produce encoded codes
2733      (within macro EMIT_BYTES).  */
2734   unsigned char *src_base;
2735   int c;
2736   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2737
2738   translation_table = Qnil;
       /* A trailing LEADING_CODE_8_BIT_CONTROL is left unconsumed
          (presumably because the byte that completes it has not arrived
          yet).  The SRC_BYTES test keeps us from reading the byte before
          an empty source.  */
2739   if (coding->src_multibyte && src_bytes > 0
2740       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2741     {
2742       src_end--;
2743       src_bytes--;
2744       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2745     }
2746
2747   if (coding->eol_type == CODING_EOL_CRLF)
2748     {
2749       while (src < src_end)
2750         {
2751           src_base = src;
2752           c = *src++;
2753           if (c >= 0x20)
2754             EMIT_ONE_BYTE (c);
2755           else if (c == '\n' || (c == '\r' && selective_display))
2756             EMIT_TWO_BYTES ('\r', '\n');
2757           else
2758             EMIT_ONE_BYTE (c);
2759         }
           /* The whole source was consumed.  */
2760       src_base = src;
2761     label_end_of_loop:
2762       ;
2763     }
2764   else
2765     {
           /* The output is as long as the input here, so copy in bulk
              and then patch the end-of-line bytes in DESTINATION.  */
2766       if (src_bytes <= dst_bytes)
2767         {
2768           safe_bcopy (src, dst, src_bytes);
2769           src_base = src_end;
2770           dst += src_bytes;
2771         }
2772       else
2773         {
               /* Do not cut between LEADING_CODE_8_BIT_CONTROL and the
                  byte after it.  The DST_BYTES test keeps us from
                  reading the byte before SRC when there is no
                  destination room at all.  */
2774           if (coding->src_multibyte && dst_bytes > 0
2775               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2776             dst_bytes--;
2777           safe_bcopy (src, dst, dst_bytes);
2778           src_base = src + dst_bytes;
2779           dst = destination + dst_bytes;
2780           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2781         }
           /* From here on SRC walks over the copied bytes in
              DESTINATION.  */
2782       if (coding->eol_type == CODING_EOL_CR)
2783         {
2784           for (src = destination; src < dst; src++)
2785             if (*src == '\n') *src = '\r';
2786         }
2787       else if (selective_display)
2788         {
2789           for (src = destination; src < dst; src++)
2790             if (*src == '\r') *src = '\n';
2791         }
2792     }
2793   if (coding->src_multibyte)
2794     dst = destination + str_as_unibyte (destination, dst - destination);
2795
2796   coding->consumed = src_base - source;
2797   coding->produced = dst - destination;
2798 }
2799
2800 \f
2801 /*** 7. C library functions ***/
2802
2803 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
2804    has a property `coding-system'.  The value of this property is a
2805    vector of length 5 (called the coding-vector).  Among the elements of
2806    this vector, the first (element[0]) and the fifth (element[4])
2807    carry important information for decoding/encoding.  Before
2808    decoding/encoding, this information should be set in fields of a
2809    structure of type `coding_system'.
2810
2811    A value of property `coding-system' can be a symbol of another
2812    subsidiary coding-system.  In that case, Emacs gets coding-vector
2813    from that symbol.
2814
2815    `element[0]' contains information to be set in `coding->type'.  The
2816    value and its meaning is as follows:
2817
2818    0 -- coding_type_emacs_mule
2819    1 -- coding_type_sjis
2820    2 -- coding_type_iso2022
2821    3 -- coding_type_big5
2822    4 -- coding_type_ccl encoder/decoder written in CCL
2823    nil -- coding_type_no_conversion
2824    t -- coding_type_undecided (automatic conversion on decoding,
2825                                no-conversion on encoding)
2826
2827    `element[4]' contains information to be set in `coding->flags' and
2828    `coding->spec'.  The meaning varies by `coding->type'.
2829
2830    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2831    of length 32 (of which the first 13 sub-elements are used now).
2832    Meanings of these sub-elements are:
2833
2834    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2835         If the value is an integer of valid charset, the charset is
2836         assumed to be designated to graphic register N initially.
2837
2838         If the value is negative, it is the negated value of a charset
2839         which reserves graphic register N, which means that the charset
2840         is not designated initially but should be designated to graphic
2841         register N just before encoding a character in that charset.
2842
2843         If the value is nil, graphic register N is never used on
2844         encoding.
2845
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
        Each value takes t or nil.  See the section ISO2022 of
        `coding.h' for more information.
2849
2850    If `coding->type' is `coding_type_big5', element[4] is t to denote
2851    BIG5-ETen or nil to denote BIG5-HKU.
2852
2853    If `coding->type' takes the other value, element[4] is ignored.
2854
2855    Emacs Lisp's coding system also carries information about format of
2856    end-of-line in a value of property `eol-type'.  If the value is
2857    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2858    means CODING_EOL_CR.  If it is not integer, it should be a vector
2859    of subsidiary coding systems of which property `eol-type' has one
2860    of above values.
2861
2862 */
2863
2864 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2865    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2866    is setup so that no conversion is necessary and return -1, else
2867    return 0.  */
2868
2869 int
2870 setup_coding_system (coding_system, coding)
2871      Lisp_Object coding_system;
2872      struct coding_system *coding;
2873 {
2874   Lisp_Object coding_spec, coding_type, eol_type, plist;
2875   Lisp_Object val;
2876   int i;
2877
2878   /* Initialize some fields required for all kinds of coding systems.  */
2879   coding->symbol = coding_system;
2880   coding->common_flags = 0;
2881   coding->mode = 0;
2882   coding->heading_ascii = -1;
2883   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2884   coding->composing = COMPOSITION_DISABLED;
2885   coding->cmp_data = NULL;
2886
2887   if (NILP (coding_system))
2888     goto label_invalid_coding_system;
2889
2890   coding_spec = Fget (coding_system, Qcoding_system);
2891
2892   if (!VECTORP (coding_spec)
2893       || XVECTOR (coding_spec)->size != 5
2894       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2895     goto label_invalid_coding_system;
2896
2897   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2898   if (VECTORP (eol_type))
2899     {
2900       coding->eol_type = CODING_EOL_UNDECIDED;
2901       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2902     }
2903   else if (XFASTINT (eol_type) == 1)
2904     {
2905       coding->eol_type = CODING_EOL_CRLF;
2906       coding->common_flags
2907         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2908     }
2909   else if (XFASTINT (eol_type) == 2)
2910     {
2911       coding->eol_type = CODING_EOL_CR;
2912       coding->common_flags
2913         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2914     }
2915   else
2916     coding->eol_type = CODING_EOL_LF;
2917
2918   coding_type = XVECTOR (coding_spec)->contents[0];
2919   /* Try short cut.  */
2920   if (SYMBOLP (coding_type))
2921     {
2922       if (EQ (coding_type, Qt))
2923         {
2924           coding->type = coding_type_undecided;
2925           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2926         }
2927       else
2928         coding->type = coding_type_no_conversion;
2929       return 0;
2930     }
2931
2932   /* Get values of coding system properties:
2933      `post-read-conversion', `pre-write-conversion',
2934      `translation-table-for-decode', `translation-table-for-encode'.  */
2935   plist = XVECTOR (coding_spec)->contents[3];
2936   /* Pre & post conversion functions should be disabled if
2937      inhibit_eol_conversion is nozero.  This is the case that a code
2938      conversion function is called while those functions are running.  */
2939   if (! inhibit_pre_post_conversion)
2940     {
2941       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2942       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2943     }
2944   val = Fplist_get (plist, Qtranslation_table_for_decode);
2945   if (SYMBOLP (val))
2946     val = Fget (val, Qtranslation_table_for_decode);
2947   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2948   val = Fplist_get (plist, Qtranslation_table_for_encode);
2949   if (SYMBOLP (val))
2950     val = Fget (val, Qtranslation_table_for_encode);
2951   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2952   val = Fplist_get (plist, Qcoding_category);
2953   if (!NILP (val))
2954     {
2955       val = Fget (val, Qcoding_category_index);
2956       if (INTEGERP (val))
2957         coding->category_idx = XINT (val);
2958       else
2959         goto label_invalid_coding_system;
2960     }
2961   else
2962     goto label_invalid_coding_system;
2963
2964   val = Fplist_get (plist, Qsafe_charsets);
2965   if (EQ (val, Qt))
2966     {
2967       for (i = 0; i <= MAX_CHARSET; i++)
2968         coding->safe_charsets[i] = 1;
2969     }
2970   else
2971     {
2972       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2973       while (CONSP (val))
2974         {
2975           if ((i = get_charset_id (XCAR (val))) >= 0)
2976             coding->safe_charsets[i] = 1;
2977           val = XCDR (val);
2978         }
2979     }
2980
2981   /* If the coding system has non-nil `composition' property, enable
2982      composition handling.  */
2983   val = Fplist_get (plist, Qcomposition);
2984   if (!NILP (val))
2985     coding->composing = COMPOSITION_NO;
2986
2987   switch (XFASTINT (coding_type))
2988     {
2989     case 0:
2990       coding->type = coding_type_emacs_mule;
2991       if (!NILP (coding->post_read_conversion))
2992         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2993       if (!NILP (coding->pre_write_conversion))
2994         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2995       break;
2996
2997     case 1:
2998       coding->type = coding_type_sjis;
2999       coding->common_flags
3000         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3001       break;
3002
3003     case 2:
3004       coding->type = coding_type_iso2022;
3005       coding->common_flags
3006         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3007       {
3008         Lisp_Object val, temp;
3009         Lisp_Object *flags;
3010         int i, charset, reg_bits = 0;
3011
3012         val = XVECTOR (coding_spec)->contents[4];
3013
3014         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3015           goto label_invalid_coding_system;
3016
3017         flags = XVECTOR (val)->contents;
3018         coding->flags
3019           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3020              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3021              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3022              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3023              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3024              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3025              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3026              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3027              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3028              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3029              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3030              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3031              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3032              );
3033
3034         /* Invoke graphic register 0 to plane 0.  */
3035         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3036         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3037         CODING_SPEC_ISO_INVOCATION (coding, 1)
3038           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3039         /* Not single shifting at first.  */
3040         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3041         /* Beginning of buffer should also be regarded as bol. */
3042         CODING_SPEC_ISO_BOL (coding) = 1;
3043
3044         for (charset = 0; charset <= MAX_CHARSET; charset++)
3045           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3046         val = Vcharset_revision_alist;
3047         while (CONSP (val))
3048           {
3049             charset = get_charset_id (Fcar_safe (XCAR (val)));
3050             if (charset >= 0
3051                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3052                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3053               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3054             val = XCDR (val);
3055           }
3056
3057         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3058            FLAGS[REG] can be one of below:
3059                 integer CHARSET: CHARSET occupies register I,
3060                 t: designate nothing to REG initially, but can be used
3061                   by any charsets,
3062                 list of integer, nil, or t: designate the first
3063                   element (if integer) to REG initially, the remaining
3064                   elements (if integer) is designated to REG on request,
3065                   if an element is t, REG can be used by any charsets,
3066                 nil: REG is never used.  */
3067         for (charset = 0; charset <= MAX_CHARSET; charset++)
3068           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3069             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3070         for (i = 0; i < 4; i++)
3071           {
3072             if (INTEGERP (flags[i])
3073                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3074                 || (charset = get_charset_id (flags[i])) >= 0)
3075               {
3076                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3077                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3078               }
3079             else if (EQ (flags[i], Qt))
3080               {
3081                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3082                 reg_bits |= 1 << i;
3083                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3084               }
3085             else if (CONSP (flags[i]))
3086               {
3087                 Lisp_Object tail;
3088                 tail = flags[i];
3089
3090                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3091                 if (INTEGERP (XCAR (tail))
3092                     && (charset = XINT (XCAR (tail)),
3093                         CHARSET_VALID_P (charset))
3094                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3095                   {
3096                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3097                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3098                   }
3099                 else
3100                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3101                 tail = XCDR (tail);
3102                 while (CONSP (tail))
3103                   {
3104                     if (INTEGERP (XCAR (tail))
3105                         && (charset = XINT (XCAR (tail)),
3106                             CHARSET_VALID_P (charset))
3107                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3108                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3109                         = i;
3110                     else if (EQ (XCAR (tail), Qt))
3111                       reg_bits |= 1 << i;
3112                     tail = XCDR (tail);
3113                   }
3114               }
3115             else
3116               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3117
3118             CODING_SPEC_ISO_DESIGNATION (coding, i)
3119               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3120           }
3121
3122         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3123           {
3124             /* REG 1 can be used only by locking shift in 7-bit env.  */
3125             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3126               reg_bits &= ~2;
3127             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3128               /* Without any shifting, only REG 0 and 1 can be used.  */
3129               reg_bits &= 3;
3130           }
3131
3132         if (reg_bits)
3133           for (charset = 0; charset <= MAX_CHARSET; charset++)
3134             {
3135               if (CHARSET_VALID_P (charset))
3136                 {
3137                   /* There exist some default graphic registers to be
3138                      used CHARSET.  */
3139
3140                   /* We had better avoid designating a charset of
3141                      CHARS96 to REG 0 as far as possible.  */
3142                   if (CHARSET_CHARS (charset) == 96)
3143                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3144                       = (reg_bits & 2
3145                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3146                   else
3147                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3148                       = (reg_bits & 1
3149                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3150                 }
3151             }
3152       }
3153       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3154       coding->spec.iso2022.last_invalid_designation_register = -1;
3155       break;
3156
3157     case 3:
3158       coding->type = coding_type_big5;
3159       coding->common_flags
3160         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3161       coding->flags
3162         = (NILP (XVECTOR (coding_spec)->contents[4])
3163            ? CODING_FLAG_BIG5_HKU
3164            : CODING_FLAG_BIG5_ETEN);
3165       break;
3166
3167     case 4:
3168       coding->type = coding_type_ccl;
3169       coding->common_flags
3170         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3171       {
3172         val = XVECTOR (coding_spec)->contents[4];
3173         if (! CONSP (val)
3174             || setup_ccl_program (&(coding->spec.ccl.decoder),
3175                                   XCAR (val)) < 0
3176             || setup_ccl_program (&(coding->spec.ccl.encoder),
3177                                   XCDR (val)) < 0)
3178           goto label_invalid_coding_system;
3179
3180         bzero (coding->spec.ccl.valid_codes, 256);
3181         val = Fplist_get (plist, Qvalid_codes);
3182         if (CONSP (val))
3183           {
3184             Lisp_Object this;
3185
3186             for (; CONSP (val); val = XCDR (val))
3187               {
3188                 this = XCAR (val);
3189                 if (INTEGERP (this)
3190                     && XINT (this) >= 0 && XINT (this) < 256)
3191                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3192                 else if (CONSP (this)
3193                          && INTEGERP (XCAR (this))
3194                          && INTEGERP (XCDR (this)))
3195                   {
3196                     int start = XINT (XCAR (this));
3197                     int end = XINT (XCDR (this));
3198
3199                     if (start >= 0 && start <= end && end < 256)
3200                       while (start <= end)
3201                         coding->spec.ccl.valid_codes[start++] = 1;
3202                   }
3203               }
3204           }
3205       }
3206       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3207       break;
3208
3209     case 5:
3210       coding->type = coding_type_raw_text;
3211       break;
3212
3213     default:
3214       goto label_invalid_coding_system;
3215     }
3216   return 0;
3217
3218  label_invalid_coding_system:
3219   coding->type = coding_type_no_conversion;
3220   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3221   coding->common_flags = 0;
3222   coding->eol_type = CODING_EOL_LF;
3223   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3224   return -1;
3225 }
3226
3227 /* Free memory blocks allocated for storing composition information.  */
3228
3229 void
3230 coding_free_composition_data (coding)
3231      struct coding_system *coding;
3232 {
3233   struct composition_data *cmp_data = coding->cmp_data, *next;
3234
3235   if (!cmp_data)
3236     return;
3237   /* Memory blocks are chained.  At first, rewind to the first, then,
3238      free blocks one by one.  */
3239   while (cmp_data->prev)
3240     cmp_data = cmp_data->prev;
3241   while (cmp_data)
3242     {
3243       next = cmp_data->next;
3244       xfree (cmp_data);
3245       cmp_data = next;
3246     }
3247   coding->cmp_data = NULL;
3248 }
3249
3250 /* Set `char_offset' member of all memory blocks pointed by
3251    coding->cmp_data to POS.  */
3252
3253 void
3254 coding_adjust_composition_offset (coding, pos)
3255      struct coding_system *coding;
3256      int pos;
3257 {
3258   struct composition_data *cmp_data;
3259
3260   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3261     cmp_data->char_offset = pos;
3262 }
3263
3264 /* Setup raw-text or one of its subsidiaries in the structure
3265    coding_system CODING according to the already setup value eol_type
3266    in CODING.  CODING should be setup for some coding system in
3267    advance.  */
3268
3269 void
3270 setup_raw_text_coding_system (coding)
3271      struct coding_system *coding;
3272 {
3273   if (coding->type != coding_type_raw_text)
3274     {
3275       coding->symbol = Qraw_text;
3276       coding->type = coding_type_raw_text;
3277       if (coding->eol_type != CODING_EOL_UNDECIDED)
3278         {
3279           Lisp_Object subsidiaries;
3280           subsidiaries = Fget (Qraw_text, Qeol_type);
3281
3282           if (VECTORP (subsidiaries)
3283               && XVECTOR (subsidiaries)->size == 3)
3284             coding->symbol
3285               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3286         }
3287       setup_coding_system (coding->symbol, coding);
3288     }
3289   return;
3290 }
3291
3292 /* Emacs has a mechanism to automatically detect a coding system if it
3293    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3294    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into the following categories:
3297
3298    o coding-category-emacs-mule
3299
3300         The category for a coding system which has the same code range
3301         as Emacs' internal format.  Assigned the coding-system (Lisp
3302         symbol) `emacs-mule' by default.
3303
3304    o coding-category-sjis
3305
3306         The category for a coding system which has the same code range
3307         as SJIS.  Assigned the coding-system (Lisp
3308         symbol) `japanese-shift-jis' by default.
3309
3310    o coding-category-iso-7
3311
3312         The category for a coding system which has the same code range
3313         as ISO2022 of 7-bit environment.  This doesn't use any locking
3314         shift and single shift functions.  This can encode/decode all
3315         charsets.  Assigned the coding-system (Lisp symbol)
3316         `iso-2022-7bit' by default.
3317
3318    o coding-category-iso-7-tight
3319
3320         Same as coding-category-iso-7 except that this can
3321         encode/decode only the specified charsets.
3322
3323    o coding-category-iso-8-1
3324
3325         The category for a coding system which has the same code range
3326         as ISO2022 of 8-bit environment and graphic plane 1 used only
3327         for DIMENSION1 charset.  This doesn't use any locking shift
3328         and single shift functions.  Assigned the coding-system (Lisp
3329         symbol) `iso-latin-1' by default.
3330
3331    o coding-category-iso-8-2
3332
3333         The category for a coding system which has the same code range
3334         as ISO2022 of 8-bit environment and graphic plane 1 used only
3335         for DIMENSION2 charset.  This doesn't use any locking shift
3336         and single shift functions.  Assigned the coding-system (Lisp
3337         symbol) `japanese-iso-8bit' by default.
3338
3339    o coding-category-iso-7-else
3340
        The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-7bit-lock' by default.
3345
3346    o coding-category-iso-8-else
3347
        The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-8bit-ss2' by default.
3352
3353    o coding-category-big5
3354
3355         The category for a coding system which has the same code range
3356         as BIG5.  Assigned the coding-system (Lisp symbol)
3357         `cn-big5' by default.
3358
3359    o coding-category-utf-8
3360
3361         The category for a coding system which has the same code range
3362         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3363         symbol) `utf-8' by default.
3364
3365    o coding-category-utf-16-be
3366
3367         The category for a coding system in which a text has an
3368         Unicode signature (cf. Unicode Standard) in the order of BIG
3369         endian at the head.  Assigned the coding-system (Lisp symbol)
3370         `utf-16-be' by default.
3371
3372    o coding-category-utf-16-le
3373
3374         The category for a coding system in which a text has an
3375         Unicode signature (cf. Unicode Standard) in the order of
3376         LITTLE endian at the head.  Assigned the coding-system (Lisp
3377         symbol) `utf-16-le' by default.
3378
3379    o coding-category-ccl
3380
3381         The category for a coding system of which encoder/decoder is
3382         written in CCL programs.  The default value is nil, i.e., no
3383         coding system is assigned.
3384
3385    o coding-category-binary
3386
3387         The category for a coding system not categorized in any of the
3388         above.  Assigned the coding-system (Lisp symbol)
3389         `no-conversion' by default.
3390
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (which is also a Lisp symbol) assigned by a user.
   What Emacs actually does is to detect a category of coding system.
   Then, it uses the `coding-system' assigned to that category.  If
   Emacs can't narrow the candidates down to a single category, it
   selects the category of the highest priority.  Priorities of
   categories are also specified by a user in a Lisp variable
   `coding-category-list'.
3398
3399 */
3400
3401 static
3402 int ascii_skip_code[256];
3403
3404 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3405    If it detects possible coding systems, return an integer in which
3406    appropriate flag bits are set.  Flag bits are defined by macros
3407    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3408    it should point the table `coding_priorities'.  In that case, only
3409    the flag bit for a coding system of the highest priority is set in
3410    the returned value.
3411
3412    How many ASCII characters are at the head is returned as *SKIP.  */
3413
3414 static int
3415 detect_coding_mask (source, src_bytes, priorities, skip)
3416      unsigned char *source;
3417      int src_bytes, *priorities, *skip;
3418 {
3419   register unsigned char c;
3420   unsigned char *src = source, *src_end = source + src_bytes;
3421   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3422   int i, idx;
3423
3424   /* At first, skip all ASCII characters and control characters except
3425      for three ISO2022 specific control characters.  */
3426   ascii_skip_code[ISO_CODE_SO] = 0;
3427   ascii_skip_code[ISO_CODE_SI] = 0;
3428   ascii_skip_code[ISO_CODE_ESC] = 0;
3429
3430  label_loop_detect_coding:
3431   while (src < src_end && ascii_skip_code[*src]) src++;
3432   *skip = src - source;
3433
3434   if (src >= src_end)
3435     /* We found nothing other than ASCII.  There's nothing to do.  */
3436     return 0;
3437
3438   c = *src;
3439   /* The text seems to be encoded in some multilingual coding system.
3440      Now, try to find in which coding system the text is encoded.  */
3441   if (c < 0x80)
3442     {
3443       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3444       /* C is an ISO2022 specific control code of C0.  */
3445       mask = detect_coding_iso2022 (src, src_end);
3446       if (mask == 0)
3447         {
3448           /* No valid ISO2022 code follows C.  Try again.  */
3449           src++;
3450           if (c == ISO_CODE_ESC)
3451             ascii_skip_code[ISO_CODE_ESC] = 1;
3452           else
3453             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3454           goto label_loop_detect_coding;
3455         }
3456       if (priorities)
3457         {
3458           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3459             {
3460               if (mask & priorities[i])
3461                 return priorities[i];
3462             }
3463           return CODING_CATEGORY_MASK_RAW_TEXT;
3464         }
3465     }
3466   else
3467     {
3468       int try;
3469
3470       if (c < 0xA0)
3471         {
3472           /* C is the first byte of SJIS character code,
3473              or a leading-code of Emacs' internal format (emacs-mule),
3474              or the first byte of UTF-16.  */
3475           try = (CODING_CATEGORY_MASK_SJIS
3476                   | CODING_CATEGORY_MASK_EMACS_MULE
3477                   | CODING_CATEGORY_MASK_UTF_16_BE
3478                   | CODING_CATEGORY_MASK_UTF_16_LE);
3479
3480           /* Or, if C is a special latin extra code,
3481              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3482              or is an ISO2022 control-sequence-introducer (CSI),
3483              we should also consider the possibility of ISO2022 codings.  */
3484           if ((VECTORP (Vlatin_extra_code_table)
3485                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3486               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3487               || (c == ISO_CODE_CSI
3488                   && (src < src_end
3489                       && (*src == ']'
3490                           || ((*src == '0' || *src == '1' || *src == '2')
3491                               && src + 1 < src_end
3492                               && src[1] == ']')))))
3493             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3494                      | CODING_CATEGORY_MASK_ISO_8BIT);
3495         }
3496       else
3497         /* C is a character of ISO2022 in graphic plane right,
3498            or a SJIS's 1-byte character code (i.e. JISX0201),
3499            or the first byte of BIG5's 2-byte code,
3500            or the first byte of UTF-8/16.  */
3501         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3502                 | CODING_CATEGORY_MASK_ISO_8BIT
3503                 | CODING_CATEGORY_MASK_SJIS
3504                 | CODING_CATEGORY_MASK_BIG5
3505                 | CODING_CATEGORY_MASK_UTF_8
3506                 | CODING_CATEGORY_MASK_UTF_16_BE
3507                 | CODING_CATEGORY_MASK_UTF_16_LE);
3508
3509       /* Or, we may have to consider the possibility of CCL.  */
3510       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3511           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3512               ->spec.ccl.valid_codes)[c])
3513         try |= CODING_CATEGORY_MASK_CCL;
3514
3515       mask = 0;
3516       utf16_examined_p = iso2022_examined_p = 0;
3517       if (priorities)
3518         {
3519           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3520             {
3521               if (!iso2022_examined_p
3522                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3523                 {
3524                   mask |= detect_coding_iso2022 (src, src_end);
3525                   iso2022_examined_p = 1;
3526                 }
3527               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3528                 mask |= detect_coding_sjis (src, src_end);
3529               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3530                 mask |= detect_coding_utf_8 (src, src_end);
3531               else if (!utf16_examined_p
3532                        && (priorities[i] & try &
3533                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
3534                 {
3535                   mask |= detect_coding_utf_16 (src, src_end);
3536                   utf16_examined_p = 1;
3537                 }
3538               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3539                 mask |= detect_coding_big5 (src, src_end);
3540               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3541                 mask |= detect_coding_emacs_mule (src, src_end);
3542               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3543                 mask |= detect_coding_ccl (src, src_end);
3544               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3545                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3546               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3547                 mask |= CODING_CATEGORY_MASK_BINARY;
3548               if (mask & priorities[i])
3549                 return priorities[i];
3550             }
3551           return CODING_CATEGORY_MASK_RAW_TEXT;
3552         }
3553       if (try & CODING_CATEGORY_MASK_ISO)
3554         mask |= detect_coding_iso2022 (src, src_end);
3555       if (try & CODING_CATEGORY_MASK_SJIS)
3556         mask |= detect_coding_sjis (src, src_end);
3557       if (try & CODING_CATEGORY_MASK_BIG5)
3558         mask |= detect_coding_big5 (src, src_end);
3559       if (try & CODING_CATEGORY_MASK_UTF_8)
3560         mask |= detect_coding_utf_8 (src, src_end);
3561       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3562         mask |= detect_coding_utf_16 (src, src_end);
3563       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3564         mask |= detect_coding_emacs_mule (src, src_end);
3565       if (try & CODING_CATEGORY_MASK_CCL)
3566         mask |= detect_coding_ccl (src, src_end);
3567     }
3568   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3569 }
3570
3571 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3572    The information of the detected coding system is set in CODING.  */
3573
3574 void
3575 detect_coding (coding, src, src_bytes)
3576      struct coding_system *coding;
3577      unsigned char *src;
3578      int src_bytes;
3579 {
3580   unsigned int idx;
3581   int skip, mask, i;
3582   Lisp_Object val;
3583
3584   val = Vcoding_category_list;
3585   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3586   coding->heading_ascii = skip;
3587
3588   if (!mask) return;
3589
3590   /* We found a single coding system of the highest priority in MASK.  */
3591   idx = 0;
3592   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3593   if (! mask)
3594     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3595
3596   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3597
3598   if (coding->eol_type != CODING_EOL_UNDECIDED)
3599     {
3600       Lisp_Object tmp;
3601
3602       tmp = Fget (val, Qeol_type);
3603       if (VECTORP (tmp))
3604         val = XVECTOR (tmp)->contents[coding->eol_type];
3605     }
3606
3607   /* Setup this new coding system while preserving some slots.  */
3608   {
3609     int src_multibyte = coding->src_multibyte;
3610     int dst_multibyte = coding->dst_multibyte;
3611
3612     setup_coding_system (val, coding);
3613     coding->src_multibyte = src_multibyte;
3614     coding->dst_multibyte = dst_multibyte;
3615     coding->heading_ascii = skip;
3616   }
3617 }
3618
3619 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3620    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3621    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3622
3623    How many non-eol characters are at the head is returned as *SKIP.  */
3624
3625 #define MAX_EOL_CHECK_COUNT 3
3626
3627 static int
3628 detect_eol_type (source, src_bytes, skip)
3629      unsigned char *source;
3630      int src_bytes, *skip;
3631 {
3632   unsigned char *src = source, *src_end = src + src_bytes;
3633   unsigned char c;
3634   int total = 0;                /* How many end-of-lines are found so far.  */
3635   int eol_type = CODING_EOL_UNDECIDED;
3636   int this_eol_type;
3637
3638   *skip = 0;
3639
3640   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3641     {
3642       c = *src++;
3643       if (c == '\n' || c == '\r')
3644         {
3645           if (*skip == 0)
3646             *skip = src - 1 - source;
3647           total++;
3648           if (c == '\n')
3649             this_eol_type = CODING_EOL_LF;
3650           else if (src >= src_end || *src != '\n')
3651             this_eol_type = CODING_EOL_CR;
3652           else
3653             this_eol_type = CODING_EOL_CRLF, src++;
3654
3655           if (eol_type == CODING_EOL_UNDECIDED)
3656             /* This is the first end-of-line.  */
3657             eol_type = this_eol_type;
3658           else if (eol_type != this_eol_type)
3659             {
3660               /* The found type is different from what found before.  */
3661               eol_type = CODING_EOL_INCONSISTENT;
3662               break;
3663             }
3664         }
3665     }
3666
3667   if (*skip == 0)
3668     *skip = src_end - source;
3669   return eol_type;
3670 }
3671
3672 /* Like detect_eol_type, but detect EOL type in 2-octet
3673    big-endian/little-endian format for coding systems utf-16-be and
3674    utf-16-le.  */
3675
3676 static int
3677 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3678      unsigned char *source;
3679      int src_bytes, *skip;
3680 {
3681   unsigned char *src = source, *src_end = src + src_bytes;
3682   unsigned int c1, c2;
3683   int total = 0;                /* How many end-of-lines are found so far.  */
3684   int eol_type = CODING_EOL_UNDECIDED;
3685   int this_eol_type;
3686   int msb, lsb;
3687
3688   if (big_endian_p)
3689     msb = 0, lsb = 1;
3690   else
3691     msb = 1, lsb = 0;
3692
3693   *skip = 0;
3694
3695   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3696     {
3697       c1 = (src[msb] << 8) | (src[lsb]);
3698       src += 2;
3699
3700       if (c1 == '\n' || c1 == '\r')
3701         {
3702           if (*skip == 0)
3703             *skip = src - 2 - source;
3704           total++;
3705           if (c1 == '\n')
3706             {
3707               this_eol_type = CODING_EOL_LF;
3708             }
3709           else
3710             {
3711               if ((src + 1) >= src_end)
3712                 {
3713                   this_eol_type = CODING_EOL_CR;
3714                 }
3715               else
3716                 {
3717                   c2 = (src[msb] << 8) | (src[lsb]);
3718                   if (c2 == '\n')
3719                     this_eol_type = CODING_EOL_CRLF, src += 2;
3720                   else
3721                     this_eol_type = CODING_EOL_CR;
3722                 }
3723             }
3724
3725           if (eol_type == CODING_EOL_UNDECIDED)
3726             /* This is the first end-of-line.  */
3727             eol_type = this_eol_type;
3728           else if (eol_type != this_eol_type)
3729             {
3730               /* The found type is different from what found before.  */
3731               eol_type = CODING_EOL_INCONSISTENT;
3732               break;
3733             }
3734         }
3735     }
3736
3737   if (*skip == 0)
3738     *skip = src_end - source;
3739   return eol_type;
3740 }
3741
3742 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3743    is encoded.  If it detects an appropriate format of end-of-line, it
3744    sets the information in *CODING.  */
3745
3746 void
3747 detect_eol (coding, src, src_bytes)
3748      struct coding_system *coding;
3749      unsigned char *src;
3750      int src_bytes;
3751 {
3752   Lisp_Object val;
3753   int skip;
3754   int eol_type;
3755
3756   switch (coding->category_idx)
3757     {
3758     case CODING_CATEGORY_IDX_UTF_16_BE:
3759       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3760       break;
3761     case CODING_CATEGORY_IDX_UTF_16_LE:
3762       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3763       break;
3764     default:
3765       eol_type = detect_eol_type (src, src_bytes, &skip);
3766       break;
3767     }
3768
3769   if (coding->heading_ascii > skip)
3770     coding->heading_ascii = skip;
3771   else
3772     skip = coding->heading_ascii;
3773
3774   if (eol_type == CODING_EOL_UNDECIDED)
3775     return;
3776   if (eol_type == CODING_EOL_INCONSISTENT)
3777     {
3778 #if 0
3779       /* This code is suppressed until we find a better way to
3780          distinguish raw text file and binary file.  */
3781
3782       /* If we have already detected that the coding is raw-text, the
3783          coding should actually be no-conversion.  */
3784       if (coding->type == coding_type_raw_text)
3785         {
3786           setup_coding_system (Qno_conversion, coding);
3787           return;
3788         }
3789       /* Else, let's decode only text code anyway.  */
3790 #endif /* 0 */
3791       eol_type = CODING_EOL_LF;
3792     }
3793
3794   val = Fget (coding->symbol, Qeol_type);
3795   if (VECTORP (val) && XVECTOR (val)->size == 3)
3796     {
3797       int src_multibyte = coding->src_multibyte;
3798       int dst_multibyte = coding->dst_multibyte;
3799
3800       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3801       coding->src_multibyte = src_multibyte;
3802       coding->dst_multibyte = dst_multibyte;
3803       coding->heading_ascii = skip;
3804     }
3805 }
3806
3807 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3808
3809 #define DECODING_BUFFER_MAG(coding)                     \
3810   (coding->type == coding_type_iso2022                  \
3811    ? 3                                                  \
3812    : (coding->type == coding_type_ccl                   \
3813       ? coding->spec.ccl.decoder.buf_magnification      \
3814       : 2))
3815
3816 /* Return maximum size (bytes) of a buffer enough for decoding
3817    SRC_BYTES of text encoded in CODING.  */
3818
3819 int
3820 decoding_buffer_size (coding, src_bytes)
3821      struct coding_system *coding;
3822      int src_bytes;
3823 {
3824   return (src_bytes * DECODING_BUFFER_MAG (coding)
3825           + CONVERSION_BUFFER_EXTRA_ROOM);
3826 }
3827
3828 /* Return maximum size (bytes) of a buffer enough for encoding
3829    SRC_BYTES of text to CODING.  */
3830
3831 int
3832 encoding_buffer_size (coding, src_bytes)
3833      struct coding_system *coding;
3834      int src_bytes;
3835 {
3836   int magnification;
3837
3838   if (coding->type == coding_type_ccl)
3839     magnification = coding->spec.ccl.encoder.buf_magnification;
3840   else if (CODING_REQUIRE_ENCODING (coding))
3841     magnification = 3;
3842   else
3843     magnification = 1;
3844
3845   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3846 }
3847
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared conversion buffer and its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is replaced by a larger
   one when it is too small; the old contents are NOT preserved.

   The new buffer comes from xmalloc, and this function does not
   check the result itself.
   NOTE(review): xmalloc presumably never returns NULL, in which case
   this function never returns NULL for a positive SIZE -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling zero never reaches SIZE, so start from the minimum
         size when the buffer has not been set up yet.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size)
        real_size *= 2;

      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3876
3877 int
3878 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3879      struct coding_system *coding;
3880      unsigned char *source, *destination;
3881      int src_bytes, dst_bytes, encodep;
3882 {
3883   struct ccl_program *ccl
3884     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3885   int result;
3886
3887   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3888
3889   coding->produced = ccl_driver (ccl, source, destination,
3890                                  src_bytes, dst_bytes, &(coding->consumed));
3891   if (encodep)
3892     coding->produced_char = coding->produced;
3893   else
3894     {
3895       int bytes
3896         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3897       coding->produced = str_as_multibyte (destination, bytes,
3898                                            coding->produced,
3899                                            &(coding->produced_char));
3900     }
3901
3902   switch (ccl->status)
3903     {
3904     case CCL_STAT_SUSPEND_BY_SRC:
3905       result = CODING_FINISH_INSUFFICIENT_SRC;
3906       break;
3907     case CCL_STAT_SUSPEND_BY_DST:
3908       result = CODING_FINISH_INSUFFICIENT_DST;
3909       break;
3910     case CCL_STAT_QUIT:
3911     case CCL_STAT_INVALID_CMD:
3912       result = CODING_FINISH_INTERRUPT;
3913       break;
3914     default:
3915       result = CODING_FINISH_NORMAL;
3916       break;
3917     }
3918   return result;
3919 }
3920
3921 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
3922    decoding, it may detect coding system and format of end-of-line if
3923    those are not yet decided.  The source should be unibyte, the
3924    result is multibyte if CODING->dst_multibyte is nonzero, else
3925    unibyte.  */
3926
3927 int
3928 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3929      struct coding_system *coding;
3930      unsigned char *source, *destination;
3931      int src_bytes, dst_bytes;
3932 {
3933   if (coding->type == coding_type_undecided)
3934     detect_coding (coding, source, src_bytes);
3935
3936   if (coding->eol_type == CODING_EOL_UNDECIDED)
3937     detect_eol (coding, source, src_bytes);
3938
3939   coding->produced = coding->produced_char = 0;
3940   coding->consumed = coding->consumed_char = 0;
3941   coding->errors = 0;
3942   coding->result = CODING_FINISH_NORMAL;
3943
3944   switch (coding->type)
3945     {
3946     case coding_type_sjis:
3947       decode_coding_sjis_big5 (coding, source, destination,
3948                                src_bytes, dst_bytes, 1);
3949       break;
3950
3951     case coding_type_iso2022:
3952       decode_coding_iso2022 (coding, source, destination,
3953                              src_bytes, dst_bytes);
3954       break;
3955
3956     case coding_type_big5:
3957       decode_coding_sjis_big5 (coding, source, destination,
3958                                src_bytes, dst_bytes, 0);
3959       break;
3960
3961     case coding_type_emacs_mule:
3962       decode_coding_emacs_mule (coding, source, destination,
3963                                 src_bytes, dst_bytes);
3964       break;
3965
3966     case coding_type_ccl:
3967       ccl_coding_driver (coding, source, destination,
3968                          src_bytes, dst_bytes, 0);
3969       break;
3970
3971     default:
3972       decode_eol (coding, source, destination, src_bytes, dst_bytes);
3973     }
3974
3975   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
3976       && coding->consumed == src_bytes)
3977     coding->result = CODING_FINISH_NORMAL;
3978
3979   if (coding->mode & CODING_MODE_LAST_BLOCK
3980       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
3981     {
3982       unsigned char *src = source + coding->consumed;
3983       unsigned char *dst = destination + coding->produced;
3984
3985       src_bytes -= coding->consumed;
3986      coding->errors++;
3987       if (COMPOSING_P (coding))
3988         DECODE_COMPOSITION_END ('1');
3989       while (src_bytes--)
3990         {
3991           int c = *src++;
3992           dst += CHAR_STRING (c, dst);
3993           coding->produced_char++;
3994         }
3995       coding->consumed = coding->consumed_char = src - source;
3996       coding->produced = dst - destination;
3997     }
3998
3999   if (!coding->dst_multibyte)
4000     {
4001       coding->produced = str_as_unibyte (destination, coding->produced);
4002       coding->produced_char = coding->produced;
4003     }
4004
4005   return coding->result;
4006 }
4007
4008 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4009    multibyteness of the source is CODING->src_multibyte, the
4010    multibyteness of the result is always unibyte.  */
4011
4012 int
4013 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4014      struct coding_system *coding;
4015      unsigned char *source, *destination;
4016      int src_bytes, dst_bytes;
4017 {
4018   coding->produced = coding->produced_char = 0;
4019   coding->consumed = coding->consumed_char = 0;
4020   coding->errors = 0;
4021   coding->result = CODING_FINISH_NORMAL;
4022
4023   switch (coding->type)
4024     {
4025     case coding_type_sjis:
4026       encode_coding_sjis_big5 (coding, source, destination,
4027                                src_bytes, dst_bytes, 1);
4028       break;
4029
4030     case coding_type_iso2022:
4031       encode_coding_iso2022 (coding, source, destination,
4032                              src_bytes, dst_bytes);
4033       break;
4034
4035     case coding_type_big5:
4036       encode_coding_sjis_big5 (coding, source, destination,
4037                                src_bytes, dst_bytes, 0);
4038       break;
4039
4040     case coding_type_emacs_mule:
4041       encode_coding_emacs_mule (coding, source, destination,
4042                                 src_bytes, dst_bytes);
4043       break;
4044
4045     case coding_type_ccl:
4046       ccl_coding_driver (coding, source, destination,
4047                          src_bytes, dst_bytes, 1);
4048       break;
4049
4050     default:
4051       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4052     }
4053
4054   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4055       && coding->consumed == src_bytes)
4056     coding->result = CODING_FINISH_NORMAL;
4057
4058   if (coding->mode & CODING_MODE_LAST_BLOCK)
4059     {
4060       unsigned char *src = source + coding->consumed;
4061       unsigned char *src_end = src + src_bytes;
4062       unsigned char *dst = destination + coding->produced;
4063
4064       if (coding->type == coding_type_iso2022)
4065         ENCODE_RESET_PLANE_AND_REGISTER;
4066       if (COMPOSING_P (coding))
4067         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4068       if (coding->consumed < src_bytes)
4069         {
4070           int len = src_bytes - coding->consumed;
4071
4072           BCOPY_SHORT (source + coding->consumed, dst, len);
4073           if (coding->src_multibyte)
4074             len = str_as_unibyte (dst, len);
4075           dst += len;
4076           coding->consumed = src_bytes;
4077         }
4078       coding->produced = coding->produced_char = dst - destination;
4079     }
4080
4081   return coding->result;
4082 }
4083
4084 /* Scan text in the region between *BEG and *END (byte positions),
4085    skip characters which we don't have to decode by coding system
4086    CODING at the head and tail, then set *BEG and *END to the region
4087    of the text we actually have to convert.  The caller should move
4088    the gap out of the region in advance if the region is from a
4089    buffer.
4090
4091    If STR is not NULL, *BEG and *END are indices into STR.  */
4092
4093 static void
4094 shrink_decoding_region (beg, end, coding, str)
4095      int *beg, *end;
4096      struct coding_system *coding;
4097      unsigned char *str;
4098 {
4099   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4100   int eol_conversion;
4101   Lisp_Object translation_table;
4102
4103   if (coding->type == coding_type_ccl
4104       || coding->type == coding_type_undecided
4105       || coding->eol_type != CODING_EOL_LF
4106       || !NILP (coding->post_read_conversion)
4107       || coding->composing != COMPOSITION_DISABLED)
4108     {
4109       /* We can't skip any data.  */
4110       return;
4111     }
4112   if (coding->type == coding_type_no_conversion
4113       || coding->type == coding_type_raw_text
4114       || coding->type == coding_type_emacs_mule)
4115     {
4116       /* We need no conversion, but don't have to skip any data here.
4117          Decoding routine handles them effectively anyway.  */
4118       return;
4119     }
4120
4121   translation_table = coding->translation_table_for_decode;
4122   if (NILP (translation_table) && !NILP (Venable_character_translation))
4123     translation_table = Vstandard_translation_table_for_decode;
4124   if (CHAR_TABLE_P (translation_table))
4125     {
4126       int i;
4127       for (i = 0; i < 128; i++)
4128         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4129           break;
4130       if (i < 128)
4131         /* Some ASCII character should be tranlsated.  We give up
4132            shrinking.  */
4133         return;
4134     }
4135
4136   if (coding->heading_ascii >= 0)
4137     /* Detection routine has already found how much we can skip at the
4138        head.  */
4139     *beg += coding->heading_ascii;
4140
4141   if (str)
4142     {
4143       begp_orig = begp = str + *beg;
4144       endp_orig = endp = str + *end;
4145     }
4146   else
4147     {
4148       begp_orig = begp = BYTE_POS_ADDR (*beg);
4149       endp_orig = endp = begp + *end - *beg;
4150     }
4151
4152   switch (coding->type)
4153     {
4154     case coding_type_sjis:
4155     case coding_type_big5:
4156       /* We can skip all ASCII characters at the head.  */
4157       if (coding->heading_ascii < 0)
4158         {
4159           if (eol_conversion)
4160             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4161           else
4162             while (begp < endp && *begp < 0x80) begp++;
4163         }
4164       /* We can skip all ASCII characters at the tail except for the
4165          second byte of SJIS or BIG5 code.  */
4166       if (eol_conversion)
4167         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4168       else
4169         while (begp < endp && endp[-1] < 0x80) endp--;
4170       /* Do not consider LF as ascii if preceded by CR, since that
4171          confuses eol decoding. */
4172       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4173         endp++;
4174       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4175         endp++;
4176       break;
4177
4178     case coding_type_iso2022:
4179       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4180         /* We can't skip any data.  */
4181         break;
4182       if (coding->heading_ascii < 0)
4183         {
4184           /* We can skip all ASCII characters at the head except for a
4185              few control codes.  */
4186           while (begp < endp && (c = *begp) < 0x80
4187                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4188                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4189                  && (!eol_conversion || c != ISO_CODE_LF))
4190             begp++;
4191         }
4192       switch (coding->category_idx)
4193         {
4194         case CODING_CATEGORY_IDX_ISO_8_1:
4195         case CODING_CATEGORY_IDX_ISO_8_2:
4196           /* We can skip all ASCII characters at the tail.  */
4197           if (eol_conversion)
4198             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4199           else
4200             while (begp < endp && endp[-1] < 0x80) endp--;
4201           /* Do not consider LF as ascii if preceded by CR, since that
4202              confuses eol decoding. */
4203           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4204             endp++;
4205           break;
4206
4207         case CODING_CATEGORY_IDX_ISO_7:
4208         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4209           {
4210             /* We can skip all charactes at the tail except for 8-bit
4211                codes and ESC and the following 2-byte at the tail.  */
4212             unsigned char *eight_bit = NULL;
4213
4214             if (eol_conversion)
4215               while (begp < endp
4216                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4217                 {
4218                   if (!eight_bit && c & 0x80) eight_bit = endp;
4219                   endp--;
4220                 }
4221             else
4222               while (begp < endp
4223                      && (c = endp[-1]) != ISO_CODE_ESC)
4224                 {
4225                   if (!eight_bit && c & 0x80) eight_bit = endp;
4226                   endp--;
4227                 }
4228             /* Do not consider LF as ascii if preceded by CR, since that
4229                confuses eol decoding. */
4230             if (begp < endp && endp < endp_orig
4231                 && endp[-1] == '\r' && endp[0] == '\n')
4232               endp++;
4233             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4234               {
4235                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4236                   /* This is an ASCII designation sequence.  We can
4237                      surely skip the tail.  But, if we have
4238                      encountered an 8-bit code, skip only the codes
4239                      after that.  */
4240                   endp = eight_bit ? eight_bit : endp + 2;
4241                 else
4242                   /* Hmmm, we can't skip the tail.  */
4243                   endp = endp_orig;
4244               }
4245             else if (eight_bit)
4246               endp = eight_bit;
4247           }
4248         }
4249       break;
4250
4251     default:
4252       abort ();
4253     }
4254   *beg += begp - begp_orig;
4255   *end += endp - endp_orig;
4256   return;
4257 }
4258
4259 /* Like shrink_decoding_region but for encoding.  */
4260
4261 static void
4262 shrink_encoding_region (beg, end, coding, str)
4263      int *beg, *end;
4264      struct coding_system *coding;
4265      unsigned char *str;
4266 {
4267   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4268   int eol_conversion;
4269   Lisp_Object translation_table;
4270
4271   if (coding->type == coding_type_ccl
4272       || coding->eol_type == CODING_EOL_CRLF
4273       || coding->eol_type == CODING_EOL_CR
4274       || coding->cmp_data && coding->cmp_data->used > 0)
4275     {
4276       /* We can't skip any data.  */
4277       return;
4278     }
4279   if (coding->type == coding_type_no_conversion
4280       || coding->type == coding_type_raw_text
4281       || coding->type == coding_type_emacs_mule
4282       || coding->type == coding_type_undecided)
4283     {
4284       /* We need no conversion, but don't have to skip any data here.
4285          Encoding routine handles them effectively anyway.  */
4286       return;
4287     }
4288
4289   translation_table = coding->translation_table_for_encode;
4290   if (NILP (translation_table) && !NILP (Venable_character_translation))
4291     translation_table = Vstandard_translation_table_for_encode;
4292   if (CHAR_TABLE_P (translation_table))
4293     {
4294       int i;
4295       for (i = 0; i < 128; i++)
4296         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4297           break;
4298       if (i < 128)
4299         /* Some ASCII character should be tranlsated.  We give up
4300            shrinking.  */
4301         return;
4302     }
4303
4304   if (str)
4305     {
4306       begp_orig = begp = str + *beg;
4307       endp_orig = endp = str + *end;
4308     }
4309   else
4310     {
4311       begp_orig = begp = BYTE_POS_ADDR (*beg);
4312       endp_orig = endp = begp + *end - *beg;
4313     }
4314
4315   eol_conversion = (coding->eol_type == CODING_EOL_CR
4316                     || coding->eol_type == CODING_EOL_CRLF);
4317
4318   /* Here, we don't have to check coding->pre_write_conversion because
4319      the caller is expected to have handled it already.  */
4320   switch (coding->type)
4321     {
4322     case coding_type_iso2022:
4323       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4324         /* We can't skip any data.  */
4325         break;
4326       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4327         {
4328           unsigned char *bol = begp;
4329           while (begp < endp && *begp < 0x80)
4330             {
4331               begp++;
4332               if (begp[-1] == '\n')
4333                 bol = begp;
4334             }
4335           begp = bol;
4336           goto label_skip_tail;
4337         }
4338       /* fall down ... */
4339
4340     case coding_type_sjis:
4341     case coding_type_big5:
4342       /* We can skip all ASCII characters at the head and tail.  */
4343       if (eol_conversion)
4344         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4345       else
4346         while (begp < endp && *begp < 0x80) begp++;
4347     label_skip_tail:
4348       if (eol_conversion)
4349         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4350       else
4351         while (begp < endp && *(endp - 1) < 0x80) endp--;
4352       break;
4353
4354     default:
4355       abort ();
4356     }
4357
4358   *beg += begp - begp_orig;
4359   *end += endp - endp_orig;
4360   return;
4361 }
4362
/* Shrinking a conversion region has a cost of its own, so regions
   not longer than this many bytes are converted as they are.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG..*END of STR (or of the buffer) for encoding
   (ENCODEP nonzero) or decoding (ENCODEP zero) by CODING, provided
   the region is longer than the threshold above.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
4376
4377 static Lisp_Object
4378 code_convert_region_unwind (dummy)
4379      Lisp_Object dummy;
4380 {
4381   inhibit_pre_post_conversion = 0;
4382   return Qnil;
4383 }
4384
4385 /* Store information about all compositions in the range FROM and TO
4386    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
4387    buffer or a string, defaults to the current buffer.  */
4388
4389 void
4390 coding_save_composition (coding, from, to, obj)
4391      struct coding_system *coding;
4392      int from, to;
4393      Lisp_Object obj;
4394 {
4395   Lisp_Object prop;
4396   int start, end;
4397
4398   if (coding->composing == COMPOSITION_DISABLED)
4399     return;
4400   if (!coding->cmp_data)
4401     coding_allocate_composition_data (coding, from);
4402   if (!find_composition (from, to, &start, &end, &prop, obj)
4403       || end > to)
4404     return;
4405   if (start < from
4406       && (!find_composition (end, to, &start, &end, &prop, obj)
4407           || end > to))
4408     return;
4409   coding->composing = COMPOSITION_NO;
4410   do
4411     {
4412       if (COMPOSITION_VALID_P (start, end, prop))
4413         {
4414           enum composition_method method = COMPOSITION_METHOD (prop);
4415           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4416               >= COMPOSITION_DATA_SIZE)
4417             coding_allocate_composition_data (coding, from);
4418           /* For relative composition, we remember start and end
4419              positions, for the other compositions, we also remember
4420              components.  */
4421           CODING_ADD_COMPOSITION_START (coding, start - from, method);
4422           if (method != COMPOSITION_RELATIVE)
4423             {
4424               /* We must store a*/
4425               Lisp_Object val, ch;
4426
4427               val = COMPOSITION_COMPONENTS (prop);
4428               if (CONSP (val))
4429                 while (CONSP (val))
4430                   {
4431                     ch = XCAR (val), val = XCDR (val);
4432                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4433                   }
4434               else if (VECTORP (val) || STRINGP (val))
4435                 {
4436                   int len = (VECTORP (val)
4437                              ? XVECTOR (val)->size : XSTRING (val)->size);
4438                   int i;
4439                   for (i = 0; i < len; i++)
4440                     {
4441                       ch = (STRINGP (val)
4442                             ? Faref (val, make_number (i))
4443                             : XVECTOR (val)->contents[i]);
4444                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4445                     }
4446                 }
4447               else              /* INTEGERP (val) */
4448                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4449             }
4450           CODING_ADD_COMPOSITION_END (coding, end - from);
4451         }
4452       start = end;
4453     }
4454   while (start < to
4455          && find_composition (start, to, &start, &end, &prop, obj)
4456          && end <= to);
4457
4458   /* Make coding->cmp_data point to the first memory block.  */
4459   while (coding->cmp_data->prev)
4460     coding->cmp_data = coding->cmp_data->prev;
4461   coding->cmp_data_start = 0;
4462 }
4463
4464 /* Reflect the saved information about compositions to OBJ.
4465    CODING->cmp_data points to a memory block for the informaiton.  OBJ
4466    is a buffer or a string, defaults to the current buffer.  */
4467
4468 static void
4469 coding_restore_composition (coding, obj)
4470      struct coding_system *coding;
4471      Lisp_Object obj;
4472 {
4473   struct composition_data *cmp_data = coding->cmp_data;
4474
4475   if (!cmp_data)
4476     return;
4477
4478   while (cmp_data->prev)
4479     cmp_data = cmp_data->prev;
4480
4481   while (cmp_data)
4482     {
4483       int i;
4484
4485       for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4486         {
4487           int *data = cmp_data->data + i;
4488           enum composition_method method = (enum composition_method) data[3];
4489           Lisp_Object components;
4490
4491           if (method == COMPOSITION_RELATIVE)
4492             components = Qnil;
4493           else
4494             {
4495               int len = data[0] - 4, j;
4496               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4497
4498               for (j = 0; j < len; j++)
4499                 args[j] = make_number (data[4 + j]);
4500               components = (method == COMPOSITION_WITH_ALTCHARS
4501                             ? Fstring (len, args) : Fvector (len, args));
4502             }
4503           compose_text (data[1], data[2], components, Qnil, obj);
4504         }
4505       cmp_data = cmp_data->next;
4506     }
4507 }
4508
4509 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4510    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4511    coding system CODING, and return the status code of code conversion
4512    (currently, this value has no meaning).
4513
4514    How many characters (and bytes) are converted to how many
4515    characters (and bytes) are recorded in members of the structure
4516    CODING.
4517
4518    If REPLACE is nonzero, we do various things as if the original text
4519    is deleted and a new text is inserted.  See the comments in
4520    replace_range (insdel.c) to know what we are doing.
4521
4522    If REPLACE is zero, it is assumed that the source text is unibyte.
4523    Otherwize, it is assumed that the source text is multibyte.  */
4524
4525 int
4526 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4527      int from, from_byte, to, to_byte, encodep, replace;
4528      struct coding_system *coding;
4529 {
4530   int len = to - from, len_byte = to_byte - from_byte;
4531   int require, inserted, inserted_byte;
4532   int head_skip, tail_skip, total_skip = 0;
4533   Lisp_Object saved_coding_symbol;
4534   int first = 1;
4535   unsigned char *src, *dst;
4536   Lisp_Object deletion;
4537   int orig_point = PT, orig_len = len;
4538   int prev_Z;
4539   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4540
4541   coding->src_multibyte = replace && multibyte_p;
4542   coding->dst_multibyte = multibyte_p;
4543
4544   deletion = Qnil;
4545   saved_coding_symbol = Qnil;
4546
4547   if (from < PT && PT < to)
4548     {
4549       TEMP_SET_PT_BOTH (from, from_byte);
4550       orig_point = from;
4551     }
4552
4553   if (replace)
4554     {
4555       int saved_from = from;
4556
4557       prepare_to_modify_buffer (from, to, &from);
4558       if (saved_from != from)
4559         {
4560           to = from + len;
4561           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4562           len_byte = to_byte - from_byte;
4563         }
4564     }
4565
4566   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4567     {
4568       /* We must detect encoding of text and eol format.  */
4569
4570       if (from < GPT && to > GPT)
4571         move_gap_both (from, from_byte);
4572       if (coding->type == coding_type_undecided)
4573         {
4574           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4575           if (coding->type == coding_type_undecided)
4576             /* It seems that the text contains only ASCII, but we
4577                should not left it undecided because the deeper
4578                decoding routine (decode_coding) tries to detect the
4579                encodings again in vain.  */
4580             coding->type = coding_type_emacs_mule;
4581         }
4582       if (coding->eol_type == CODING_EOL_UNDECIDED)
4583         {
4584           saved_coding_symbol = coding->symbol;
4585           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4586           if (coding->eol_type == CODING_EOL_UNDECIDED)
4587             coding->eol_type = CODING_EOL_LF;
4588           /* We had better recover the original eol format if we
4589              encounter an inconsitent eol format while decoding.  */
4590           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4591         }
4592     }
4593
4594   /* Now we convert the text.  */
4595
4596   /* For encoding, we must process pre-write-conversion in advance.  */
4597   if (! inhibit_pre_post_conversion
4598       && encodep
4599       && SYMBOLP (coding->pre_write_conversion)
4600       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4601     {
4602       /* The function in pre-write-conversion may put a new text in a
4603          new buffer.  */
4604       struct buffer *prev = current_buffer;
4605       Lisp_Object new;
4606       int count = specpdl_ptr - specpdl;
4607
4608       record_unwind_protect (code_convert_region_unwind, Qnil);
4609       /* We should not call any more pre-write/post-read-conversion
4610          functions while this pre-write-conversion is running.  */
4611       inhibit_pre_post_conversion = 1;
4612       call2 (coding->pre_write_conversion,
4613              make_number (from), make_number (to));
4614       inhibit_pre_post_conversion = 0;
4615       /* Discard the unwind protect.  */
4616       specpdl_ptr--;
4617
4618       if (current_buffer != prev)
4619         {
4620           len = ZV - BEGV;
4621           new = Fcurrent_buffer ();
4622           set_buffer_internal_1 (prev);
4623           del_range_2 (from, from_byte, to, to_byte, 0);
4624           TEMP_SET_PT_BOTH (from, from_byte);
4625           insert_from_buffer (XBUFFER (new), 1, len, 0);
4626           Fkill_buffer (new);
4627           if (orig_point >= to)
4628             orig_point += len - orig_len;
4629           else if (orig_point > from)
4630             orig_point = from;
4631           orig_len = len;
4632           to = from + len;
4633           from_byte = CHAR_TO_BYTE (from);
4634           to_byte = CHAR_TO_BYTE (to);
4635           len_byte = to_byte - from_byte;
4636           TEMP_SET_PT_BOTH (from, from_byte);
4637         }
4638     }
4639
4640   if (replace)
4641     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4642
4643   if (coding->composing != COMPOSITION_DISABLED)
4644     {
4645       if (encodep)
4646         coding_save_composition (coding, from, to, Fcurrent_buffer ());
4647       else
4648         coding_allocate_composition_data (coding, from);
4649     }
4650
4651   /* Try to skip the heading and tailing ASCIIs.  */
4652   {
4653     int from_byte_orig = from_byte, to_byte_orig = to_byte;
4654
4655     if (from < GPT && GPT < to)
4656       move_gap_both (from, from_byte);
4657     SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4658     if (from_byte == to_byte
4659         && (encodep || NILP (coding->post_read_conversion))
4660         && ! CODING_REQUIRE_FLUSHING (coding))
4661       {
4662         coding->produced = len_byte;
4663         coding->produced_char = len;
4664         if (!replace)
4665           /* We must record and adjust for this new text now.  */
4666           adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4667         return 0;
4668       }
4669
4670     head_skip = from_byte - from_byte_orig;
4671     tail_skip = to_byte_orig - to_byte;
4672     total_skip = head_skip + tail_skip;
4673     from += head_skip;
4674     to -= tail_skip;
4675     len -= total_skip; len_byte -= total_skip;
4676   }
4677
4678   /* The code conversion routine can not preserve text properties for
4679      now.  So, we must remove all text properties in the region.
4680      Here, we must suppress all modification hooks.  */
4681   if (replace)
4682     {
4683       int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4684       inhibit_modification_hooks = 1;
4685       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4686       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4687     }
4688
4689   /* For converion, we must put the gap before the text in addition to
4690      making the gap larger for efficient decoding.  The required gap
4691      size starts from 2000 which is the magic number used in make_gap.
4692      But, after one batch of conversion, it will be incremented if we
4693      find that it is not enough .  */
4694   require = 2000;
4695
4696   if (GAP_SIZE  < require)
4697     make_gap (require - GAP_SIZE);
4698   move_gap_both (from, from_byte);
4699
4700   inserted = inserted_byte = 0;
4701
4702   GAP_SIZE += len_byte;
4703   ZV -= len;
4704   Z -= len;
4705   ZV_BYTE -= len_byte;
4706   Z_BYTE -= len_byte;
4707
4708   if (GPT - BEG < BEG_UNCHANGED)
4709     BEG_UNCHANGED = GPT - BEG;
4710   if (Z - GPT < END_UNCHANGED)
4711     END_UNCHANGED = Z - GPT;
4712
4713   if (!encodep && coding->src_multibyte)
4714     {
4715       /* Decoding routines expects that the source text is unibyte.
4716          We must convert 8-bit characters of multibyte form to
4717          unibyte.  */
4718       int len_byte_orig = len_byte;
4719       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4720       if (len_byte < len_byte_orig)
4721         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4722                     len_byte);
4723       coding->src_multibyte = 0;
4724     }
4725
4726   for (;;)
4727     {
4728       int result;
4729
4730       /* The buffer memory is now:
4731          +--------+converted-text+---------+-------original-text-------+---+
4732          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4733                   |<---------------------- GAP ----------------------->|  */
4734       src = GAP_END_ADDR - len_byte;
4735       dst = GPT_ADDR + inserted_byte;
4736
4737       if (encodep)
4738         result = encode_coding (coding, src, dst, len_byte, 0);
4739       else
4740         result = decode_coding (coding, src, dst, len_byte, 0);
4741
4742       /* The buffer memory is now:
4743          +--------+-------converted-text----+--+------original-text----+---+
4744          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4745                   |<---------------------- GAP ----------------------->|  */
4746
4747       inserted += coding->produced_char;
4748       inserted_byte += coding->produced;
4749       len_byte -= coding->consumed;
4750
4751       if (result == CODING_FINISH_INSUFFICIENT_CMP)
4752         {
4753           coding_allocate_composition_data (coding, from + inserted);
4754           continue;
4755         }
4756
4757       src += coding->consumed;
4758       dst += coding->produced;
4759
4760       if (result == CODING_FINISH_NORMAL)
4761         {
4762           src += len_byte;
4763           break;
4764         }
4765       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4766         {
4767           unsigned char *pend = dst, *p = pend - inserted_byte;
4768           Lisp_Object eol_type;
4769
4770           /* Encode LFs back to the original eol format (CR or CRLF).  */
4771           if (coding->eol_type == CODING_EOL_CR)
4772             {
4773               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4774             }
4775           else
4776             {
4777               int count = 0;
4778
4779               while (p < pend) if (*p++ == '\n') count++;
4780               if (src - dst < count)
4781                 {
4782                   /* We don't have sufficient room for encoding LFs
4783                      back to CRLF.  We must record converted and
4784                      not-yet-converted text back to the buffer
4785                      content, enlarge the gap, then record them out of
4786                      the buffer contents again.  */
4787                   int add = len_byte + inserted_byte;
4788
4789                   GAP_SIZE -= add;
4790                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4791                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
4792                   make_gap (count - GAP_SIZE);
4793                   GAP_SIZE += add;
4794                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4795                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4796                   /* Don't forget to update SRC, DST, and PEND.  */
4797                   src = GAP_END_ADDR - len_byte;
4798                   dst = GPT_ADDR + inserted_byte;
4799                   pend = dst;
4800                 }
4801               inserted += count;
4802               inserted_byte += count;
4803               coding->produced += count;
4804               p = dst = pend + count;
4805               while (count)
4806                 {
4807                   *--p = *--pend;
4808                   if (*p == '\n') count--, *--p = '\r';
4809                 }
4810             }
4811
4812           /* Suppress eol-format conversion in the further conversion.  */
4813           coding->eol_type = CODING_EOL_LF;
4814
4815           /* Set the coding system symbol to that for Unix-like EOL.  */
4816           eol_type = Fget (saved_coding_symbol, Qeol_type);
4817           if (VECTORP (eol_type)
4818               && XVECTOR (eol_type)->size == 3
4819               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4820             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4821           else
4822             coding->symbol = saved_coding_symbol;
4823
4824           continue;
4825         }
4826       if (len_byte <= 0)
4827         {
4828           if (coding->type != coding_type_ccl
4829               || coding->mode & CODING_MODE_LAST_BLOCK)
4830             break;
4831           coding->mode |= CODING_MODE_LAST_BLOCK;
4832           continue;
4833         }
4834       if (result == CODING_FINISH_INSUFFICIENT_SRC)
4835         {
4836           /* The source text ends in invalid codes.  Let's just
4837              make them valid buffer contents, and finish conversion.  */
4838           inserted += len_byte;
4839           inserted_byte += len_byte;
4840           while (len_byte--)
4841             *dst++ = *src++;
4842           break;
4843         }
4844       if (result == CODING_FINISH_INTERRUPT)
4845         {
4846           /* The conversion procedure was interrupted by a user.  */
4847           break;
4848         }
4849       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
4850       if (coding->consumed < 1)
4851         {
4852           /* It's quite strange to require more memory without
4853              consuming any bytes.  Perhaps CCL program bug.  */
4854           break;
4855         }
4856       if (first)
4857         {
4858           /* We have just done the first batch of conversion which was
4859              stoped because of insufficient gap.  Let's reconsider the
4860              required gap size (i.e. SRT - DST) now.
4861
4862              We have converted ORIG bytes (== coding->consumed) into
4863              NEW bytes (coding->produced).  To convert the remaining
4864              LEN bytes, we may need REQUIRE bytes of gap, where:
4865                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4866                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4867              Here, we are sure that NEW >= ORIG.  */
4868           float ratio = coding->produced - coding->consumed;
4869           ratio /= coding->consumed;
4870           require = len_byte * ratio;
4871           first = 0;
4872         }
4873       if ((src - dst) < (require + 2000))
4874         {
4875           /* See the comment above the previous call of make_gap.  */
4876           int add = len_byte + inserted_byte;
4877
4878           GAP_SIZE -= add;
4879           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4880           GPT += inserted_byte; GPT_BYTE += inserted_byte;
4881           make_gap (require + 2000);
4882           GAP_SIZE += add;
4883           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4884           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4885         }
4886     }
4887   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
4888
4889   if (encodep && coding->dst_multibyte)
4890     {
4891       /* The output is unibyte.  We must convert 8-bit characters to
4892          multibyte form.  */
4893       if (inserted_byte * 2 > GAP_SIZE)
4894         {
4895           GAP_SIZE -= inserted_byte;
4896           ZV += inserted_byte; Z += inserted_byte;
4897           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
4898           GPT += inserted_byte; GPT_BYTE += inserted_byte;
4899           make_gap (inserted_byte - GAP_SIZE);
4900           GAP_SIZE += inserted_byte;
4901           ZV -= inserted_byte; Z -= inserted_byte;
4902           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
4903           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4904         }
4905       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
4906     }
4907
4908   /* If we have shrinked the conversion area, adjust it now.  */
4909   if (total_skip > 0)
4910     {
4911       if (tail_skip > 0)
4912         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4913       inserted += total_skip; inserted_byte += total_skip;
4914       GAP_SIZE += total_skip;
4915       GPT -= head_skip; GPT_BYTE -= head_skip;
4916       ZV -= total_skip; ZV_BYTE -= total_skip;
4917       Z -= total_skip; Z_BYTE -= total_skip;
4918       from -= head_skip; from_byte -= head_skip;
4919       to += tail_skip; to_byte += tail_skip;
4920     }
4921
4922   prev_Z = Z;
4923   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4924   inserted = Z - prev_Z;
4925
4926   if (!encodep && coding->cmp_data && coding->cmp_data->used)
4927     coding_restore_composition (coding, Fcurrent_buffer ());
4928   coding_free_composition_data (coding);
4929
4930   if (! inhibit_pre_post_conversion
4931       && ! encodep && ! NILP (coding->post_read_conversion))
4932     {
4933       Lisp_Object val;
4934       int count = specpdl_ptr - specpdl;
4935
4936       if (from != PT)
4937         TEMP_SET_PT_BOTH (from, from_byte);
4938       prev_Z = Z;
4939       record_unwind_protect (code_convert_region_unwind, Qnil);
4940       /* We should not call any more pre-write/post-read-conversion
4941          functions while this post-read-conversion is running.  */
4942       inhibit_pre_post_conversion = 1;
4943       val = call1 (coding->post_read_conversion, make_number (inserted));
4944       inhibit_pre_post_conversion = 0;
4945       /* Discard the unwind protect.  */
4946       specpdl_ptr--;
4947       CHECK_NUMBER (val, 0);
4948       inserted += Z - prev_Z;
4949     }
4950
4951   if (orig_point >= from)
4952     {
4953       if (orig_point >= from + orig_len)
4954         orig_point += inserted - orig_len;
4955       else
4956         orig_point = from;
4957       TEMP_SET_PT (orig_point);
4958     }
4959
4960   if (replace)
4961     {
4962       signal_after_change (from, to - from, inserted);
4963       update_compositions (from, from + inserted, CHECK_BORDER);
4964     }
4965
4966   {
4967     coding->consumed = to_byte - from_byte;
4968     coding->consumed_char = to - from;
4969     coding->produced = inserted_byte;
4970     coding->produced_char = inserted;
4971   }
4972
4973   return 0;
4974 }
4975
4976 Lisp_Object
4977 run_pre_post_conversion_on_str (str, coding, encodep)
4978      Lisp_Object str;
4979      struct coding_system *coding;
4980      int encodep;
4981 {
4982   int count = specpdl_ptr - specpdl;
4983   struct gcpro gcpro1;
4984   struct buffer *prev = current_buffer;
4985   int multibyte = STRING_MULTIBYTE (str);
4986
4987   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4988   record_unwind_protect (code_convert_region_unwind, Qnil);
4989   GCPRO1 (str);
4990   temp_output_buffer_setup (" *code-converting-work*");
4991   set_buffer_internal (XBUFFER (Vstandard_output));
4992   /* We must insert the contents of STR as is without
4993      unibyte<->multibyte conversion.  For that, we adjust the
4994      multibyteness of the working buffer to that of STR.  */
4995   Ferase_buffer ();
4996   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
4997   insert_from_string (str, 0, 0,
4998                       XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
4999   UNGCPRO;
5000   inhibit_pre_post_conversion = 1;
5001   if (encodep)
5002     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5003   else
5004     {
5005       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5006       call1 (coding->post_read_conversion, make_number (Z - BEG));
5007     }
5008   inhibit_pre_post_conversion = 0;
5009   str = make_buffer_string (BEG, Z, 0);
5010   return unbind_to (count, str);
5011 }
5012
5013 Lisp_Object
5014 decode_coding_string (str, coding, nocopy)
5015      Lisp_Object str;
5016      struct coding_system *coding;
5017      int nocopy;
5018 {
5019   int len;
5020   char *buf;
5021   int from, to, to_byte;
5022   struct gcpro gcpro1;
5023   Lisp_Object saved_coding_symbol;
5024   int result;
5025
5026   from = 0;
5027   to = XSTRING (str)->size;
5028   to_byte = STRING_BYTES (XSTRING (str));
5029
5030   saved_coding_symbol = Qnil;
5031   if (CODING_REQUIRE_DETECTION (coding))
5032     {
5033       /* See the comments in code_convert_region.  */
5034       if (coding->type == coding_type_undecided)
5035         {
5036           detect_coding (coding, XSTRING (str)->data, to_byte);
5037           if (coding->type == coding_type_undecided)
5038             coding->type = coding_type_emacs_mule;
5039         }
5040       if (coding->eol_type == CODING_EOL_UNDECIDED)
5041         {
5042           saved_coding_symbol = coding->symbol;
5043           detect_eol (coding, XSTRING (str)->data, to_byte);
5044           if (coding->eol_type == CODING_EOL_UNDECIDED)
5045             coding->eol_type = CODING_EOL_LF;
5046           /* We had better recover the original eol format if we
5047              encounter an inconsitent eol format while decoding.  */
5048           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5049         }
5050     }
5051
5052   if (! CODING_REQUIRE_DECODING (coding))
5053     {
5054       if (!STRING_MULTIBYTE (str))
5055         {
5056           str = Fstring_as_multibyte (str);
5057           nocopy = 1;
5058         }
5059       return (nocopy ? str : Fcopy_sequence (str));
5060     }
5061
5062   if (STRING_MULTIBYTE (str))
5063     {
5064       /* Decoding routines expect the source text to be unibyte.  */
5065       str = Fstring_as_unibyte (str);
5066       nocopy = 1;
5067       coding->src_multibyte = 0;
5068     }
5069   coding->dst_multibyte = 1;
5070
5071   if (coding->composing != COMPOSITION_DISABLED)
5072     coding_allocate_composition_data (coding, from);
5073
5074   /* Try to skip the heading and tailing ASCIIs.  */
5075   {
5076     int from_orig = from;
5077
5078     SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5079                               0);
5080     if (from == to_byte)
5081       return (nocopy ? str : Fcopy_sequence (str));
5082   }
5083
5084   len = decoding_buffer_size (coding, to_byte - from);
5085   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5086   GCPRO1 (str);
5087   buf = get_conversion_buffer (len);
5088   UNGCPRO;
5089
5090   if (from > 0)
5091     bcopy (XSTRING (str)->data, buf, from);
5092   result = decode_coding (coding, XSTRING (str)->data + from,
5093                          buf + from, to_byte - from, len);
5094   if (result == CODING_FINISH_INCONSISTENT_EOL)
5095     {
5096       /* We simply try to decode the whole string again but without
5097          eol-conversion this time.  */
5098       coding->eol_type = CODING_EOL_LF;
5099       coding->symbol = saved_coding_symbol;
5100       coding_free_composition_data (coding);
5101       return decode_coding_string (str, coding, nocopy);
5102     }
5103
5104   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5105          STRING_BYTES (XSTRING (str)) - to_byte);
5106
5107   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5108   str = make_multibyte_string (buf, len + coding->produced_char,
5109                                len + coding->produced);
5110
5111   if (coding->cmp_data && coding->cmp_data->used)
5112     coding_restore_composition (coding, str);
5113   coding_free_composition_data (coding);
5114
5115   if (SYMBOLP (coding->post_read_conversion)
5116       && !NILP (Ffboundp (coding->post_read_conversion)))
5117     str = run_pre_post_conversion_on_str (str, coding, 0);
5118
5119   return str;
5120 }
5121
5122 Lisp_Object
5123 encode_coding_string (str, coding, nocopy)
5124      Lisp_Object str;
5125      struct coding_system *coding;
5126      int nocopy;
5127 {
5128   int len;
5129   char *buf;
5130   int from, to, to_byte;
5131   struct gcpro gcpro1;
5132   Lisp_Object saved_coding_symbol;
5133   int result;
5134
5135   if (SYMBOLP (coding->pre_write_conversion)
5136       && !NILP (Ffboundp (coding->pre_write_conversion)))
5137     str = run_pre_post_conversion_on_str (str, coding, 1);
5138
5139   from = 0;
5140   to = XSTRING (str)->size;
5141   to_byte = STRING_BYTES (XSTRING (str));
5142
5143   saved_coding_symbol = Qnil;
5144   if (! CODING_REQUIRE_ENCODING (coding))
5145     {
5146       if (STRING_MULTIBYTE (str))
5147         {
5148           str = Fstring_as_unibyte (str);
5149           nocopy = 1;
5150         }
5151       return (nocopy ? str : Fcopy_sequence (str));
5152     }
5153
5154   /* Encoding routines determine the multibyteness of the source text
5155      by coding->src_multibyte.  */
5156   coding->src_multibyte = STRING_MULTIBYTE (str);
5157   coding->dst_multibyte = 0;
5158
5159   if (coding->composing != COMPOSITION_DISABLED)
5160     coding_save_composition (coding, from, to, str);
5161
5162   /* Try to skip the heading and tailing ASCIIs.  */
5163   {
5164     int from_orig = from;
5165
5166     SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5167                               1);
5168     if (from == to_byte)
5169       return (nocopy ? str : Fcopy_sequence (str));
5170   }
5171
5172   len = encoding_buffer_size (coding, to_byte - from);
5173   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5174   GCPRO1 (str);
5175   buf = get_conversion_buffer (len);
5176   UNGCPRO;
5177
5178   if (from > 0)
5179     bcopy (XSTRING (str)->data, buf, from);
5180   result = encode_coding (coding, XSTRING (str)->data + from,
5181                           buf + from, to_byte - from, len);
5182   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5183          STRING_BYTES (XSTRING (str)) - to_byte);
5184
5185   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5186   str = make_unibyte_string (buf, len + coding->produced);
5187   coding_free_composition_data (coding);
5188
5189   return str;
5190 }
5191
5192 \f
5193 #ifdef emacs
5194 /*** 8. Emacs Lisp library functions ***/
5195
5196 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5197   "Return t if OBJECT is nil or a coding-system.\n\
5198 See the documentation of `make-coding-system' for information\n\
5199 about coding-system objects.")
5200   (obj)
5201      Lisp_Object obj;
5202 {
5203   if (NILP (obj))
5204     return Qt;
5205   if (!SYMBOLP (obj))
5206     return Qnil;
5207   /* Get coding-spec vector for OBJ.  */
5208   obj = Fget (obj, Qcoding_system);
5209   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5210           ? Qt : Qnil);
5211 }
5212
5213 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5214        Sread_non_nil_coding_system, 1, 1, 0,
5215   "Read a coding system from the minibuffer, prompting with string PROMPT.")
5216   (prompt)
5217      Lisp_Object prompt;
5218 {
5219   Lisp_Object val;
5220   do
5221     {
5222       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5223                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5224     }
5225   while (XSTRING (val)->size == 0);
5226   return (Fintern (val, Qnil));
5227 }
5228
5229 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5230   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5231 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5232   (prompt, default_coding_system)
5233      Lisp_Object prompt, default_coding_system;
5234 {
5235   Lisp_Object val;
5236   if (SYMBOLP (default_coding_system))
5237     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5238   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5239                           Qt, Qnil, Qcoding_system_history,
5240                           default_coding_system, Qnil);
5241   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5242 }
5243
5244 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5245        1, 1, 0,
5246   "Check validity of CODING-SYSTEM.\n\
5247 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5248 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5249 The value of property should be a vector of length 5.")
5250   (coding_system)
5251      Lisp_Object coding_system;
5252 {
5253   CHECK_SYMBOL (coding_system, 0);
5254   if (!NILP (Fcoding_system_p (coding_system)))
5255     return coding_system;
5256   while (1)
5257     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5258 }
5259 \f
5260 Lisp_Object
5261 detect_coding_system (src, src_bytes, highest)
5262      unsigned char *src;
5263      int src_bytes, highest;
5264 {
5265   int coding_mask, eol_type;
5266   Lisp_Object val, tmp;
5267   int dummy;
5268
5269   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5270   eol_type  = detect_eol_type (src, src_bytes, &dummy);
5271   if (eol_type == CODING_EOL_INCONSISTENT)
5272     eol_type = CODING_EOL_UNDECIDED;
5273
5274   if (!coding_mask)
5275     {
5276       val = Qundecided;
5277       if (eol_type != CODING_EOL_UNDECIDED)
5278         {
5279           Lisp_Object val2;
5280           val2 = Fget (Qundecided, Qeol_type);
5281           if (VECTORP (val2))
5282             val = XVECTOR (val2)->contents[eol_type];
5283         }
5284       return (highest ? val : Fcons (val, Qnil));
5285     }
5286
5287   /* At first, gather possible coding systems in VAL.  */
5288   val = Qnil;
5289   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5290     {
5291       Lisp_Object category_val, category_index;
5292
5293       category_index = Fget (XCAR (tmp), Qcoding_category_index);
5294       category_val = Fsymbol_value (XCAR (tmp));
5295       if (!NILP (category_val)
5296           && NATNUMP (category_index)
5297           && (coding_mask & (1 << XFASTINT (category_index))))
5298         {
5299           val = Fcons (category_val, val);
5300           if (highest)
5301             break;
5302         }
5303     }
5304   if (!highest)
5305     val = Fnreverse (val);
5306
5307   /* Then, replace the elements with subsidiary coding systems.  */
5308   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5309     {
5310       if (eol_type != CODING_EOL_UNDECIDED
5311           && eol_type != CODING_EOL_INCONSISTENT)
5312         {
5313           Lisp_Object eol;
5314           eol = Fget (XCAR (tmp), Qeol_type);
5315           if (VECTORP (eol))
5316             XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5317         }
5318     }
5319   return (highest ? XCAR (val) : val);
5320 }
5321
5322 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5323        2, 3, 0,
5324   "Detect coding system of the text in the region between START and END.\n\
5325 Return a list of possible coding systems ordered by priority.\n\
5326 \n\
5327 If only ASCII characters are found, it returns a list of single element\n\
5328 `undecided' or its subsidiary coding system according to a detected\n\
5329 end-of-line format.\n\
5330 \n\
5331 If optional argument HIGHEST is non-nil, return the coding system of\n\
5332 highest priority.")
5333   (start, end, highest)
5334      Lisp_Object start, end, highest;
5335 {
5336   int from, to;
5337   int from_byte, to_byte;
5338
5339   CHECK_NUMBER_COERCE_MARKER (start, 0);
5340   CHECK_NUMBER_COERCE_MARKER (end, 1);
5341
5342   validate_region (&start, &end);
5343   from = XINT (start), to = XINT (end);
5344   from_byte = CHAR_TO_BYTE (from);
5345   to_byte = CHAR_TO_BYTE (to);
5346
5347   if (from < GPT && to >= GPT)
5348     move_gap_both (to, to_byte);
5349
5350   return detect_coding_system (BYTE_POS_ADDR (from_byte),
5351                                to_byte - from_byte,
5352                                !NILP (highest));
5353 }
5354
5355 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5356        1, 2, 0,
5357   "Detect coding system of the text in STRING.\n\
5358 Return a list of possible coding systems ordered by priority.\n\
5359 \n\
5360 If only ASCII characters are found, it returns a list of single element\n\
5361 `undecided' or its subsidiary coding system according to a detected\n\
5362 end-of-line format.\n\
5363 \n\
5364 If optional argument HIGHEST is non-nil, return the coding system of\n\
5365 highest priority.")
5366   (string, highest)
5367      Lisp_Object string, highest;
5368 {
5369   CHECK_STRING (string, 0);
5370
5371   return detect_coding_system (XSTRING (string)->data,
5372                                STRING_BYTES (XSTRING (string)),
5373                                !NILP (highest));
5374 }
5375
5376 Lisp_Object
5377 code_convert_region1 (start, end, coding_system, encodep)
5378      Lisp_Object start, end, coding_system;
5379      int encodep;
5380 {
5381   struct coding_system coding;
5382   int from, to, len;
5383
5384   CHECK_NUMBER_COERCE_MARKER (start, 0);
5385   CHECK_NUMBER_COERCE_MARKER (end, 1);
5386   CHECK_SYMBOL (coding_system, 2);
5387
5388   validate_region (&start, &end);
5389   from = XFASTINT (start);
5390   to = XFASTINT (end);
5391
5392   if (NILP (coding_system))
5393     return make_number (to - from);
5394
5395   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5396     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5397
5398   coding.mode |= CODING_MODE_LAST_BLOCK;
5399   coding.src_multibyte = coding.dst_multibyte
5400     = !NILP (current_buffer->enable_multibyte_characters);
5401   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5402                        &coding, encodep, 1);
5403   Vlast_coding_system_used = coding.symbol;
5404   return make_number (coding.produced_char);
5405 }
5406
5407 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5408        3, 3, "r\nzCoding system: ",
5409   "Decode the current region by specified coding system.\n\
5410 When called from a program, takes three arguments:\n\
5411 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5412 This function sets `last-coding-system-used' to the precise coding system\n\
5413 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5414 not fully specified.)\n\
5415 It returns the length of the decoded text.")
5416   (start, end, coding_system)
5417      Lisp_Object start, end, coding_system;
5418 {
5419   return code_convert_region1 (start, end, coding_system, 0);
5420 }
5421
5422 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5423        3, 3, "r\nzCoding system: ",
5424   "Encode the current region by specified coding system.\n\
5425 When called from a program, takes three arguments:\n\
5426 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5427 This function sets `last-coding-system-used' to the precise coding system\n\
5428 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5429 not fully specified.)\n\
5430 It returns the length of the encoded text.")
5431   (start, end, coding_system)
5432      Lisp_Object start, end, coding_system;
5433 {
5434   return code_convert_region1 (start, end, coding_system, 1);
5435 }
5436
5437 Lisp_Object
5438 code_convert_string1 (string, coding_system, nocopy, encodep)
5439      Lisp_Object string, coding_system, nocopy;
5440      int encodep;
5441 {
5442   struct coding_system coding;
5443
5444   CHECK_STRING (string, 0);
5445   CHECK_SYMBOL (coding_system, 1);
5446
5447   if (NILP (coding_system))
5448     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5449
5450   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5451     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5452
5453   coding.mode |= CODING_MODE_LAST_BLOCK;
5454   string = (encodep
5455             ? encode_coding_string (string, &coding, !NILP (nocopy))
5456             : decode_coding_string (string, &coding, !NILP (nocopy)));
5457   Vlast_coding_system_used = coding.symbol;
5458
5459   return string;
5460 }
5461
5462 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5463        2, 3, 0,
5464   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5465 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5466 if the decoding operation is trivial.\n\
5467 This function sets `last-coding-system-used' to the precise coding system\n\
5468 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5469 not fully specified.)")
5470   (string, coding_system, nocopy)
5471      Lisp_Object string, coding_system, nocopy;
5472 {
5473   return code_convert_string1 (string, coding_system, nocopy, 0);
5474 }
5475
5476 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5477        2, 3, 0,
5478   "Encode STRING to CODING-SYSTEM, and return the result.\n\
5479 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5480 if the encoding operation is trivial.\n\
5481 This function sets `last-coding-system-used' to the precise coding system\n\
5482 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5483 not fully specified.)")
5484   (string, coding_system, nocopy)
5485      Lisp_Object string, coding_system, nocopy;
5486 {
5487   return code_convert_string1 (string, coding_system, nocopy, 1);
5488 }
5489
5490 /* Encode or decode STRING according to CODING_SYSTEM.
5491    Do not set Vlast_coding_system_used.
5492
5493    This function is called only from macros DECODE_FILE and
5494    ENCODE_FILE, thus we ignore character composition.  */
5495
5496 Lisp_Object
5497 code_convert_string_norecord (string, coding_system, encodep)
5498      Lisp_Object string, coding_system;
5499      int encodep;
5500 {
5501   struct coding_system coding;
5502
5503   CHECK_STRING (string, 0);
5504   CHECK_SYMBOL (coding_system, 1);
5505
5506   if (NILP (coding_system))
5507     return string;
5508
5509   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5510     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5511
5512   coding.composing = COMPOSITION_DISABLED;
5513   coding.mode |= CODING_MODE_LAST_BLOCK;
5514   return (encodep
5515           ? encode_coding_string (string, &coding, 1)
5516           : decode_coding_string (string, &coding, 1));
5517 }
5518 \f
5519 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5520   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5521 Return the corresponding character.")
5522   (code)
5523      Lisp_Object code;
5524 {
5525   unsigned char c1, c2, s1, s2;
5526   Lisp_Object val;
5527
5528   CHECK_NUMBER (code, 0);
5529   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5530   if (s1 == 0)
5531     {
5532       if (s2 < 0x80)
5533         XSETFASTINT (val, s2);
5534       else if (s2 >= 0xA0 || s2 <= 0xDF)
5535         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5536       else
5537         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5538     }
5539   else
5540     {
5541       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5542           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5543         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5544       DECODE_SJIS (s1, s2, c1, c2);
5545       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5546     }
5547   return val;
5548 }
5549
5550 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5551   "Encode a Japanese character CHAR to shift_jis encoding.\n\
5552 Return the corresponding code in SJIS.")
5553   (ch)
5554      Lisp_Object ch;
5555 {
5556   int charset, c1, c2, s1, s2;
5557   Lisp_Object val;
5558
5559   CHECK_NUMBER (ch, 0);
5560   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5561   if (charset == CHARSET_ASCII)
5562     {
5563       val = ch;
5564     }
5565   else if (charset == charset_jisx0208
5566            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5567     {
5568       ENCODE_SJIS (c1, c2, s1, s2);
5569       XSETFASTINT (val, (s1 << 8) | s2);
5570     }
5571   else if (charset == charset_katakana_jisx0201
5572            && c1 > 0x20 && c2 < 0xE0)
5573     {
5574       XSETFASTINT (val, c1 | 0x80);
5575     }
5576   else
5577     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5578   return val;
5579 }
5580
5581 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5582   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5583 Return the corresponding character.")
5584   (code)
5585      Lisp_Object code;
5586 {
5587   int charset;
5588   unsigned char b1, b2, c1, c2;
5589   Lisp_Object val;
5590
5591   CHECK_NUMBER (code, 0);
5592   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5593   if (b1 == 0)
5594     {
5595       if (b2 >= 0x80)
5596         error ("Invalid BIG5 code: %x", XFASTINT (code));
5597       val = code;
5598     }
5599   else
5600     {
5601       if ((b1 < 0xA1 || b1 > 0xFE)
5602           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5603         error ("Invalid BIG5 code: %x", XFASTINT (code));
5604       DECODE_BIG5 (b1, b2, charset, c1, c2);
5605       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5606     }
5607   return val;
5608 }
5609
5610 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5611   "Encode the Big5 character CHAR to BIG5 coding system.\n\
5612 Return the corresponding character code in Big5.")
5613   (ch)
5614      Lisp_Object ch;
5615 {
5616   int charset, c1, c2, b1, b2;
5617   Lisp_Object val;
5618
5619   CHECK_NUMBER (ch, 0);
5620   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5621   if (charset == CHARSET_ASCII)
5622     {
5623       val = ch;
5624     }
5625   else if ((charset == charset_big5_1
5626             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5627            || (charset == charset_big5_2
5628                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5629     {
5630       ENCODE_BIG5 (charset, c1, c2, b1, b2);
5631       XSETFASTINT (val, (b1 << 8) | b2);
5632     }
5633   else
5634     error ("Can't encode to Big5: %d", XFASTINT (ch));
5635   return val;
5636 }
5637 \f
5638 DEFUN ("set-terminal-coding-system-internal",
5639        Fset_terminal_coding_system_internal,
5640        Sset_terminal_coding_system_internal, 1, 1, 0, "")
5641   (coding_system)
5642      Lisp_Object coding_system;
5643 {
5644   CHECK_SYMBOL (coding_system, 0);
5645   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5646   /* We had better not send unsafe characters to terminal.  */
5647   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5648   /* Characer composition should be disabled.  */
5649   terminal_coding.composing = COMPOSITION_DISABLED;
5650   terminal_coding.src_multibyte = 1;
5651   terminal_coding.dst_multibyte = 0;
5652   return Qnil;
5653 }
5654
5655 DEFUN ("set-safe-terminal-coding-system-internal",
5656        Fset_safe_terminal_coding_system_internal,
5657        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5658   (coding_system)
5659      Lisp_Object coding_system;
5660 {
5661   CHECK_SYMBOL (coding_system, 0);
5662   setup_coding_system (Fcheck_coding_system (coding_system),
5663                        &safe_terminal_coding);
5664   /* Characer composition should be disabled.  */
5665   safe_terminal_coding.composing = COMPOSITION_DISABLED;
5666   safe_terminal_coding.src_multibyte = 1;
5667   safe_terminal_coding.dst_multibyte = 0;
5668   return Qnil;
5669 }
5670
5671 DEFUN ("terminal-coding-system",
5672        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5673   "Return coding system specified for terminal output.")
5674   ()
5675 {
5676   return terminal_coding.symbol;
5677 }
5678
5679 DEFUN ("set-keyboard-coding-system-internal",
5680        Fset_keyboard_coding_system_internal,
5681        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5682   (coding_system)
5683      Lisp_Object coding_system;
5684 {
5685   CHECK_SYMBOL (coding_system, 0);
5686   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5687   /* Characer composition should be disabled.  */
5688   keyboard_coding.composing = COMPOSITION_DISABLED;
5689   return Qnil;
5690 }
5691
5692 DEFUN ("keyboard-coding-system",
5693        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5694   "Return coding system specified for decoding keyboard input.")
5695   ()
5696 {
5697   return keyboard_coding.symbol;
5698 }
5699
5700 \f
5701 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5702        Sfind_operation_coding_system,  1, MANY, 0,
5703   "Choose a coding system for an operation based on the target name.\n\
5704 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5705 DECODING-SYSTEM is the coding system to use for decoding\n\
5706 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5707 for encoding (in case OPERATION does encoding).\n\
5708 \n\
5709 The first argument OPERATION specifies an I/O primitive:\n\
5710   For file I/O, `insert-file-contents' or `write-region'.\n\
5711   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5712   For network I/O, `open-network-stream'.\n\
5713 \n\
5714 The remaining arguments should be the same arguments that were passed\n\
5715 to the primitive.  Depending on which primitive, one of those arguments\n\
5716 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
5717 whichever argument specifies the file name is TARGET.\n\
5718 \n\
5719 TARGET has a meaning which depends on OPERATION:\n\
5720   For file I/O, TARGET is a file name.\n\
5721   For process I/O, TARGET is a process name.\n\
5722   For network I/O, TARGET is a service name or a port number\n\
5723 \n\
5724 This function looks up what specified for TARGET in,\n\
5725 `file-coding-system-alist', `process-coding-system-alist',\n\
5726 or `network-coding-system-alist' depending on OPERATION.\n\
5727 They may specify a coding system, a cons of coding systems,\n\
5728 or a function symbol to call.\n\
5729 In the last case, we call the function with one argument,\n\
5730 which is a list of all the arguments given to this function.")
5731   (nargs, args)
5732      int nargs;
5733      Lisp_Object *args;
5734 {
5735   Lisp_Object operation, target_idx, target, val;
5736   register Lisp_Object chain;
5737
5738   if (nargs < 2)
5739     error ("Too few arguments");
5740   operation = args[0];
5741   if (!SYMBOLP (operation)
5742       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5743     error ("Invalid first arguement");
5744   if (nargs < 1 + XINT (target_idx))
5745     error ("Too few arguments for operation: %s",
5746            XSYMBOL (operation)->name->data);
5747   target = args[XINT (target_idx) + 1];
5748   if (!(STRINGP (target)
5749         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5750     error ("Invalid %dth argument", XINT (target_idx) + 1);
5751
5752   chain = ((EQ (operation, Qinsert_file_contents)
5753             || EQ (operation, Qwrite_region))
5754            ? Vfile_coding_system_alist
5755            : (EQ (operation, Qopen_network_stream)
5756               ? Vnetwork_coding_system_alist
5757               : Vprocess_coding_system_alist));
5758   if (NILP (chain))
5759     return Qnil;
5760
5761   for (; CONSP (chain); chain = XCDR (chain))
5762     {
5763       Lisp_Object elt;
5764       elt = XCAR (chain);
5765
5766       if (CONSP (elt)
5767           && ((STRINGP (target)
5768                && STRINGP (XCAR (elt))
5769                && fast_string_match (XCAR (elt), target) >= 0)
5770               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5771         {
5772           val = XCDR (elt);
5773           /* Here, if VAL is both a valid coding system and a valid
5774              function symbol, we return VAL as a coding system.  */
5775           if (CONSP (val))
5776             return val;
5777           if (! SYMBOLP (val))
5778             return Qnil;
5779           if (! NILP (Fcoding_system_p (val)))
5780             return Fcons (val, val);
5781           if (! NILP (Ffboundp (val)))
5782             {
5783               val = call1 (val, Flist (nargs, args));
5784               if (CONSP (val))
5785                 return val;
5786               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5787                 return Fcons (val, val);
5788             }
5789           return Qnil;
5790         }
5791     }
5792   return Qnil;
5793 }
5794
5795 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
5796        Supdate_coding_systems_internal, 0, 0, 0,
5797   "Update internal database for ISO2022 and CCL based coding systems.\n\
5798 When values of any coding categories are changed, you must\n\
5799 call this function")
5800   ()
5801 {
5802   int i;
5803
5804   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5805     {
5806       Lisp_Object val;
5807
5808       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5809       if (!NILP (val))
5810         {
5811           if (! coding_system_table[i])
5812             coding_system_table[i] = ((struct coding_system *)
5813                                       xmalloc (sizeof (struct coding_system)));
5814           setup_coding_system (val, coding_system_table[i]);
5815         }
5816       else if (coding_system_table[i])
5817         {
5818           xfree (coding_system_table[i]);
5819           coding_system_table[i] = NULL;
5820         }
5821     }
5822
5823   return Qnil;
5824 }
5825
5826 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5827        Sset_coding_priority_internal, 0, 0, 0,
5828   "Update internal database for the current value of `coding-category-list'.\n\
5829 This function is internal use only.")
5830   ()
5831 {
5832   int i = 0, idx;
5833   Lisp_Object val;
5834
5835   val = Vcoding_category_list;
5836
5837   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5838     {
5839       if (! SYMBOLP (XCAR (val)))
5840         break;
5841       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5842       if (idx >= CODING_CATEGORY_IDX_MAX)
5843         break;
5844       coding_priorities[i++] = (1 << idx);
5845       val = XCDR (val);
5846     }
5847   /* If coding-category-list is valid and contains all coding
5848      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
5849      the following code saves Emacs from crashing.  */
5850   while (i < CODING_CATEGORY_IDX_MAX)
5851     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5852
5853   return Qnil;
5854 }
5855
5856 #endif /* emacs */
5857
5858 \f
5859 /*** 9. Post-amble ***/
5860
5861 void
5862 init_coding ()
5863 {
5864   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5865 }
5866
5867 void
5868 init_coding_once ()
5869 {
5870   int i;
5871
5872   /* Emacs' internal format specific initialize routine.  */
5873   for (i = 0; i <= 0x20; i++)
5874     emacs_code_class[i] = EMACS_control_code;
5875   emacs_code_class[0x0A] = EMACS_linefeed_code;
5876   emacs_code_class[0x0D] = EMACS_carriage_return_code;
5877   for (i = 0x21 ; i < 0x7F; i++)
5878     emacs_code_class[i] = EMACS_ascii_code;
5879   emacs_code_class[0x7F] = EMACS_control_code;
5880   for (i = 0x80; i < 0xFF; i++)
5881     emacs_code_class[i] = EMACS_invalid_code;
5882   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5883   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5884   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5885   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5886
5887   /* ISO2022 specific initialize routine.  */
5888   for (i = 0; i < 0x20; i++)
5889     iso_code_class[i] = ISO_control_0;
5890   for (i = 0x21; i < 0x7F; i++)
5891     iso_code_class[i] = ISO_graphic_plane_0;
5892   for (i = 0x80; i < 0xA0; i++)
5893     iso_code_class[i] = ISO_control_1;
5894   for (i = 0xA1; i < 0xFF; i++)
5895     iso_code_class[i] = ISO_graphic_plane_1;
5896   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5897   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5898   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5899   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5900   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5901   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5902   iso_code_class[ISO_CODE_ESC] = ISO_escape;
5903   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5904   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5905   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5906
5907   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5908
5909   setup_coding_system (Qnil, &keyboard_coding);
5910   setup_coding_system (Qnil, &terminal_coding);
5911   setup_coding_system (Qnil, &safe_terminal_coding);
5912   setup_coding_system (Qnil, &default_buffer_file_coding);
5913
5914   bzero (coding_system_table, sizeof coding_system_table);
5915
5916   bzero (ascii_skip_code, sizeof ascii_skip_code);
5917   for (i = 0; i < 128; i++)
5918     ascii_skip_code[i] = 1;
5919
5920 #if defined (MSDOS) || defined (WINDOWSNT)
5921   system_eol_type = CODING_EOL_CRLF;
5922 #else
5923   system_eol_type = CODING_EOL_LF;
5924 #endif
5925
5926   inhibit_pre_post_conversion = 0;
5927 }
5928
5929 #ifdef emacs
5930
5931 void
5932 syms_of_coding ()
5933 {
5934   Qtarget_idx = intern ("target-idx");
5935   staticpro (&Qtarget_idx);
5936
5937   Qcoding_system_history = intern ("coding-system-history");
5938   staticpro (&Qcoding_system_history);
5939   Fset (Qcoding_system_history, Qnil);
5940
5941   /* Target FILENAME is the first argument.  */
5942   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5943   /* Target FILENAME is the third argument.  */
5944   Fput (Qwrite_region, Qtarget_idx, make_number (2));
5945
5946   Qcall_process = intern ("call-process");
5947   staticpro (&Qcall_process);
5948   /* Target PROGRAM is the first argument.  */
5949   Fput (Qcall_process, Qtarget_idx, make_number (0));
5950
5951   Qcall_process_region = intern ("call-process-region");
5952   staticpro (&Qcall_process_region);
5953   /* Target PROGRAM is the third argument.  */
5954   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5955
5956   Qstart_process = intern ("start-process");
5957   staticpro (&Qstart_process);
5958   /* Target PROGRAM is the third argument.  */
5959   Fput (Qstart_process, Qtarget_idx, make_number (2));
5960
5961   Qopen_network_stream = intern ("open-network-stream");
5962   staticpro (&Qopen_network_stream);
5963   /* Target SERVICE is the fourth argument.  */
5964   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5965
5966   Qcoding_system = intern ("coding-system");
5967   staticpro (&Qcoding_system);
5968
5969   Qeol_type = intern ("eol-type");
5970   staticpro (&Qeol_type);
5971
5972   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5973   staticpro (&Qbuffer_file_coding_system);
5974
5975   Qpost_read_conversion = intern ("post-read-conversion");
5976   staticpro (&Qpost_read_conversion);
5977
5978   Qpre_write_conversion = intern ("pre-write-conversion");
5979   staticpro (&Qpre_write_conversion);
5980
5981   Qno_conversion = intern ("no-conversion");
5982   staticpro (&Qno_conversion);
5983
5984   Qundecided = intern ("undecided");
5985   staticpro (&Qundecided);
5986
5987   Qcoding_system_p = intern ("coding-system-p");
5988   staticpro (&Qcoding_system_p);
5989
5990   Qcoding_system_error = intern ("coding-system-error");
5991   staticpro (&Qcoding_system_error);
5992
5993   Fput (Qcoding_system_error, Qerror_conditions,
5994         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5995   Fput (Qcoding_system_error, Qerror_message,
5996         build_string ("Invalid coding system"));
5997
5998   Qcoding_category = intern ("coding-category");
5999   staticpro (&Qcoding_category);
6000   Qcoding_category_index = intern ("coding-category-index");
6001   staticpro (&Qcoding_category_index);
6002
6003   Vcoding_category_table
6004     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6005   staticpro (&Vcoding_category_table);
6006   {
6007     int i;
6008     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6009       {
6010         XVECTOR (Vcoding_category_table)->contents[i]
6011           = intern (coding_category_name[i]);
6012         Fput (XVECTOR (Vcoding_category_table)->contents[i],
6013               Qcoding_category_index, make_number (i));
6014       }
6015   }
6016
6017   Qtranslation_table = intern ("translation-table");
6018   staticpro (&Qtranslation_table);
6019   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6020
6021   Qtranslation_table_id = intern ("translation-table-id");
6022   staticpro (&Qtranslation_table_id);
6023
6024   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6025   staticpro (&Qtranslation_table_for_decode);
6026
6027   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6028   staticpro (&Qtranslation_table_for_encode);
6029
6030   Qsafe_charsets = intern ("safe-charsets");
6031   staticpro (&Qsafe_charsets);
6032
6033   Qvalid_codes = intern ("valid-codes");
6034   staticpro (&Qvalid_codes);
6035
6036   Qemacs_mule = intern ("emacs-mule");
6037   staticpro (&Qemacs_mule);
6038
6039   Qraw_text = intern ("raw-text");
6040   staticpro (&Qraw_text);
6041
6042   defsubr (&Scoding_system_p);
6043   defsubr (&Sread_coding_system);
6044   defsubr (&Sread_non_nil_coding_system);
6045   defsubr (&Scheck_coding_system);
6046   defsubr (&Sdetect_coding_region);
6047   defsubr (&Sdetect_coding_string);
6048   defsubr (&Sdecode_coding_region);
6049   defsubr (&Sencode_coding_region);
6050   defsubr (&Sdecode_coding_string);
6051   defsubr (&Sencode_coding_string);
6052   defsubr (&Sdecode_sjis_char);
6053   defsubr (&Sencode_sjis_char);
6054   defsubr (&Sdecode_big5_char);
6055   defsubr (&Sencode_big5_char);
6056   defsubr (&Sset_terminal_coding_system_internal);
6057   defsubr (&Sset_safe_terminal_coding_system_internal);
6058   defsubr (&Sterminal_coding_system);
6059   defsubr (&Sset_keyboard_coding_system_internal);
6060   defsubr (&Skeyboard_coding_system);
6061   defsubr (&Sfind_operation_coding_system);
6062   defsubr (&Supdate_coding_systems_internal);
6063   defsubr (&Sset_coding_priority_internal);
6064
6065   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6066     "List of coding systems.\n\
6067 \n\
6068 Do not alter the value of this variable manually.  This variable should be\n\
6069 updated by the functions `make-coding-system' and\n\
6070 `define-coding-system-alias'.");
6071   Vcoding_system_list = Qnil;
6072
6073   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6074     "Alist of coding system names.\n\
6075 Each element is one element list of coding system name.\n\
6076 This variable is given to `completing-read' as TABLE argument.\n\
6077 \n\
6078 Do not alter the value of this variable manually.  This variable should be\n\
6079 updated by the functions `make-coding-system' and\n\
6080 `define-coding-system-alias'.");
6081   Vcoding_system_alist = Qnil;
6082
6083   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6084     "List of coding-categories (symbols) ordered by priority.");
6085   {
6086     int i;
6087
6088     Vcoding_category_list = Qnil;
6089     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6090       Vcoding_category_list
6091         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6092                  Vcoding_category_list);
6093   }
6094
6095   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6096     "Specify the coding system for read operations.\n\
6097 It is useful to bind this variable with `let', but do not set it globally.\n\
6098 If the value is a coding system, it is used for decoding on read operation.\n\
6099 If not, an appropriate element is used from one of the coding system alists:\n\
6100 There are three such tables, `file-coding-system-alist',\n\
6101 `process-coding-system-alist', and `network-coding-system-alist'.");
6102   Vcoding_system_for_read = Qnil;
6103
6104   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6105     "Specify the coding system for write operations.\n\
6106 Programs bind this variable with `let', but you should not set it globally.\n\
6107 If the value is a coding system, it is used for encoding of output,\n\
6108 when writing it to a file and when sending it to a file or subprocess.\n\
6109 \n\
6110 If this does not specify a coding system, an appropriate element\n\
6111 is used from one of the coding system alists:\n\
6112 There are three such tables, `file-coding-system-alist',\n\
6113 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6114 For output to files, if the above procedure does not specify a coding system,\n\
6115 the value of `buffer-file-coding-system' is used.");
6116   Vcoding_system_for_write = Qnil;
6117
6118   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6119     "Coding system used in the latest file or process I/O.");
6120   Vlast_coding_system_used = Qnil;
6121
6122   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6123     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6124 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6125 such conversion.");
6126   inhibit_eol_conversion = 0;
6127
6128   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6129     "Non-nil means process buffer inherits coding system of process output.\n\
6130 Bind it to t if the process output is to be treated as if it were a file\n\
6131 read from some filesystem.");
6132   inherit_process_coding_system = 0;
6133
6134   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6135     "Alist to decide a coding system to use for a file I/O operation.\n\
6136 The format is ((PATTERN . VAL) ...),\n\
6137 where PATTERN is a regular expression matching a file name,\n\
6138 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6139 If VAL is a coding system, it is used for both decoding and encoding\n\
6140 the file contents.\n\
6141 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6142 and the cdr part is used for encoding.\n\
6143 If VAL is a function symbol, the function must return a coding system\n\
6144 or a cons of coding systems which are used as above.\n\
6145 \n\
6146 See also the function `find-operation-coding-system'\n\
6147 and the variable `auto-coding-alist'.");
6148   Vfile_coding_system_alist = Qnil;
6149
6150   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6151     "Alist to decide a coding system to use for a process I/O operation.\n\
6152 The format is ((PATTERN . VAL) ...),\n\
6153 where PATTERN is a regular expression matching a program name,\n\
6154 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6155 If VAL is a coding system, it is used for both decoding what received\n\
6156 from the program and encoding what sent to the program.\n\
6157 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6158 and the cdr part is used for encoding.\n\
6159 If VAL is a function symbol, the function must return a coding system\n\
6160 or a cons of coding systems which are used as above.\n\
6161 \n\
6162 See also the function `find-operation-coding-system'.");
6163   Vprocess_coding_system_alist = Qnil;
6164
6165   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6166     "Alist to decide a coding system to use for a network I/O operation.\n\
6167 The format is ((PATTERN . VAL) ...),\n\
6168 where PATTERN is a regular expression matching a network service name\n\
6169 or is a port number to connect to,\n\
6170 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6171 If VAL is a coding system, it is used for both decoding what received\n\
6172 from the network stream and encoding what sent to the network stream.\n\
6173 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6174 and the cdr part is used for encoding.\n\
6175 If VAL is a function symbol, the function must return a coding system\n\
6176 or a cons of coding systems which are used as above.\n\
6177 \n\
6178 See also the function `find-operation-coding-system'.");
6179   Vnetwork_coding_system_alist = Qnil;
6180
6181   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6182     "Coding system to use with system messages.");
6183   Vlocale_coding_system = Qnil;
6184
6185   /* The eol mnemonics are reset in startup.el system-dependently.  */
6186   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6187     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6188   eol_mnemonic_unix = build_string (":");
6189
6190   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6191     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6192   eol_mnemonic_dos = build_string ("\\");
6193
6194   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6195     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6196   eol_mnemonic_mac = build_string ("/");
6197
6198   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6199     "*String displayed in mode line when end-of-line format is not yet determined.");
6200   eol_mnemonic_undecided = build_string (":");
6201
6202   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6203     "*Non-nil enables character translation while encoding and decoding.");
6204   Venable_character_translation = Qt;
6205
6206   DEFVAR_LISP ("standard-translation-table-for-decode",
6207     &Vstandard_translation_table_for_decode,
6208     "Table for translating characters while decoding.");
6209   Vstandard_translation_table_for_decode = Qnil;
6210
6211   DEFVAR_LISP ("standard-translation-table-for-encode",
6212     &Vstandard_translation_table_for_encode,
6213     "Table for translationg characters while encoding.");
6214   Vstandard_translation_table_for_encode = Qnil;
6215
6216   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6217     "Alist of charsets vs revision numbers.\n\
6218 While encoding, if a charset (car part of an element) is found,\n\
6219 designate it with the escape sequence identifing revision (cdr part of the element).");
6220   Vcharset_revision_alist = Qnil;
6221
6222   DEFVAR_LISP ("default-process-coding-system",
6223                &Vdefault_process_coding_system,
6224     "Cons of coding systems used for process I/O by default.\n\
6225 The car part is used for decoding a process output,\n\
6226 the cdr part is used for encoding a text to be sent to a process.");
6227   Vdefault_process_coding_system = Qnil;
6228
6229   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6230     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6231 This is a vector of length 256.\n\
6232 If Nth element is non-nil, the existence of code N in a file\n\
6233 \(or output of subprocess) doesn't prevent it to be detected as\n\
6234 a coding system of ISO 2022 variant which has a flag\n\
6235 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6236 or reading output of a subprocess.\n\
6237 Only 128th through 159th elements has a meaning.");
6238   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6239
6240   DEFVAR_LISP ("select-safe-coding-system-function",
6241                &Vselect_safe_coding_system_function,
6242     "Function to call to select safe coding system for encoding a text.\n\
6243 \n\
6244 If set, this function is called to force a user to select a proper\n\
6245 coding system which can encode the text in the case that a default\n\
6246 coding system used in each operation can't encode the text.\n\
6247 \n\
6248 The default value is `select-safe-coding-system' (which see).");
6249   Vselect_safe_coding_system_function = Qnil;
6250
6251 }
6252
6253 char *
6254 emacs_strerror (error_number)
6255      int error_number;
6256 {
6257   char *str;
6258
6259   synchronize_system_messages_locale ();
6260   str = strerror (error_number);
6261
6262   if (! NILP (Vlocale_coding_system))
6263     {
6264       Lisp_Object dec = code_convert_string_norecord (build_string (str),
6265                                                       Vlocale_coding_system,
6266                                                       0);
6267       str = (char *) XSTRING (dec)->data;
6268     }
6269
6270   return str;
6271 }
6272
6273 #endif /* emacs */
6274