src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   0. General comments
  25   1. Preamble
  26   2. Emacs' internal format (emacs-mule) handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. CCL handlers
  30   6. End-of-line handlers
  31   7. C library functions
  32   8. Emacs Lisp library functions
  33   9. Post-amble
  34
  35 */
  36
  37 /*** 0. General comments ***/
  38
  39
  40 /*** GENERAL NOTE on CODING SYSTEM ***
  41
  42   Coding system is an encoding mechanism of one or more character
  43   sets.  Here's a list of coding systems which Emacs can handle.  When
  44   we say "decode", it means converting some other coding system to
  45   Emacs' internal format (emacs-mule), and when we say "encode",
  46   it means converting the coding system emacs-mule to some other
  47   coding system.
  48
  49   0. Emacs' internal format (emacs-mule)
  50
  51   Emacs itself holds a multi-lingual character in a buffer and a string
  52   in a special format.  Details are described in section 2.
  53
  54   1. ISO2022
  55
  56   The most famous coding system for multiple character sets.  X's
  57   Compound Text, various EUCs (Extended Unix Code), and coding
  58   systems used in Internet communication such as ISO-2022-JP are
  59   all variants of ISO2022.  Details are described in section 3.
  60
  61   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  62
  63   A coding system to encode character sets: ASCII, JISX0201, and
  64   JISX0208.  Widely used for PC's in Japan.  Details are described in
  65   section 4.
  66
  67   3. BIG5
  68
  69   A coding system to encode character sets: ASCII and Big5.  Widely
  70   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  71   described in section 4.  In this file, when we write "BIG5"
  72   (all uppercase), we mean the coding system, and when we write
  73   "Big5" (capitalized), we mean the character set.
  74
  75   4. Raw text
  76
  77   A coding system for a text containing random 8-bit code.  Emacs does
  78   no code conversion on such a text except for end-of-line format.
  79
  80   5. Other
  81
  82   If a user wants to read/write a text encoded in a coding system not
  83   listed above, he can supply a decoder and an encoder for it in CCL
  84   (Code Conversion Language) programs.  Emacs executes the CCL program
  85   while reading/writing.
  86
  87   Emacs represents a coding system by a Lisp symbol that has a property
  88   `coding-system'.  But, before actually using the coding system, the
  89   information about it is set in a structure of type `struct
  90   coding_system' for rapid processing.  See section 6 for more details.
  91
  92 */
  93
  94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  95
  96   How end-of-line of a text is encoded depends on a system.  For
  97   instance, Unix's format is just one byte of `line-feed' code,
  98   whereas DOS's format is two-byte sequence of `carriage-return' and
  99   `line-feed' codes.  MacOS's format is usually one byte of
 100   `carriage-return'.
 101
 102   Since text characters encoding and end-of-line encoding are
 103   independent, any coding system described above can take
 104   any format of end-of-line.  So, Emacs has information of format of
 105   end-of-line in each coding-system.  See section 6 for more details.
 106
 107 */
 108
 109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 110
 111   These functions check if a text between SRC and SRC_END is encoded
 112   in the coding system category XXX.  Each returns an integer value in
 113   which appropriate flag bits for the category XXX are set.  The flag
 114   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 115   template of these functions.  */
 116 #if 0
 117 int
 118 detect_coding_emacs_mule (src, src_end)
 119      unsigned char *src, *src_end;
 120 {
 121   ...
 122 }
 123 #endif
 124
 125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 126
 127   These functions decode SRC_BYTES length of unibyte text at SOURCE
 128   encoded in CODING to Emacs' internal format.  The resulting
 129   multibyte text goes to a place pointed to by DESTINATION, the length
 130   of which should not exceed DST_BYTES.
 131
 132   These functions set the information of original and decoded texts in
 133   the members produced, produced_char, consumed, and consumed_char of
 134   the structure *CODING.  They also set the member result to one of
 135   CODING_FINISH_XXX indicating how the decoding finished.
 136
 137   DST_BYTES zero means that source area and destination area are
 138   overlapped, which means that we can produce decoded text until it
 139   reaches the head of the not-yet-decoded source text.
 140
 141   Below is a template of these functions.  */
 142 #if 0
 143 static void
 144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148 {
 149   ...
 150 }
 151 #endif
 152
 153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 154
 155   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 156   internal multibyte format to CODING.  The resulting unibyte text
 157   goes to a place pointed to by DESTINATION, the length of which
 158   should not exceed DST_BYTES.
 159
 160   These functions set the information of original and encoded texts in
 161   the members produced, produced_char, consumed, and consumed_char of
 162   the structure *CODING.  They also set the member result to one of
 163   CODING_FINISH_XXX indicating how the encoding finished.
 164
 165   DST_BYTES zero means that source area and destination area are
 166   overlapped, which means that we can produce encoded text until it
 167   reaches the head of the not-yet-encoded source text.
 168
 169   Below is a template of these functions.  */
 170 #if 0
 171 static void
 172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 173      struct coding_system *coding;
 174      unsigned char *source, *destination;
 175      int src_bytes, dst_bytes;
 176 {
 177   ...
 178 }
 179 #endif
 180
 181 /*** COMMONLY USED MACROS ***/
 182
 183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 184    get one, two, and three bytes from the source text respectively.
 185    If there are not enough bytes in the source, they jump to
 186    `label_end_of_loop'.  The caller should set variables `coding',
 187    `src' and `src_end' to appropriate pointer in advance.  These
 188    macros are called from decoding routines `decode_coding_XXX', thus
 189    it is assumed that the source text is unibyte.  */
 190
 191 #define ONE_MORE_BYTE(c1)                                       \
 192   do {                                                          \
 193     if (src >= src_end)                                         \
 194       {                                                         \
 195         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 196         goto label_end_of_loop;                                 \
 197       }                                                         \
 198     c1 = *src++;                                                \
 199   } while (0)
 200
 201 #define TWO_MORE_BYTES(c1, c2)                                  \
 202   do {                                                          \
 203     if (src + 1 >= src_end)                                     \
 204       {                                                         \
 205         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 206         goto label_end_of_loop;                                 \
 207       }                                                         \
 208     c1 = *src++;                                                \
 209     c2 = *src++;                                                \
 210   } while (0)
 211
 212
/* Set C to the next character at the source text pointed by `src'
   and advance `src' past it.

   If no source bytes remain (`src_end - src' is zero or negative),
   set `coding->result' to CODING_FINISH_INSUFFICIENT_SRC and jump to
   `label_end_of_loop' without touching C.  The caller should set
   variables `coding', `src', `src_end', and `translation_table' to
   appropriate values in advance, and must define that label.

   This macro is used in encoding routines `encode_coding_XXX', thus
   it assumes that the source text is in multibyte form except for
   8-bit characters.  8-bit characters are in multibyte form if
   coding->src_multibyte is nonzero, else they are represented by a
   single byte: when coding->src_multibyte is zero and
   UNIBYTE_STR_AS_MULTIBYTE_P rejects the bytes at `src', C is simply
   `*src' and one byte is consumed.

   If `translation_table' is non-nil, C is passed through
   translate_char before the macro finishes.

   The expansion declares its own locals `len' and `bytes', so the
   argument C must not be an expression that mentions variables with
   those names.

   NOTE(review): only the completely-empty case jumps to
   `label_end_of_loop' here; how a truncated multibyte sequence at the
   end of the source is handled depends on STRING_CHAR_AND_LENGTH,
   which is defined elsewhere -- confirm.  */

#define ONE_MORE_CHAR(c)                                        \
  do {                                                          \
    int len = src_end - src;                                    \
    int bytes;                                                  \
    if (len <= 0)                                               \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if (coding->src_multibyte                                   \
        || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
    else                                                        \
      c = *src, bytes = 1;                                      \
    if (!NILP (translation_table))                              \
      c = translate_char (translation_table, c, 0, 0, 0);       \
    src += bytes;                                               \
  } while (0)
 241
 242
/* Produce a multibyte form of character C to `dst'.  Jump to
   `label_end_of_loop' (after setting `coding->result' to
   CODING_FINISH_INSUFFICIENT_DST) if there's not enough space at
   `dst'.  The space limit is `dst_end' when `dst_bytes' is nonzero,
   and `src' otherwise (source and destination overlap).

   If we are now in the middle of composition sequence, the decoded
   character may be ALTCHAR (for the current composition).  In that
   case, the character goes to coding->cmp_data->data instead of
   `dst'.

   Concretely:
   - C is written to `dst' (and coding->produced_char is incremented)
     when no composition is in progress, or when coding->composing is
     COMPOSITION_RELATIVE or COMPOSITION_WITH_RULE.
   - C is recorded with CODING_ADD_COMPOSITION_COMPONENT when a
     composition is in progress and coding->composing is not
     COMPOSITION_RELATIVE.  Afterwards coding->composition_rule_follows
     is nonzero unless coding->composing is COMPOSITION_WITH_ALTCHARS.
   So for COMPOSITION_WITH_RULE both actions happen.

   The caller must have `coding', `dst', `dst_end', `dst_bytes' and
   `src' in scope.  C is evaluated more than once, so it must not have
   side effects.  The expansion declares a local named `bytes'.

   This macro is used in decoding routines.  */

#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! COMPOSING_P (coding)                                          \
        || coding->composing == COMPOSITION_RELATIVE                    \
        || coding->composing == COMPOSITION_WITH_RULE)                  \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        coding->composition_rule_follows                                \
          = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
      }                                                                 \
  } while (0)
 277
 278
 279 #define EMIT_ONE_BYTE(c)                                        \
 280   do {                                                          \
 281     if (dst >= (dst_bytes ? dst_end : src))                     \
 282       {                                                         \
 283         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 284         goto label_end_of_loop;                                 \
 285       }                                                         \
 286     *dst++ = c;                                                 \
 287   } while (0)
 288
 289 #define EMIT_TWO_BYTES(c1, c2)                                  \
 290   do {                                                          \
 291     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 292       {                                                         \
 293         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 294         goto label_end_of_loop;                                 \
 295       }                                                         \
 296     *dst++ = c1, *dst++ = c2;                                   \
 297   } while (0)
 298
 299 #define EMIT_BYTES(from, to)                                    \
 300   do {                                                          \
 301     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     while (from < to)                                           \
 307       *dst++ = *from++;                                         \
 308   } while (0)
 309
 310 \f
 311 /*** 1. Preamble ***/
 312
 313 #ifdef emacs
 314 #include <config.h>
 315 #endif
 316
 317 #include <stdio.h>
 318
 319 #ifdef emacs
 320
 321 #include "lisp.h"
 322 #include "buffer.h"
 323 #include "charset.h"
 324 #include "composite.h"
 325 #include "ccl.h"
 326 #include "coding.h"
 327 #include "window.h"
 328
 329 #else  /* not emacs */
 330
 331 #include "mulelib.h"
 332
 333 #endif /* not emacs */
 334
/* Lisp objects naming coding-system related properties and values.
   NOTE(review): by the `Q' prefix these are presumably interned
   symbols set up during initialization; that code is not in this
   part of the file -- confirm.  */
Lisp_Object Qcoding_system, Qeol_type;
Lisp_Object Qbuffer_file_coding_system;
Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
Lisp_Object Qno_conversion, Qundecided;
Lisp_Object Qcoding_system_history;
Lisp_Object Qsafe_charsets;
Lisp_Object Qvalid_codes;

/* Qinsert_file_contents and Qwrite_region are defined in another
   file; the remaining objects are defined here.
   NOTE(review): presumably symbols naming the I/O primitives for
   which a coding system can be looked up -- confirm.  */
extern Lisp_Object Qinsert_file_contents, Qwrite_region;
Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
Lisp_Object Qstart_process, Qopen_network_stream;
Lisp_Object Qtarget_idx;

/* NOTE(review): by its name, presumably a Lisp function used to pick
   a coding system that can safely encode some text -- confirm.  */
Lisp_Object Vselect_safe_coding_system_function;

/* Mnemonic string for each format of end-of-line.  */
Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
/* Mnemonic string to indicate format of end-of-line is not yet
   decided.  */
Lisp_Object eol_mnemonic_undecided;

/* Format of end-of-line decided by system.  This is CODING_EOL_LF on
   Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
int system_eol_type;
 359
/* The globals below exist only when this file is built as part of
   Emacs itself (the `emacs' preprocessor symbol is defined).  */
#ifdef emacs

/* NOTE(review): presumably the list and the alist of all defined
   coding systems as seen from Lisp; they are initialized outside
   this part of the file -- confirm.  */
Lisp_Object Vcoding_system_list, Vcoding_system_alist;

/* NOTE(review): presumably the symbols for the predicate
   `coding-system-p' and the error condition `coding-system-error'
   -- confirm.  */
Lisp_Object Qcoding_system_p, Qcoding_system_error;

/* Coding system emacs-mule and raw-text are for converting only
   end-of-line format.  */
Lisp_Object Qemacs_mule, Qraw_text;

/* Coding-systems are handed between Emacs Lisp programs and C internal
   routines by the following three variables.  */
/* Coding-system for reading files and receiving data from process.  */
Lisp_Object Vcoding_system_for_read;
/* Coding-system for writing files and sending data to process.  */
Lisp_Object Vcoding_system_for_write;
/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* A vector of length 256 which contains information about special
   Latin codes (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Flag to make buffer-file-coding-system inherit from process-coding.  */
int inherit_process_coding_system;

/* Coding system to be used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system to be used to encode text for terminal display when
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of what is sent from terminal keyboard.  */
struct coding_system keyboard_coding;

/* Default coding system to be used to write a file.  */
struct coding_system default_buffer_file_coding;

/* NOTE(review): by their names, presumably alists consulted to choose
   a coding system for file, process, and network-stream I/O
   respectively -- confirm against the code that reads them.  */
Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

/* NOTE(review): presumably the coding system matching the current
   locale -- confirm.  */
Lisp_Object Vlocale_coding_system;

#endif /* emacs */
 409
/* NOTE(review): presumably the symbols `coding-category' and
   `coding-category-index' used as property names -- confirm.  */
Lisp_Object Qcoding_category, Qcoding_category_index;

/* List of symbols `coding-category-xxx' ordered by priority.  */
Lisp_Object Vcoding_category_list;

/* Table of coding categories (Lisp symbols).  */
Lisp_Object Vcoding_category_table;

/* Table of names of symbol for each coding-category.  The array is
   sized by CODING_CATEGORY_IDX_MAX and fifteen names are listed.
   NOTE(review): the order presumably has to match the
   CODING_CATEGORY_IDX_XXX constants declared elsewhere -- confirm
   before adding or reordering entries.  */
char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  "coding-category-emacs-mule",
  "coding-category-sjis",
  "coding-category-iso-7",
  "coding-category-iso-7-tight",
  "coding-category-iso-8-1",
  "coding-category-iso-8-2",
  "coding-category-iso-7-else",
  "coding-category-iso-8-else",
  "coding-category-ccl",
  "coding-category-big5",
  "coding-category-utf-8",
  "coding-category-utf-16-be",
  "coding-category-utf-16-le",
  "coding-category-raw-text",
  "coding-category-binary"
};

/* Table of pointers to coding systems corresponding to each coding
   category.  */
struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];

/* Table of coding category masks.  Nth element is a mask for a coding
   category of which priority is Nth.  */
static
int coding_priorities[CODING_CATEGORY_IDX_MAX];

/* Flag to tell if we look up translation table on character code
   conversion.  */
Lisp_Object Venable_character_translation;
/* Standard translation table to look up on decoding (reading).  */
Lisp_Object Vstandard_translation_table_for_decode;
/* Standard translation table to look up on encoding (writing).  */
Lisp_Object Vstandard_translation_table_for_encode;

/* NOTE(review): presumably symbols naming the translation-table
   properties of a coding system -- confirm.  */
Lisp_Object Qtranslation_table;
Lisp_Object Qtranslation_table_id;
Lisp_Object Qtranslation_table_for_decode;
Lisp_Object Qtranslation_table_for_encode;

/* Alist of charsets vs revision number.  */
Lisp_Object Vcharset_revision_alist;

/* Default coding systems used for process I/O.  */
Lisp_Object Vdefault_process_coding_system;

/* Global flag to tell that we can't call post-read-conversion and
   pre-write-conversion functions.  Usually the value is zero, but it
   is set to 1 temporarily while such functions are running.  This is
   to avoid infinite recursive call.  */
static int inhibit_pre_post_conversion;
 470
 471 \f
 472 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 473
 474 /* Emacs' internal format for encoding multiple character sets is a
 475    kind of multi-byte encoding, i.e. characters are encoded by
 476    variable-length sequences of one-byte codes.
 477
 478    ASCII characters and control characters (e.g. `tab', `newline') are
 479    represented by one-byte sequences which are their ASCII codes, in
 480    the range 0x00 through 0x7F.
 481
 482    8-bit characters of the range 0x80..0x9F are represented by
 483    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 484    code + 0x20).
 485
 486    8-bit characters of the range 0xA0..0xFF are represented by
 487    one-byte sequences which are their 8-bit code.
 488
 489    The other characters are represented by a sequence of `base
 490    leading-code', optional `extended leading-code', and one or two
 491    `position-code's.  The length of the sequence is determined by the
 492    base leading-code.  Leading-code takes the range 0x80 through 0x9F,
 493    whereas extended leading-code and position-code take the range 0xA0
 494    through 0xFF.  See `charset.h' for more details about leading-code
 495    and position-code.
 496
 497    --- CODE RANGE of Emacs' internal format ---
 498    character set        range
 499    -------------        -----
 500    ascii                0x00..0x7F
 501    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 502    eight-bit-graphic    0xA0..0xFF
 503    ELSE                 0x81..0x9F + [0xA0..0xFF]+
 504    ---------------------------------------------
 505
 506   */
 507
/* Code class of each possible byte value; the table has one element
   per 8-bit code.  NOTE(review): presumably filled in during
   initialization according to the ranges described above; the code
   that sets it is not in this part of the file -- confirm.  */
enum emacs_code_class_type emacs_code_class[256];
 509
 510 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 511    Check if a text is encoded in Emacs' internal format.  If it is,
 512    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 513
 514 int
 515 detect_coding_emacs_mule (src, src_end)
 516       unsigned char *src, *src_end;
 517 {
 518   unsigned char c;
 519   int composing = 0;
 520   /* Dummy for ONE_MORE_BYTE.  */
 521   struct coding_system dummy_coding;
 522   struct coding_system *coding = &dummy_coding;
 523
 524   while (1)
 525     {
 526       ONE_MORE_BYTE (c);
 527
 528       if (composing)
 529         {
 530           if (c < 0xA0)
 531             composing = 0;
 532           else if (c == 0xA0)
 533             {
 534               ONE_MORE_BYTE (c);
 535               c &= 0x7F;
 536             }
 537           else
 538             c -= 0x20;
 539         }
 540
 541       if (c < 0x20)
 542         {
 543           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 544             return 0;
 545         }
 546       else if (c >= 0x80 && c < 0xA0)
 547         {
 548           if (c == 0x80)
 549             /* Old leading code for a composite character.  */
 550             composing = 1;
 551           else
 552             {
 553               unsigned char *src_base = src - 1;
 554               int bytes;
 555
 556               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 557                                                bytes))
 558                 return 0;
 559               src = src_base + bytes;
 560             }
 561         }
 562     }
 563  label_end_of_loop:
 564   return CODING_CATEGORY_MASK_EMACS_MULE;
 565 }
 566
 567
 568 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 569
 570 static void
 571 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 572      struct coding_system *coding;
 573      unsigned char *source, *destination;
 574      int src_bytes, dst_bytes;
 575 {
 576   unsigned char *src = source;
 577   unsigned char *src_end = source + src_bytes;
 578   unsigned char *dst = destination;
 579   unsigned char *dst_end = destination + dst_bytes;
 580   /* SRC_BASE remembers the start position in source in each loop.
 581      The loop will be exited when there's not enough source code, or
 582      when there's not enough destination area to produce a
 583      character.  */
 584   unsigned char *src_base;
 585
 586   coding->produced_char = 0;
 587   while (src < src_end)
 588     {
 589       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 590       int bytes;
 591
 592       src_base = src;
 593       if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 594         {
 595           p = src;
 596           src += bytes;
 597         }
 598       else
 599         {
 600           bytes = CHAR_STRING (*src, tmp);
 601           p = tmp;
 602           src++;
 603         }
 604       if (dst + bytes >= (dst_bytes ? dst_end : src))
 605         {
 606           coding->result = CODING_FINISH_INSUFFICIENT_DST;
 607           break;
 608         }
 609       while (bytes--) *dst++ = *p++;
 610       coding->produced_char++;
 611     }
 612   coding->consumed = coding->consumed_char = src_base - source;
 613   coding->produced = dst - destination;
 614 }
 615
 616 #define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
 617   encode_eol (coding, source, destination, src_bytes, dst_bytes)
 618
 619
 620 \f
 621 /*** 3. ISO2022 handlers ***/
 622
 623 /* The following note describes the coding system ISO2022 briefly.
 624    Since the intention of this note is to help understand the
 625    functions in this file, some parts are NOT ACCURATE or OVERLY
 626    SIMPLIFIED.  For thorough understanding, please refer to the
 627    original document of ISO2022.
 628
 629    ISO2022 provides many mechanisms to encode several character sets
 630    in 7-bit and 8-bit environments.  For 7-bit environments, all text
 631    is encoded using bytes less than 128.  This may make the encoded
 632    text a little bit longer, but the text passes more easily through
 633    several gateways, some of which strip off the MSB (Most Significant Bit).
 634
 635    There are two kinds of character sets: control character set and
 636    graphic character set.  The former contains control characters such
 637    as `newline' and `escape' to provide control functions (control
 638    functions are also provided by escape sequences).  The latter
 639    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 640    two control character sets and many graphic character sets.
 641
 642    Graphic character sets are classified into one of the following
 643    four classes, according to the number of bytes (DIMENSION) and
 644    number of characters in one dimension (CHARS) of the set:
 645    - DIMENSION1_CHARS94
 646    - DIMENSION1_CHARS96
 647    - DIMENSION2_CHARS94
 648    - DIMENSION2_CHARS96
 649
 650    In addition, each character set is assigned an identification tag,
 651    unique for each set, called "final character" (denoted as <F>
 652    hereafter).  The <F> of each character set is decided by ECMA(*)
 653    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 654    (0x30..0x3F are for private use only).
 655
 656    Note (*): ECMA = European Computer Manufacturers Association
 657
 658    Here are examples of graphic character set [NAME(<F>)]:
 659         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 660         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 661         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 662         o DIMENSION2_CHARS96 -- none for the moment
 663
 664    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 665         C0 [0x00..0x1F] -- control character plane 0
 666         GL [0x20..0x7F] -- graphic character plane 0
 667         C1 [0x80..0x9F] -- control character plane 1
 668         GR [0xA0..0xFF] -- graphic character plane 1
 669
 670    A control character set is directly designated and invoked to C0 or
 671    C1 by an escape sequence.  The most common case is that:
 672    - ISO646's  control character set is designated/invoked to C0, and
 673    - ISO6429's control character set is designated/invoked to C1,
 674    and usually these designations/invocations are omitted in encoded
 675    text.  In a 7-bit environment, only C0 can be used, and a control
 676    character for C1 is encoded by an appropriate escape sequence to
 677    fit into the environment.  All control characters for C1 are
 678    defined to have corresponding escape sequences.
 679
 680    A graphic character set is at first designated to one of four
 681    graphic registers (G0 through G3), then these graphic registers are
 682    invoked to GL or GR.  These designations and invocations can be
 683    done independently.  The most common case is that G0 is invoked to
 684    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 685    these invocations and designations are omitted in encoded text.
 686    In a 7-bit environment, only GL can be used.
 687
 688    When a graphic character set of CHARS94 is invoked to GL, codes
 689    0x20 and 0x7F of the GL area work as control characters SPACE and
 690    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 691    be used.
 692
 693    There are two ways of invocation: locking-shift and single-shift.
 694    With locking-shift, the invocation lasts until the next different
 695    invocation, whereas with single-shift, the invocation affects the
 696    following character only and doesn't affect the locking-shift
 697    state.  Invocations are done by the following control characters or
 698    escape sequences:
 699
 700    ----------------------------------------------------------------------
 701    abbrev  function                  cntrl escape seq   description
 702    ----------------------------------------------------------------------
 703    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 704    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 705    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 706    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 707    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 708    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 709    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 710    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 711    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 712    ----------------------------------------------------------------------
 713    (*) These are not used by any known coding system.
 714
 715    Control characters for these functions are defined by macros
 716    ISO_CODE_XXX in `coding.h'.
 717
 718    Designations are done by the following escape sequences:
 719    ----------------------------------------------------------------------
 720    escape sequence      description
 721    ----------------------------------------------------------------------
 722    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 723    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 724    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 725    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 726    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 727    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 728    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 729    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 730    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 731    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 732    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 733    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 734    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 735    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 736    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 737    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 738    ----------------------------------------------------------------------
 739
 740    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 741    of dimension 1, chars 94, and final character <F>, etc...
 742
 743    Note (*): Although these designations are not allowed in ISO2022,
 744    Emacs accepts them on decoding, and produces them on encoding
 745    CHARS96 character sets in a coding system which is characterized as
 746    7-bit environment, non-locking-shift, and non-single-shift.
 747
 748    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 749    '(' can be omitted.  We refer to this as "short-form" hereafter.
 750
 751    Now you may notice that there are a lot of ways for encoding the
 752    same multilingual text in ISO2022.  Actually, there exist many
 753    coding systems such as Compound Text (used in X11's inter-client
 754    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 755    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
 756    localized platforms), and all of these are variants of ISO2022.
 757
 758    In addition to the above, Emacs handles two more kinds of escape
 759    sequences: ISO6429's direction specification and Emacs' private
 760    sequence for specifying character composition.
 761
 762    ISO6429's direction specification takes the following form:
 763         o CSI ']'      -- end of the current direction
 764         o CSI '0' ']'  -- end of the current direction
 765         o CSI '1' ']'  -- start of left-to-right text
 766         o CSI '2' ']'  -- start of right-to-left text
 767    The control character CSI (0x9B: control sequence introducer) is
 768    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 769
 770    Character composition specification takes the following form:
 771         o ESC '0' -- start relative composition
 772         o ESC '1' -- end composition
 773         o ESC '2' -- start rule-base composition (*)
 774         o ESC '3' -- start relative composition with alternate chars  (**)
 775         o ESC '4' -- start rule-base composition with alternate chars  (**)
 776   Since these are not standard escape sequences of any ISO standard,
 777   their use with these meanings is restricted to Emacs only.
 778
 779   (*) This form is used only in Emacs 20.5 and the older versions,
 780   but the newer versions can safely decode it.
 781   (**) This form is used only in Emacs 21.1 and the newer versions,
 782   and the older versions can't decode it.
 783
 784   Here's a list of example usages of these composition escape
 785   sequences (categorized by `enum composition_method').
 786
 787   COMPOSITION_RELATIVE:
 788         ESC 0 CHAR [ CHAR ] ESC 1
 789   COMPOSITION_WITH_RULE:
 790         ESC 2 CHAR [ RULE CHAR ] ESC 1
 791   COMPOSITION_WITH_ALTCHARS:
 792         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 793   COMPOSITION_WITH_RULE_ALTCHARS:
 794         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 795
 796 enum iso_code_class_type iso_code_class[256]; /* Indexed by byte value.  */
 797
 798 #define CHARSET_OK(idx, charset) /* Can category IDX accept CHARSET?  */ \
 799   (coding_system_table[idx] != NULL                             \
 800    && (coding_system_table[idx]->safe_charsets[charset] != 0    \
 801        || (CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION             \
 802            != CODING_SPEC_ISO_REQUESTED_DESIGNATION             \
 803                 (coding_system_table[idx], charset))))
 804
 805 #define SHIFT_OUT_OK(idx) /* Is initial designation 1 of IDX non-negative?  */ \
 806   (! (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) < 0))
 807
 808 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 809    Check whether the text between SRC and SRC_END is encoded in
 810    ISO2022.  If it is, return an integer in which any of the flag bits
 811         CODING_CATEGORY_MASK_ISO_7
 812         CODING_CATEGORY_MASK_ISO_7_TIGHT
 813         CODING_CATEGORY_MASK_ISO_8_1
 814         CODING_CATEGORY_MASK_ISO_8_2
 815         CODING_CATEGORY_MASK_ISO_7_ELSE
 816         CODING_CATEGORY_MASK_ISO_8_ELSE
 817    are set as appropriate.  If a code which should never appear in
 818    ISO2022 is found, return 0.  */
 819
 820 int
 821 detect_coding_iso2022 (src, src_end)
 822      unsigned char *src, *src_end;
 823 {
 824   /* Categories not yet ruled out, and categories positively seen.  */
 825   int mask = CODING_CATEGORY_MASK_ISO;
 826   int mask_found = 0;
 827   int reg[4], shift_out = 0, single_shifting = 0;
 828   int c, c1, charset;
 829   /* Dummy for ONE_MORE_BYTE.  */
 830   struct coding_system dummy_coding;
 831   struct coding_system *coding = &dummy_coding;
 832
 833   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 834   while (mask && src < src_end)
 835     {
 836       ONE_MORE_BYTE (c);
 837       switch (c)
 838         {
 839         case ISO_CODE_ESC:
 840           single_shifting = 0;
 841           ONE_MORE_BYTE (c);
 842           if (c >= '(' && c <= '/')
 843             {
 844               /* Designation sequence for a charset of dimension 1.  */
 845               ONE_MORE_BYTE (c1);
 846               if (c1 < ' ' || c1 >= 0x80
 847                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 848                 /* Invalid designation sequence.  Just ignore.  */
 849                 break;
 850               reg[(c - '(') % 4] = charset;
 851             }
 852           else if (c == '$')
 853             {
 854               /* Designation sequence for a charset of dimension 2.  */
 855               ONE_MORE_BYTE (c);
 856               if (c >= '@' && c <= 'B')
 857                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 858                 reg[0] = charset = iso_charset_table[1][0][c];
 859               else if (c >= '(' && c <= '/')
 860                 {
 861                   ONE_MORE_BYTE (c1);
 862                   if (c1 < ' ' || c1 >= 0x80
 863                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 864                     /* Invalid designation sequence.  Just ignore.  */
 865                     break;
 866                   reg[(c - '(') % 4] = charset;
 867                 }
 868               else
 869                 /* Invalid designation sequence.  Just ignore.  */
 870                 break;
 871             }
 872           else if (c == 'N' || c == 'O')
 873             {
 874               /* ESC <Fe> for SS2 or SS3.  */
 875               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 876               break;
 877             }
 878           else if (c >= '0' && c <= '4')
 879             {
 880               /* ESC <Fp> for start/end composition.  */
 881               mask_found |= CODING_CATEGORY_MASK_ISO;
 882               break;
 883             }
 884           else
 885             /* Invalid escape sequence.  Just ignore.  */
 886             break;
 887
 888           /* We found a valid designation sequence for CHARSET.  */
 889           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 890           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 891             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 892           else
 893             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 894           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 895             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 896           else
 897             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 898           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 899             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 900           else
 901             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 902           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 903             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 904           else
 905             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 906           break;
 907
 908         case ISO_CODE_SO:
 909           single_shifting = 0;
 910           if (shift_out == 0
 911               && (reg[1] >= 0
 912                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 913                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 914             {
 915               /* Locking shift out.  */
 916               shift_out = 1;
 917               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 918               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 919             }
 920           break;
 921
 922         case ISO_CODE_SI:
 923           single_shifting = 0;
 924           if (shift_out == 1)
 925             {
 926               /* Locking shift in.  */
 927               shift_out = 0;
 928               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 929               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 930             }
 931           break;
 932
 933         case ISO_CODE_CSI:
 934           single_shifting = 0;  /* Fall through.  */
 935         case ISO_CODE_SS2:
 936         case ISO_CODE_SS3:
 937           {
 938             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 939
 940             if (c != ISO_CODE_CSI)
 941               {
 942                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 943                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 944                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 945                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 946                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 947                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 948                 single_shifting = 1;
 949               }
 950             if (VECTORP (Vlatin_extra_code_table)
 951                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 952               {
 953                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 954                     & CODING_FLAG_ISO_LATIN_EXTRA)
 955                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 956                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 957                     & CODING_FLAG_ISO_LATIN_EXTRA)
 958                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 959               }
 960             mask &= newmask;
 961             mask_found |= newmask;
 962           }
 963           break;
 964
 965         default:
 966           if (c < 0x80)
 967             {
 968               single_shifting = 0;
 969               break;
 970             }
 971           else if (c < 0xA0)
 972             {
 973               single_shifting = 0;
 974               if (VECTORP (Vlatin_extra_code_table)
 975                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 976                 {
 977                   int newmask = 0;
 978
 979                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 980                       & CODING_FLAG_ISO_LATIN_EXTRA)
 981                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 982                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 983                       & CODING_FLAG_ISO_LATIN_EXTRA)
 984                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 985                   mask &= newmask;
 986                   mask_found |= newmask;
 987                 }
 988               else
 989                 return 0;
 990             }
 991           else
 992             {
 993               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
 994                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 995               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
 996               /* Check the length of the run of codes in the range
 997                  0xA0..0xFF that starts here.  If the run is known to
 998                  be of odd length, we exclude CODING_CATEGORY_MASK_ISO_8_2.
 999                  We can check this only when we are not single shifting.  */
1000               if (!single_shifting
1001                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1002                 {
1003                   /* The byte just read is the first one of the run.  */
1004                   int run_length = 1;
1005
1006                   /* Peek instead of reading, so that the byte ending
1007                      the run is left for the main loop to classify.  */
1008                   while (src < src_end && *src >= 0xA0)
1009                     src++, run_length++;
1010
1011                   if (run_length & 1 && src < src_end)
1012                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1013                   else
1014                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1015                 }
1016             }
1017           break;
1018         }
1019     }
1020  label_end_of_loop:
1021   return (mask & mask_found);
1022 }
1023
1024 /* Build the character of charset CHARSET whose 1st position code is
1025    C1 and whose 2nd position code is C2, and yield its character
1026    code.  When the variable `translation_table' is non-nil, yield
1027    the code translated through that table instead.  */
1028
1029 #define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
1030   (!NILP (translation_table)                                    \
1031    ? translate_char (translation_table, -1, charset, c1, c2)    \
1032    : MAKE_CHAR (charset, c1, c2))
1033
1034 /* Set designation of REG in CODING, or goto label_invalid_code.  */
1035 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1036   do {                                                                     \
1037     int charset;                                                           \
1038     /* Reject a final byte outside the range '0'..0x7F.  */                \
1039     if (final_char < '0' || final_char >= 128)                             \
1040       goto label_invalid_code;                                             \
1041     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1042                                  make_number (chars),                      \
1043                                  make_number (final_char));                \
1044     if (charset >= 0                                                       \
1045         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1046             || coding->safe_charsets[charset]))                            \
1047       {                                                                    \
1048         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1049             && reg == 0                                                    \
1050             && charset == CHARSET_ASCII)                                   \
1051           {                                                                \
1052             /* We should insert this designation sequence as is so         \
1053                that it is surely written back to a file.  */               \
1054             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1055             goto label_invalid_code;                                       \
1056           }                                                                \
1057         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1058         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1059             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1060           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1061         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1062       }                                                                    \
1063     else                                                                   \
1064       { /* CHARSET is unknown, or neither requested for REG nor safe.  */  \
1065         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1066         goto label_invalid_code;                                           \
1067       }                                                                    \
1068   } while (0)
1069
1070 /* Allocate a new composition data block recording CHAR_OFFSET, chain
1071    it after CODING's current block, and make it the current block.  */
1072
1073 void
1074 coding_allocate_composition_data (coding, char_offset)
1075      struct coding_system *coding;
1076      int char_offset;
1077 {
1078   struct composition_data *block;
1079
1080   block = (struct composition_data *) xmalloc (sizeof *block);
1081   block->used = 0;
1082   block->char_offset = char_offset;
1083   block->next = NULL;
1084   block->prev = coding->cmp_data;
1085   if (block->prev != NULL)
1086     block->prev->next = block;
1087   coding->cmp_data_start = 0;
1088   coding->cmp_data = block;
1089 }
1090
1091 /* Open a record for one composition: its start position START and METHOD.  */
1092
1093 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
1094   do {                                                          \
1095     struct composition_data *cmp_data = coding->cmp_data;       \
1096     int *data = cmp_data->data + cmp_data->used;                \
1097     coding->cmp_data_start = cmp_data->used;                    \
1098     data[0] = -1;  /* Set by CODING_ADD_COMPOSITION_END.  */    \
1099     data[1] = cmp_data->char_offset + start;  /* Start pos.  */ \
1100     data[3] = (int) method;  /* data[2] is the end pos.  */     \
1101     cmp_data->used += 4;  /* Four ints of header.  */           \
1102   } while (0)
1103
1104 /* Record the end position END and the length of the current composition.  */
1105
1106 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
1107   do { /* DATA is the record that starts at cmp_data_start.  */ \
1108     struct composition_data *cmp_data = coding->cmp_data;       \
1109     int *data = cmp_data->data + coding->cmp_data_start;        \
1110     data[0] = cmp_data->used - coding->cmp_data_start;          \
1111     data[2] = cmp_data->char_offset + end;                      \
1112   } while (0)
1113
1114 /* Append one COMPONENT (alternate character or composition rule).  */
1115
1116 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
1117   (coding->cmp_data->data[coding->cmp_data->used++] = component) /* No bounds check here.  */
1118
1119 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.  */
1120
1121 #define DECODE_COMPOSITION_START(c1)                                       \
1122   do {                                                                     \
1123     if (coding->composing == COMPOSITION_DISABLED)                         \
1124       { /* Composition is disabled: write ESC and C1 to DST.  */           \
1125         *dst++ = ISO_CODE_ESC;                                             \
1126         *dst++ = c1 & 0x7f;                                                \
1127         coding->produced_char += 2;                                        \
1128       }                                                                    \
1129     else if (!COMPOSING_P (coding))                                        \
1130       {                                                                    \
1131         /* This is surely the start of a composition.  We must be sure     \
1132            that coding->cmp_data has enough space to store the             \
1133            information about the composition.  If not, terminate the       \
1134            current decoding loop, allocate one more memory block for       \
1135            coding->cmp_data in the caller, then start the decoding         \
1136            loop again.  We can't allocate memory here directly because     \
1137            it may cause buffer/string relocation.  */                      \
1138         if (!coding->cmp_data                                              \
1139             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1140                 >= COMPOSITION_DATA_SIZE))                                 \
1141           {                                                                \
1142             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1143             goto label_end_of_loop;                                        \
1144           }                                                                \
1145         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1146                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1147                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1148                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1149         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1150                                       coding->composing);                  \
1151         coding->composition_rule_follows = 0;                              \
1152       }                                                                    \
1153     else                                                                   \
1154       {                                                                    \
1155         /* We are already handling a composition.  If the method is one    \
1156            of the following two, the codes following this escape           \
1157            sequence are actual characters stored in a buffer.  */          \
1158         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1159             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1160           {                                                                \
1161             coding->composing = COMPOSITION_RELATIVE;                      \
1162             coding->composition_rule_follows = 0;                          \
1163           }                                                                \
1164       }                                                                    \
1165   } while (0)
1166
1167 /* Handle composition end sequence ESC 1.  */
1168
1169 #define DECODE_COMPOSITION_END(c1)                                      \
1170   do {                                                                  \
1171     if (coding->composing != COMPOSITION_DISABLED)                      \
1172       { /* Finish the record of the current composition.  */            \
1173         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1174         coding->composing = COMPOSITION_NO;                             \
1175       }                                                                 \
1176     else                                                                \
1177       { /* Composition is disabled: write ESC and C1 to DST.  */        \
1178         *dst++ = ISO_CODE_ESC;                                          \
1179         *dst++ = c1;                                                    \
1180         coding->produced_char += 2;                                     \
1181       }                                                                 \
1182   } while (0)
1183
1184 /* Decode a composition rule from the byte C1 (and, in the new format,
1185    one more byte read from SRC into C2) and store the encoded rule in
1186    coding->cmp_data.  C1 is decremented by 32 in place.  */
1187
1188 #define DECODE_COMPOSITION_RULE(c1)                                     \
1189   do {                                                                  \
1190     int rule = 0;  /* Used as is if C1 fits neither format.  */         \
1191     (c1) -= 32;                                                         \
1192     if (c1 < 81)                /* old format (before ver.21) */        \
1193       {                                                                 \
1194         int gref = (c1) / 9;                                            \
1195         int nref = (c1) % 9;                                            \
1196         if (gref == 4) gref = 10;                                       \
1197         if (nref == 4) nref = 10;                                       \
1198         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1199       }                                                                 \
1200     else if (c1 < 93)           /* new format (after ver.21) */         \
1201       {                                                                 \
1202         ONE_MORE_BYTE (c2);                                             \
1203         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1204       }                                                                 \
1205     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1206     coding->composition_rule_follows = 0;                               \
1207   } while (0)
1208
1209
1210 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1211
1212 static void
1213 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1214      struct coding_system *coding;
1215      unsigned char *source, *destination;
1216      int src_bytes, dst_bytes;
1217 {
1218   unsigned char *src = source;
1219   unsigned char *src_end = source + src_bytes;
1220   unsigned char *dst = destination;
1221   unsigned char *dst_end = destination + dst_bytes;
1222   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1223   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1224   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1225   /* SRC_BASE remembers the start position in source in each loop.
1226      The loop will be exited when there's not enough source code
1227      (within macro ONE_MORE_BYTE), or when there's not enough
1228      destination area to produce a character (within macro
1229      EMIT_CHAR).  */
1230   unsigned char *src_base;
1231   int c, charset;
1232   Lisp_Object translation_table;
1233
1234   if (NILP (Venable_character_translation))
1235     translation_table = Qnil;
1236   else
1237     {
1238       translation_table = coding->translation_table_for_decode;
1239       if (NILP (translation_table))
1240         translation_table = Vstandard_translation_table_for_decode;
1241     }
1242
1243   coding->result = CODING_FINISH_NORMAL;
1244
1245   while (1)
1246     {
1247       int c1, c2;
1248
1249       src_base = src;
1250       ONE_MORE_BYTE (c1);
1251
1252       /* We produce no character or one character.  */
1253       switch (iso_code_class [c1])
1254         {
1255         case ISO_0x20_or_0x7F:
1256           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1257             {
1258               DECODE_COMPOSITION_RULE (c1);
1259               continue;
1260             }
1261           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1262             {
1263               /* This is SPACE or DEL.  */
1264               charset = CHARSET_ASCII;
1265               break;
1266             }
1267           /* This is a graphic character, we fall down ...  */
1268
1269         case ISO_graphic_plane_0:
1270           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1271             {
1272               DECODE_COMPOSITION_RULE (c1);
1273               continue;
1274             }
1275           charset = charset0;
1276           break;
1277
1278         case ISO_0xA0_or_0xFF:
1279           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1280               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1281             goto label_invalid_code;
1282           /* This is a graphic character, we fall down ... */
1283
1284         case ISO_graphic_plane_1:
1285           if (charset1 < 0)
1286             goto label_invalid_code;
1287           charset = charset1;
1288           break;
1289
1290         case ISO_control_0:
1291           if (COMPOSING_P (coding))
1292             DECODE_COMPOSITION_END ('1');
1293
1294           /* All ISO2022 control characters in this class have the
1295              same representation in Emacs internal format.  */
1296           if (c1 == '\n'
1297               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1298               && (coding->eol_type == CODING_EOL_CR
1299                   || coding->eol_type == CODING_EOL_CRLF))
1300             {
1301               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1302               goto label_end_of_loop;
1303             }
1304           charset = CHARSET_ASCII;
1305           break;
1306
1307         case ISO_control_1:
1308           if (COMPOSING_P (coding))
1309             DECODE_COMPOSITION_END ('1');
1310           goto label_invalid_code;
1311
1312         case ISO_carriage_return:
1313           if (COMPOSING_P (coding))
1314             DECODE_COMPOSITION_END ('1');
1315
1316           if (coding->eol_type == CODING_EOL_CR)
1317             c1 = '\n';
1318           else if (coding->eol_type == CODING_EOL_CRLF)
1319             {
1320               ONE_MORE_BYTE (c1);
1321               if (c1 != ISO_CODE_LF)
1322                 {
1323                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1324                     {
1325                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
1326                       goto label_end_of_loop;
1327                     }
1328                   src--;
1329                   c1 = '\r';
1330                 }
1331             }
1332           charset = CHARSET_ASCII;
1333           break;
1334
1335         case ISO_shift_out:
1336           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1337               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1338             goto label_invalid_code;
1339           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1340           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1341           continue;
1342
1343         case ISO_shift_in:
1344           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1345             goto label_invalid_code;
1346           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1347           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1348           continue;
1349
1350         case ISO_single_shift_2_7:
1351         case ISO_single_shift_2:
1352           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1353             goto label_invalid_code;
1354           /* SS2 is handled as an escape sequence of ESC 'N' */
1355           c1 = 'N';
1356           goto label_escape_sequence;
1357
1358         case ISO_single_shift_3:
1359           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1360             goto label_invalid_code;
1361           /* SS2 is handled as an escape sequence of ESC 'O' */
1362           c1 = 'O';
1363           goto label_escape_sequence;
1364
1365         case ISO_control_sequence_introducer:
1366           /* CSI is handled as an escape sequence of ESC '[' ...  */
1367           c1 = '[';
1368           goto label_escape_sequence;
1369
1370         case ISO_escape:
1371           ONE_MORE_BYTE (c1);
1372         label_escape_sequence:
1373           /* Escape sequences handled by Emacs are invocation,
1374              designation, direction specification, and character
1375              composition specification.  */
1376           switch (c1)
1377             {
1378             case '&':           /* revision of following character set */
1379               ONE_MORE_BYTE (c1);
1380               if (!(c1 >= '@' && c1 <= '~'))
1381                 goto label_invalid_code;
1382               ONE_MORE_BYTE (c1);
1383               if (c1 != ISO_CODE_ESC)
1384                 goto label_invalid_code;
1385               ONE_MORE_BYTE (c1);
1386               goto label_escape_sequence;
1387
1388             case '$':           /* designation of 2-byte character set */
1389               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1390                 goto label_invalid_code;
1391               ONE_MORE_BYTE (c1);
1392               if (c1 >= '@' && c1 <= 'B')
1393                 {       /* designation of JISX0208.1978, GB2312.1980,
1394                            or JISX0208.1980 */
1395                   DECODE_DESIGNATION (0, 2, 94, c1);
1396                 }
1397               else if (c1 >= 0x28 && c1 <= 0x2B)
1398                 {       /* designation of DIMENSION2_CHARS94 character set */
1399                   ONE_MORE_BYTE (c2);
1400                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1401                 }
1402               else if (c1 >= 0x2C && c1 <= 0x2F)
1403                 {       /* designation of DIMENSION2_CHARS96 character set */
1404                   ONE_MORE_BYTE (c2);
1405                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1406                 }
1407               else
1408                 goto label_invalid_code;
1409               /* We must update these variables now.  */
1410               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1411               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1412               continue;
1413
1414             case 'n':           /* invocation of locking-shift-2 */
1415               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1416                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1417                 goto label_invalid_code;
1418               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1419               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1420               continue;
1421
1422             case 'o':           /* invocation of locking-shift-3 */
1423               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1424                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1425                 goto label_invalid_code;
1426               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1427               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1428               continue;
1429
1430             case 'N':           /* invocation of single-shift-2 */
1431               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1432                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1433                 goto label_invalid_code;
1434               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1435               ONE_MORE_BYTE (c1);
1436               break;
1437
1438             case 'O':           /* invocation of single-shift-3 */
1439               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1440                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1441                 goto label_invalid_code;
1442               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1443               ONE_MORE_BYTE (c1);
1444               break;
1445
1446             case '0': case '2': case '3': case '4': /* start composition */
1447               DECODE_COMPOSITION_START (c1);
1448               continue;
1449
1450             case '1':           /* end composition */
1451               DECODE_COMPOSITION_END (c1);
1452               continue;
1453
1454             case '[':           /* specification of direction */
1455               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1456                 goto label_invalid_code;
1457               /* For the moment, nested direction is not supported.
1458                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1459                  left-to-right, and nonzero means right-to-left.  */
1460               ONE_MORE_BYTE (c1);
1461               switch (c1)
1462                 {
1463                 case ']':       /* end of the current direction */
1464                   coding->mode &= ~CODING_MODE_DIRECTION;
1465
1466                 case '0':       /* end of the current direction */
1467                 case '1':       /* start of left-to-right direction */
1468                   ONE_MORE_BYTE (c1);
1469                   if (c1 == ']')
1470                     coding->mode &= ~CODING_MODE_DIRECTION;
1471                   else
1472                     goto label_invalid_code;
1473                   break;
1474
1475                 case '2':       /* start of right-to-left direction */
1476                   ONE_MORE_BYTE (c1);
1477                   if (c1 == ']')
1478                     coding->mode |= CODING_MODE_DIRECTION;
1479                   else
1480                     goto label_invalid_code;
1481                   break;
1482
1483                 default:
1484                   goto label_invalid_code;
1485                 }
1486               continue;
1487
1488             default:
1489               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1490                 goto label_invalid_code;
1491               if (c1 >= 0x28 && c1 <= 0x2B)
1492                 {       /* designation of DIMENSION1_CHARS94 character set */
1493                   ONE_MORE_BYTE (c2);
1494                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1495                 }
1496               else if (c1 >= 0x2C && c1 <= 0x2F)
1497                 {       /* designation of DIMENSION1_CHARS96 character set */
1498                   ONE_MORE_BYTE (c2);
1499                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1500                 }
1501               else
1502                 goto label_invalid_code;
1503               /* We must update these variables now.  */
1504               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1505               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1506               continue;
1507             }
1508         }
1509
1510       /* Now we know CHARSET and 1st position code C1 of a character.
1511          Produce a multibyte sequence for that character while getting
1512          2nd position code C2 if necessary.  */
1513       if (CHARSET_DIMENSION (charset) == 2)
1514         {
1515           ONE_MORE_BYTE (c2);
1516           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1517             /* C2 is not in a valid range.  */
1518             goto label_invalid_code;
1519         }
1520       c = DECODE_ISO_CHARACTER (charset, c1, c2);
1521       EMIT_CHAR (c);
1522       continue;
1523
1524     label_invalid_code:
1525       coding->errors++;
1526       if (COMPOSING_P (coding))
1527         DECODE_COMPOSITION_END ('1');
1528       src = src_base;
1529       c = *src++;
1530       EMIT_CHAR (c);
1531     }
1532
1533  label_end_of_loop:
1534   coding->consumed = coding->consumed_char = src_base - source;
1535   coding->produced = dst - destination;
1536   return;
1537 }
1538
1539
1540 /* ISO2022 encoding stuff.  */
1541
1542 /*
1543    Saying just "ISO2022" is not enough for encoding; we have to
1544    specify more details.  In Emacs, each coding system that is an
1545    ISO2022 variant has the following specifications:
1546         1. Initial designation to G0 thru G3.
1547         2. Allows short-form designation?
1548         3. ASCII should be designated to G0 before control characters?
1549         4. ASCII should be designated to G0 at end of line?
1550         5. 7-bit environment or 8-bit environment?
1551         6. Use locking-shift?
1552         7. Use Single-shift?
1553    And the following two are only for Japanese:
1554         8. Use ASCII in place of JIS0201-1976-Roman?
1555         9. Use JISX0208-1983 in place of JISX0208-1978?
1556    These specifications are encoded in `coding->flags' as flag bits
1557    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1558    details.
1559 */
1560
1561 /* Produce codes (escape sequence) for designating CHARSET to graphic
1562    register REG at DST, and increment DST.  If <final-char> of CHARSET is
1563    '@', 'A', or 'B' and the coding system CODING allows, produce
1564    designation sequence of short-form.

        The bytes written are, in order:
          - ESC '&' ('@' + revision), only when the value obtained from
            CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) is less
            than 255;
          - ESC;
          - for a 1-dimension charset, one intermediate character picked
            by REG from "()*+" (94-char set) or from ",-./" (otherwise);
          - for a 2-dimension charset, '$' followed by the same kind of
            intermediate character.  For a 94-char set the intermediate
            is omitted only when CODING_FLAG_ISO_SHORT_FORM is set, REG
            is 0, and the final character is in '@'..'B';
          - the final character of CHARSET.
        Finally CHARSET is recorded as the current designation of REG.

        DST is not a macro argument; the variable `dst' of the scope
        where the macro is expanded is used.  REG indexes 4-character
        tables, so it must be in 0..3.  */
1565
1566 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1567   do {                                                                  \
1568     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1569     char *intermediate_char_94 = "()*+";                                \
1570     char *intermediate_char_96 = ",-./";                                \
1571     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
1572                                                                         \
1573     if (revision < 255)                                                 \
1574       {                                                                 \
1575         *dst++ = ISO_CODE_ESC;                                          \
1576         *dst++ = '&';                                                   \
1577         *dst++ = '@' + revision;                                        \
1578       }                                                                 \
1579     *dst++ = ISO_CODE_ESC;                                              \
1580     if (CHARSET_DIMENSION (charset) == 1)                               \
1581       {                                                                 \
1582         if (CHARSET_CHARS (charset) == 94)                              \
1583           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1584         else                                                            \
1585           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1586       }                                                                 \
1587     else                                                                \
1588       {                                                                 \
1589         *dst++ = '$';                                                   \
1590         if (CHARSET_CHARS (charset) == 94)                              \
1591           {                                                             \
1592             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1593                 || reg != 0                                             \
1594                 || final_char < '@' || final_char > 'B')                \
1595               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1596           }                                                             \
1597         else                                                            \
1598           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1599       }                                                                 \
1600     *dst++ = final_char;                                                \
1601     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1602   } while (0)
1603
1604 /* The following two macros produce codes (control character or escape
1605    sequence) for ISO2022 single-shift functions (single-shift-2 and
1606    single-shift-3).

        When CODING_FLAG_ISO_SEVEN_BITS is set in coding->flags, the
        two-byte form ESC 'N' (single-shift-2) or ESC 'O' (single-shift-3)
        is written; otherwise the single code ISO_CODE_SS2 or ISO_CODE_SS3
        is written.  In both cases the single-shifting state of CODING is
        then set to 1; ENCODE_ISO_CHARACTER_DIMENSION1 and
        ENCODE_ISO_CHARACTER_DIMENSION2 test and clear that state when they
        emit the next character.

        Neither macro takes arguments: the variables `coding' and `dst' of
        the scope where the macro is expanded are used.  */
1607
1608 #define ENCODE_SINGLE_SHIFT_2                           \
1609   do {                                                  \
1610     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1611       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1612     else                                                \
1613       *dst++ = ISO_CODE_SS2;                            \
1614     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1615   } while (0)
1616
1617 #define ENCODE_SINGLE_SHIFT_3                           \
1618   do {                                                  \
1619     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1620       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1621     else                                                \
1622       *dst++ = ISO_CODE_SS3;                            \
1623     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1624   } while (0)
1625
1626 /* The following four macros produce codes (control character or
1627    escape sequence) for ISO2022 locking-shift functions (shift-in,
1628    shift-out, locking-shift-2, and locking-shift-3).

        Each macro writes its code at `dst' and then stores, through
        CODING_SPEC_ISO_INVOCATION (coding, 0), the number of the graphic
        register that is now invoked to graphic plane 0:

          macro                    bytes written   register stored
          ENCODE_SHIFT_IN          ISO_CODE_SI     0
          ENCODE_SHIFT_OUT         ISO_CODE_SO     1
          ENCODE_LOCKING_SHIFT_2   ESC 'n'         2
          ENCODE_LOCKING_SHIFT_3   ESC 'o'         3

        None of them takes arguments: the variables `coding' and `dst' of
        the scope where the macro is expanded are used.  */
1629
1630 #define ENCODE_SHIFT_IN                         \
1631   do {                                          \
1632     *dst++ = ISO_CODE_SI;                       \
1633     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1634   } while (0)
1635
1636 #define ENCODE_SHIFT_OUT                        \
1637   do {                                          \
1638     *dst++ = ISO_CODE_SO;                       \
1639     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1640   } while (0)
1641
1642 #define ENCODE_LOCKING_SHIFT_2                  \
1643   do {                                          \
1644     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1645     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1646   } while (0)
1647
1648 #define ENCODE_LOCKING_SHIFT_3                  \
1649   do {                                          \
1650     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1651     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1652   } while (0)
1653
1654 /* Produce codes for a DIMENSION1 character whose character set is
1655    CHARSET and whose position-code is C1.  Designation and invocation
1656    sequences are also produced in advance if necessary.

        The body is a `do ... while (1)' loop which is left by `break' as
        soon as the character (or its substitute) has been written.  The
        cases are tried in this order:
          1. The single-shifting state of CODING is set: write C1 with
             the 8th bit cleared if CODING_FLAG_ISO_SEVEN_BITS is set,
             with the 8th bit set otherwise, and clear the state.
          2. CHARSET equals CODING_SPEC_ISO_PLANE_CHARSET (coding, 0):
             write C1 & 0x7F.
          3. CHARSET equals CODING_SPEC_ISO_PLANE_CHARSET (coding, 1):
             write C1 | 0x80.
          4. CODING_FLAG_ISO_SAFE is set and coding->safe_charsets has a
             zero entry for CHARSET: write
             CODING_INHIBIT_CHARACTER_SUBSTITUTION once, or twice when
             CHARSET_WIDTH (charset) is 2.
          5. Otherwise call encode_invocation_designation (), which
             advances `dst' and updates CODING, and try again.

        The variables `coding' and `dst' of the scope where the macro is
        expanded are used.  C1 is not parenthesized in the expansion, so
        pass a simple expression.  */
1657
1658 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1659   do {                                                                  \
1660     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1661       {                                                                 \
1662         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1663           *dst++ = c1 & 0x7F;                                           \
1664         else                                                            \
1665           *dst++ = c1 | 0x80;                                           \
1666         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1667         break;                                                          \
1668       }                                                                 \
1669     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1670       {                                                                 \
1671         *dst++ = c1 & 0x7F;                                             \
1672         break;                                                          \
1673       }                                                                 \
1674     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1675       {                                                                 \
1676         *dst++ = c1 | 0x80;                                             \
1677         break;                                                          \
1678       }                                                                 \
1679     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1680              && !coding->safe_charsets[charset])                        \
1681       {                                                                 \
1682         /* We should not encode this character, instead produce one or  \
1683            two `?'s.  */                                                \
1684         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1685         if (CHARSET_WIDTH (charset) == 2)                               \
1686           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1687         break;                                                          \
1688       }                                                                 \
1689     else                                                                \
1690       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1691          must invoke it, or, at first, designate it to some graphic     \
1692          register.  Then repeat the loop to actually produce the        \
1693          character.  */                                                 \
1694       dst = encode_invocation_designation (charset, coding, dst);       \
1695   } while (1)
1696
1697 /* Produce codes for a DIMENSION2 character whose character set is
1698    CHARSET and whose position-codes are C1 and C2.  Designation and
1699    invocation codes are also produced in advance if necessary.

        The body is a `do ... while (1)' loop which is left by `break' as
        soon as the character (or its substitute) has been written.  The
        cases are tried in this order:
          1. The single-shifting state of CODING is set: write C1 and C2
             with the 8th bit cleared if CODING_FLAG_ISO_SEVEN_BITS is
             set, with the 8th bit set otherwise, and clear the state.
          2. CHARSET equals CODING_SPEC_ISO_PLANE_CHARSET (coding, 0):
             write C1 & 0x7F and C2 & 0x7F.
          3. CHARSET equals CODING_SPEC_ISO_PLANE_CHARSET (coding, 1):
             write C1 | 0x80 and C2 | 0x80.
          4. CODING_FLAG_ISO_SAFE is set and coding->safe_charsets has a
             zero entry for CHARSET: write
             CODING_INHIBIT_CHARACTER_SUBSTITUTION once, or twice when
             CHARSET_WIDTH (charset) is 2.
          5. Otherwise call encode_invocation_designation (), which
             advances `dst' and updates CODING, and try again.

        The variables `coding' and `dst' of the scope where the macro is
        expanded are used.  C1 and C2 are not parenthesized in the
        expansion, so pass simple expressions.  */
1700
1701 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1702   do {                                                                  \
1703     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1704       {                                                                 \
1705         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1706           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1707         else                                                            \
1708           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1709         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1710         break;                                                          \
1711       }                                                                 \
1712     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1713       {                                                                 \
1714         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1715         break;                                                          \
1716       }                                                                 \
1717     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1718       {                                                                 \
1719         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1720         break;                                                          \
1721       }                                                                 \
1722     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1723              && !coding->safe_charsets[charset])                        \
1724       {                                                                 \
1725         /* We should not encode this character, instead produce one or  \
1726            two `?'s.  */                                                \
1727         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1728         if (CHARSET_WIDTH (charset) == 2)                               \
1729           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1730         break;                                                          \
1731       }                                                                 \
1732     else                                                                \
1733       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1734          must invoke it, or, at first, designate it to some graphic     \
1735          register.  Then repeat the loop to actually produce the        \
1736          character.  */                                                 \
1737       dst = encode_invocation_designation (charset, coding, dst);       \
1738   } while (1)
1739
     /* Produce codes for one character whose character set is CHARSET
        and whose position-codes are C1 and C2.

        If CHARSET_DEFINED_P (charset) is true, the work is passed to
        ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
        according to CHARSET_DIMENSION (charset), after these
        substitutions of the charset:
          - CHARSET_ASCII becomes charset_latin_jisx0201 when
            CODING_FLAG_ISO_USE_ROMAN is set;
          - charset_jisx0208 becomes charset_jisx0208_1978 when
            CODING_FLAG_ISO_USE_OLDJIS is set.
        Otherwise C1 is written as is, followed by C2 when C2 is not
        negative.

        The variables `coding' and `dst' of the scope where the macro is
        expanded are used.  */
1740 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
1741   do {                                                                  \
1742     int alt_charset = charset;                                          \
1743                                                                         \
1744     if (CHARSET_DEFINED_P (charset))                                    \
1745       {                                                                 \
1746         if (CHARSET_DIMENSION (charset) == 1)                           \
1747           {                                                             \
1748             if (charset == CHARSET_ASCII                                \
1749                 && coding->flags & CODING_FLAG_ISO_USE_ROMAN)           \
1750               alt_charset = charset_latin_jisx0201;                     \
1751             ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1);          \
1752           }                                                             \
1753         else                                                            \
1754           {                                                             \
1755             if (charset == charset_jisx0208                             \
1756                 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)          \
1757               alt_charset = charset_jisx0208_1978;                      \
1758             ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2);      \
1759           }                                                             \
1760       }                                                                 \
1761     else                                                                \
1762       {                                                                 \
1763         *dst++ = c1;                                                    \
1764         if (c2 >= 0)                                                    \
1765           *dst++ = c2;                                                  \
1766       }                                                                 \
1767   } while (0)
1768
1769 /* Produce designation and invocation codes at a place pointed by DST
1770    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1771    Return new DST.

        Step 1: if CHARSET is not the current designation of any graphic
        register 0..3, designate it (ENCODE_DESIGNATION) to the register
        that CODING requests for it, or to register 0 when
        CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION is returned.

        Step 2: unless that register is already the one invoked to
        graphic plane 0 or 1, emit an invocation: shift-in for register
        0, shift-out for register 1, and for registers 2 and 3 a single
        shift when CODING_FLAG_ISO_SINGLE_SHIFT is set, otherwise a
        locking shift.

        The macros used here write through the local name `dst' and read
        the local name `coding'.  */
1772
1773 unsigned char *
1774 encode_invocation_designation (charset, coding, dst)
1775      int charset;
1776      struct coding_system *coding;
1777      unsigned char *dst;
1778 {
1779   int reg;                      /* graphic register number */
1780
1781   /* At first, check designations.  */
1782   for (reg = 0; reg < 4; reg++)
1783     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1784       break;
1785
1786   if (reg >= 4)
1787     {
1788       /* CHARSET is not yet designated to any graphic registers.  */
1789       /* At first check the requested designation.  */
1790       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1791       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1792         /* Since CHARSET requests no special designation, designate it
1793            to graphic register 0.  */
1794         reg = 0;
1795
1796       ENCODE_DESIGNATION (charset, reg, coding);
1797     }
1798
1799   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1800       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1801     {
1802       /* Since the graphic register REG is not invoked to any graphic
1803          planes, invoke it to graphic plane 0.  */
         /* There is no `default' case: a REG outside 0..3 emits nothing
            here.  NOTE(review): presumably a requested designation is
            always in 0..3 -- confirm in coding.h.  */
1804       switch (reg)
1805         {
1806         case 0:                 /* graphic register 0 */
1807           ENCODE_SHIFT_IN;
1808           break;
1809
1810         case 1:                 /* graphic register 1 */
1811           ENCODE_SHIFT_OUT;
1812           break;
1813
1814         case 2:                 /* graphic register 2 */
1815           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1816             ENCODE_SINGLE_SHIFT_2;
1817           else
1818             ENCODE_LOCKING_SHIFT_2;
1819           break;
1820
1821         case 3:                 /* graphic register 3 */
1822           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1823             ENCODE_SINGLE_SHIFT_3;
1824           else
1825             ENCODE_LOCKING_SHIFT_3;
1826           break;
1827         }
1828     }
1829
1830   return dst;
1831 }
1832
1833 /* Produce 2-byte codes for encoded composition rule RULE.

        RULE is split by COMPOSITION_DECODE_RULE into GREF and NREF.  The
        first byte written at `dst' is 32 + 81 + GREF and the second is
        32 + NREF.  The variable `dst' of the scope where the macro is
        expanded is used and advanced by two.  */
1834
1835 #define ENCODE_COMPOSITION_RULE(rule)           \
1836   do {                                          \
1837     int gref, nref;                             \
1838     COMPOSITION_DECODE_RULE (rule, gref, nref); \
1839     *dst++ = 32 + 81 + gref;                    \
1840     *dst++ = 32 + nref;                         \
1841   } while (0)
1842
1843 /* Produce codes for indicating the start of a composition sequence
1844    (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
1845    which specify information about the composition.  See the comment
1846    in coding.h for the format of DATA.

        DATA[3] is stored in coding->composing and selects the sequence:
          COMPOSITION_RELATIVE        ESC '0'
          COMPOSITION_WITH_ALTCHARS   ESC '3'
          any other value             ESC '4'
        For the last two, coding->cmp_data_index is set to
        coding->cmp_data_start + 4 and coding->composition_rule_follows
        is cleared.

        The variable `dst' of the scope where the macro is expanded is
        used.  CODING and DATA are not parenthesized in the expansion.  */
1847
1848 #define ENCODE_COMPOSITION_START(coding, data)                          \
1849   do {                                                                  \
1850     coding->composing = data[3];                                        \
1851     *dst++ = ISO_CODE_ESC;                                              \
1852     if (coding->composing == COMPOSITION_RELATIVE)                      \
1853       *dst++ = '0';                                                     \
1854     else                                                                \
1855       {                                                                 \
1856         *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS        \
1857                   ? '3' : '4');                                         \
1858         coding->cmp_data_index = coding->cmp_data_start + 4;            \
1859         coding->composition_rule_follows = 0;                           \
1860       }                                                                 \
1861   } while (0)
1862
1863 /* Produce codes for indicating the end of the current composition.

        Writes ESC '1' at `dst', advances coding->cmp_data_start by
        DATA[0], and sets coding->composing to COMPOSITION_NO.  If
        cmp_data_start has then reached coding->cmp_data->used and the
        current composition data block has a `next' block, CODING is
        switched to that block and cmp_data_start restarts at 0.

        The variable `dst' of the scope where the macro is expanded is
        used.  CODING and DATA are not parenthesized in the expansion.  */
1864
1865 #define ENCODE_COMPOSITION_END(coding, data)                    \
1866   do {                                                          \
1867     *dst++ = ISO_CODE_ESC;                                      \
1868     *dst++ = '1';                                               \
1869     coding->cmp_data_start += data[0];                          \
1870     coding->composing = COMPOSITION_NO;                         \
1871     if (coding->cmp_data_start == coding->cmp_data->used        \
1872         && coding->cmp_data->next)                              \
1873       {                                                         \
1874         coding->cmp_data = coding->cmp_data->next;              \
1875         coding->cmp_data_start = 0;                             \
1876       }                                                         \
1877   } while (0)
1878
1879 /* Produce composition start sequence ESC 0.  Here, this sequence
1880    doesn't mean the start of a new composition but means that we have
1881    just produced components (alternate chars and composition rules) of
1882    the composition and the actual text follows in SRC.

        Besides writing ESC '0' at `dst', this sets coding->composing to
        COMPOSITION_RELATIVE.  The variable `dst' of the scope where the
        macro is expanded is used.  */
1883
1884 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
1885   do {                                          \
1886     *dst++ = ISO_CODE_ESC;                      \
1887     *dst++ = '0';                               \
1888     coding->composing = COMPOSITION_RELATIVE;   \
1889   } while (0)
1890
1891 /* The following three macros produce codes for indicating direction
1892    of text.

        ENCODE_CONTROL_SEQUENCE_INTRODUCER writes the two bytes ESC '['
        when the CODING_FLAG_ISO_SEVEN_BITS bit is set in coding->flags,
        and the single byte ISO_CODE_CSI otherwise.  The flag is tested
        with `&' like everywhere else in this file; comparing the whole
        flag word with `==' would select the 7-bit form only when no
        other flag happens to be set.

        ENCODE_DIRECTION_R2L writes the introducer followed by "2]" and
        ENCODE_DIRECTION_L2R writes the introducer followed by "0]".

        All three are statement macros taking no arguments (write
        `ENCODE_DIRECTION_R2L;'); they use the variables `coding' and
        `dst' of the scope where they are expanded.  */
1893 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1894   do {                                                  \
1895     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1896       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1897     else                                                \
1898       *dst++ = ISO_CODE_CSI;                            \
1899   } while (0)
1900
1901 #define ENCODE_DIRECTION_R2L    \
1902   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '2', *dst++ = ']'; } while (0)
1903
1904 #define ENCODE_DIRECTION_L2R    \
1905   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '0', *dst++ = ']'; } while (0)
1906
1907 /* Produce codes for designation and invocation to reset the graphic
1908    planes and registers to initial state.

        First, if the register invoked to graphic plane 0 is not register
        0, ENCODE_SHIFT_IN is produced.  Then, for each graphic register
        0..3 whose initial designation is not negative and differs from
        its current designation, ENCODE_DESIGNATION is produced for the
        initial charset.  Registers with a negative initial designation
        are left as they are.

        The variables `coding' and `dst' of the scope where the macro is
        expanded are used.  */
1909 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1910   do {                                                                      \
1911     int reg;                                                                \
1912     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1913       ENCODE_SHIFT_IN;                                                      \
1914     for (reg = 0; reg < 4; reg++)                                           \
1915       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1916           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1917               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1918         ENCODE_DESIGNATION                                                  \
1919           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1920   } while (0)
1921
1922 /* Produce designation sequences of charsets in the line started from
1923    SRC to a place pointed by DST, and return updated DST.
1924
1925    If the current block ends before any end-of-line, we may fail to
1926    find all the necessary designations.

        Characters are fetched with ONE_MORE_CHAR until a newline is
        fetched or four registers have been claimed.  For every character
        whose charset has a requested designation register in CODING, the
        first charset seen for that register is remembered in R[].
        Afterwards each remembered charset that differs from the current
        designation of its register is designated with
        ENCODE_DESIGNATION, which writes at DST.

        NOTE(review): ONE_MORE_CHAR is defined outside this part of the
        file.  It presumably reads through SRC up to SRC_END, applies
        TRANSLATION_TABLE, and jumps to `label_end_of_loop' when the
        source is exhausted -- confirm against its definition.  */
1927
1928 static unsigned char *
1929 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1930      struct coding_system *coding;
1931      Lisp_Object translation_table;
1932      unsigned char *src, *src_end, *dst;
1933 {
1934   int charset, c, found = 0, reg;
1935   /* Table of charsets to be designated to each graphic register.  */
1936   int r[4];
1937
       /* -1 marks a register for which no charset has been found yet.  */
1938   for (reg = 0; reg < 4; reg++)
1939     r[reg] = -1;
1940
1941   while (found < 4)
1942     {
1943       ONE_MORE_CHAR (c);
1944       if (c == '\n')
1945         break;
1946
1947       charset = CHAR_CHARSET (c);
1948       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
           /* Only the first charset seen for a register is kept.  */
1949       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1950         {
1951           found++;
1952           r[reg] = charset;
1953         }
1954     }
1955
1956  label_end_of_loop:
1957   if (found)
1958     {
           /* Skip registers that already hold the wanted charset.  */
1959       for (reg = 0; reg < 4; reg++)
1960         if (r[reg] >= 0
1961             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1962           ENCODE_DESIGNATION (r[reg], reg, coding);
1963     }
1964
1965   return dst;
1966 }
1967
1968 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the SRC_BYTES bytes of text at SOURCE in the ISO2022 based
        format described by CODING, storing the result at DESTINATION.

        On return, CODING->consumed is the number of source bytes that
        were completely processed, and CODING->produced and
        CODING->produced_char are both set to the number of bytes
        stored.  CODING->consumed_char and CODING->errors are reset to
        zero on entry; CODING->errors is incremented for each non-ASCII
        single-byte character that is stored without being encoded.  */
1969
1970 static void
1971 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1972      struct coding_system *coding;
1973      unsigned char *source, *destination;
1974      int src_bytes, dst_bytes;
1975 {
1976   unsigned char *src = source;
1977   unsigned char *src_end = source + src_bytes;
1978   unsigned char *dst = destination;
1979   unsigned char *dst_end = destination + dst_bytes;
1980   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1981      from DST_END to assure overflow checking is necessary only at the
1982      head of loop.  */
1983   unsigned char *adjusted_dst_end = dst_end - 19;
1984   /* SRC_BASE remembers the start position in source in each loop.
1985      The loop will be exited when there's not enough source text to
1986      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1987      there's not enough destination area to produce encoded codes
1988      (within macro EMIT_BYTES).  */
1989   unsigned char *src_base;
1990   int c;
1991   Lisp_Object translation_table;
1992
       /* Choose the translation table: none when character translation
          is disabled, otherwise the one of CODING, falling back to the
          standard table for encoding.  */
1993   if (NILP (Venable_character_translation))
1994     translation_table = Qnil;
1995   else
1996     {
1997       translation_table = coding->translation_table_for_encode;
1998       if (NILP (translation_table))
1999         translation_table = Vstandard_translation_table_for_encode;
2000     }
2001
2002   coding->consumed_char = 0;
2003   coding->errors = 0;
2004   while (1)
2005     {
2006       int charset, c1, c2;
2007
2008       src_base = src;
2009
           /* Stop while there is still room for the output of one more
              iteration.  When DST_BYTES is zero the limit is taken
              relative to SRC instead of DST_END -- presumably the output
              is then written into the area the source is read from;
              TODO confirm against the callers.  */
2010       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2011         {
2012           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2013           break;
2014         }
2015
2016       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2017           && CODING_SPEC_ISO_BOL (coding))
2018         {
2019           /* We have to produce designation sequences if any now.  */
2020           dst = encode_designation_at_bol (coding, translation_table,
2021                                            src, src_end, dst);
2022           CODING_SPEC_ISO_BOL (coding) = 0;
2023         }
2024
2025       /* Check composition start and end.  */
2026       if (coding->composing != COMPOSITION_DISABLED
2027           && coding->cmp_data_start < coding->cmp_data->used)
2028         {
2029           struct composition_data *cmp_data = coding->cmp_data;
2030           int *data = cmp_data->data + coding->cmp_data_start;
               /* Character position of the character about to be read,
                  in the same units as the positions stored in DATA.  */
2031           int this_pos = cmp_data->char_offset + coding->consumed_char;
2032
2033           if (coding->composing == COMPOSITION_RELATIVE)
2034             {
2035               if (this_pos == data[2])
2036                 {
2037                   ENCODE_COMPOSITION_END (coding, data);
                       /* Reload CMP_DATA and DATA from CODING after the
                          macro above.  */
2038                   cmp_data = coding->cmp_data;
2039                   data = cmp_data->data + coding->cmp_data_start;
2040                 }
2041             }
2042           else if (COMPOSING_P (coding))
2043             {
2044               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2045               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2046                 /* We have consumed components of the composition.
2047                    What follows in SRC is the composition's base
2048                    text.  */
2049                 ENCODE_COMPOSITION_FAKE_START (coding);
2050               else
2051                 {
                       /* Encode the next component taken from the
                          composition data; nothing is read from SRC in
                          this iteration.  This C shadows the outer C.  */
2052                   int c = cmp_data->data[coding->cmp_data_index++];
2053                   if (coding->composition_rule_follows)
2054                     {
2055                       ENCODE_COMPOSITION_RULE (c);
2056                       coding->composition_rule_follows = 0;
2057                     }
2058                   else
2059                     {
2060                       SPLIT_CHAR (c, charset, c1, c2);
2061                       ENCODE_ISO_CHARACTER (charset, c1, c2);
2062                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2063                         coding->composition_rule_follows = 1;
2064                     }
2065                   continue;
2066                 }
2067             }
2068           if (!COMPOSING_P (coding))
2069             {
2070               if (this_pos == data[1])
2071                 {
2072                   ENCODE_COMPOSITION_START (coding, data);
2073                   continue;
2074                 }
2075             }
2076         }
2077
2078       ONE_MORE_CHAR (c);
2079
2080       /* Now encode the character C.  */
2081       if (c < 0x20 || c == 0x7F)
2082         {
2083           if (c == '\r')
2084             {
2085               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2086                 {
2087                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2088                     ENCODE_RESET_PLANE_AND_REGISTER;
2089                   *dst++ = c;
                       /* NOTE(review): this `continue' skips the
                          `coding->consumed_char++' at the bottom of the
                          loop although a character was read -- confirm
                          that this is intended.  */
2090                   continue;
2091                 }
2092               /* fall through to treat '\r' as '\n' ...  */
2093               c = '\n';
2094             }
2095           if (c == '\n')
2096             {
2097               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2098                 ENCODE_RESET_PLANE_AND_REGISTER;
                   /* Forget the designations made on this line by
                      restoring the initial ones.  */
2099               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2100                 bcopy (coding->spec.iso2022.initial_designation,
2101                        coding->spec.iso2022.current_designation,
2102                        sizeof coding->spec.iso2022.initial_designation);
                   /* Produce the end-of-line sequence selected by
                      CODING->eol_type; an undecided type is treated as
                      LF.  */
2103               if (coding->eol_type == CODING_EOL_LF
2104                   || coding->eol_type == CODING_EOL_UNDECIDED)
2105                 *dst++ = ISO_CODE_LF;
2106               else if (coding->eol_type == CODING_EOL_CRLF)
2107                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2108               else
2109                 *dst++ = ISO_CODE_CR;
2110               CODING_SPEC_ISO_BOL (coding) = 1;
2111             }
2112           else
2113             {
2114               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2115                 ENCODE_RESET_PLANE_AND_REGISTER;
2116               *dst++ = c;
2117             }
2118         }
2119       else if (ASCII_BYTE_P (c))
2120         ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2121       else if (SINGLE_BYTE_CHAR_P (c))
2122         {
               /* A single-byte character that is not ASCII is stored as
                  is and counted as an error.  */
2123           *dst++ = c;
2124           coding->errors++;
2125         }
2126       else
2127         {
2128           SPLIT_CHAR (c, charset, c1, c2);
2129           ENCODE_ISO_CHARACTER (charset, c1, c2);
2130         }
2131
2132       coding->consumed_char++;
2133     }
2134
2135  label_end_of_loop:
       /* SRC_BASE, not SRC, is reported: a character that was only
          partly processed when the loop was left is not counted as
          consumed.  */
2136   coding->consumed = src_base - source;
2137   coding->produced = coding->produced_char = dst - destination;
2138 }
2139
2140 \f
2141 /*** 4. SJIS and BIG5 handlers ***/
2142
2143 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2144    quite widely.  So, for the moment, Emacs supports them in the bare
2145    C code.  But, in the future, they may be supported only by CCL.  */
2146
2147 /* SJIS is a coding system encoding three character sets: ASCII, right
2148    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2149    as is.  A character of charset katakana-jisx0201 is encoded by
2150    "position-code + 0x80".  A character of charset japanese-jisx0208
2151    is encoded in 2 bytes, but the two position-codes are divided and
2152    shifted so that they fit in the ranges below.
2153
2154    --- CODE RANGE of SJIS ---
2155    (character set)      (range)
2156    ASCII                0x00 .. 0x7F
2157    KATAKANA-JISX0201    0xA0 .. 0xDF
2158    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2159             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2160    -------------------------------
2161
2162 */
2163
2164 /* BIG5 is a coding system encoding two character sets: ASCII and
2165    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2166    character set and is encoded in two-byte.
2167
2168    --- CODE RANGE of BIG5 ---
2169    (character set)      (range)
2170    ASCII                0x00 .. 0x7F
2171    Big5 (1st byte)      0xA1 .. 0xFE
2172         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2173    --------------------------
2174
2175    Since the number of characters in Big5 is larger than maximum
2176    characters in Emacs' charset (96x96), it can't be handled as one
2177    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2178    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2179    contains frequently used characters and the latter contains less
2180    frequently used characters.  */
2181
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every macro argument is parenthesized in the expansion, so an
   arbitrary expression may be passed for an input argument.  All
   inputs are read before the position-code outputs are stored, so an
   output may name the same variable as an input, as in
   DECODE_BIG5 (c1, c2, charset, c1, c2).  Arguments are evaluated more
   than once; do not pass expressions with side effects.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 to the charset and position-codes of the Big5
   character whose bytes in BIG5 are B1 and B2.  A 2nd byte below 0x7F
   is offset by 0x40, the others by 0x62, so that the two ranges of the
   2nd byte become one contiguous range within a row.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int big5_code                                              \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        big5_code -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                     \
      }                                                                 \
    (c1) = big5_code / (0xFF - 0xA1) + 0x21;                            \
    (c2) = big5_code % (0xFF - 0xA1) + 0x21;                            \
  } while (0)

/* Inverse of DECODE_BIG5: set B1 and B2 to the bytes in BIG5 of the
   character of CHARSET whose position-codes are C1 and C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_code                                              \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_code += BIG5_SAME_ROW * (0xC9 - 0xA1);                       \
    (b1) = big5_code / BIG5_SAME_ROW + 0xA1;                            \
    (b2) = big5_code % BIG5_SAME_ROW;                                   \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2214
2215 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2216    Check if a text is encoded in SJIS.  If it is, return
2217    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2218
2219 int
2220 detect_coding_sjis (src, src_end)
2221      unsigned char *src, *src_end;
2222 {
2223   int c;
2224   /* Dummy for ONE_MORE_BYTE.  */
2225   struct coding_system dummy_coding;
2226   struct coding_system *coding = &dummy_coding;
2227
2228   while (1)
2229     {
2230       ONE_MORE_BYTE (c);
2231       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2232         {
2233           ONE_MORE_BYTE (c);
2234           if (c < 0x40)
2235             return 0;
2236         }
2237     }
2238  label_end_of_loop:
2239   return CODING_CATEGORY_MASK_SJIS;
2240 }
2241
2242 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2243    Check if a text is encoded in BIG5.  If it is, return
2244    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2245
2246 int
2247 detect_coding_big5 (src, src_end)
2248      unsigned char *src, *src_end;
2249 {
2250   int c;
2251   /* Dummy for ONE_MORE_BYTE.  */
2252   struct coding_system dummy_coding;
2253   struct coding_system *coding = &dummy_coding;
2254
2255   while (1)
2256     {
2257       ONE_MORE_BYTE (c);
2258       if (c >= 0xA1)
2259         {
2260           ONE_MORE_BYTE (c);
2261           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2262             return 0;
2263         }
2264     }
2265  label_end_of_loop:
2266   return CODING_CATEGORY_MASK_BIG5;
2267 }
2268
2269 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2270    Check if a text is encoded in UTF-8.  If it is, return
2271    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2272
2273 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2274 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2275 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2276 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2277 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2278 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2279 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2280
2281 int
2282 detect_coding_utf_8 (src, src_end)
2283      unsigned char *src, *src_end;
2284 {
2285   unsigned char c;
2286   int seq_maybe_bytes;
2287   /* Dummy for ONE_MORE_BYTE.  */
2288   struct coding_system dummy_coding;
2289   struct coding_system *coding = &dummy_coding;
2290
2291   while (1)
2292     {
2293       ONE_MORE_BYTE (c);
2294       if (UTF_8_1_OCTET_P (c))
2295         continue;
2296       else if (UTF_8_2_OCTET_LEADING_P (c))
2297         seq_maybe_bytes = 1;
2298       else if (UTF_8_3_OCTET_LEADING_P (c))
2299         seq_maybe_bytes = 2;
2300       else if (UTF_8_4_OCTET_LEADING_P (c))
2301         seq_maybe_bytes = 3;
2302       else if (UTF_8_5_OCTET_LEADING_P (c))
2303         seq_maybe_bytes = 4;
2304       else if (UTF_8_6_OCTET_LEADING_P (c))
2305         seq_maybe_bytes = 5;
2306       else
2307         return 0;
2308
2309       do
2310         {
2311           ONE_MORE_BYTE (c);
2312           if (!UTF_8_EXTRA_OCTET_P (c))
2313             return 0;
2314           seq_maybe_bytes--;
2315         }
2316       while (seq_maybe_bytes > 0);
2317     }
2318
2319  label_end_of_loop:
2320   return CODING_CATEGORY_MASK_UTF_8;
2321 }
2322
2323 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2324    Check if a text is encoded in UTF-16 by looking at its first two
2325    bytes.  If they are 0xFF 0xFE, return CODING_CATEGORY_MASK_UTF_16_LE
2326    (Little Endian); if they are 0xFE 0xFF, return
2327    CODING_CATEGORY_MASK_UTF_16_BE (Big Endian); else return 0.  */
2328
/* Nonzero if the 16-bit value VAL is one of the two values that are
   never valid in UTF-16 text.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  The
   top six bits select the surrogate block, so they are all compared;
   masking with 0xD800 alone would also accept low surrogates and
   values such as 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2338
2339 int
2340 detect_coding_utf_16 (src, src_end)
2341      unsigned char *src, *src_end;
2342 {
2343   unsigned char c1, c2;
2344   /* Dummy for TWO_MORE_BYTES.  */
2345   struct coding_system dummy_coding;
2346   struct coding_system *coding = &dummy_coding;
2347
2348   TWO_MORE_BYTES (c1, c2);
2349
2350   if ((c1 == 0xFF) && (c2 == 0xFE))
2351     return CODING_CATEGORY_MASK_UTF_16_LE;
2352   else if ((c1 == 0xFE) && (c2 == 0xFF))
2353     return CODING_CATEGORY_MASK_UTF_16_BE;
2354
2355  label_end_of_loop:
2356   return 0;
2357 }
2358
2359 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2360    If SJIS_P is 1, decode SJIS text, else decode BIG5 text.
        A byte sequence that is not a valid code is counted in
        CODING->errors and its first byte is emitted as is.  */
2361
2362 static void
2363 decode_coding_sjis_big5 (coding, source, destination,
2364                          src_bytes, dst_bytes, sjis_p)
2365      struct coding_system *coding;
2366      unsigned char *source, *destination;
2367      int src_bytes, dst_bytes;
2368      int sjis_p;
2369 {
2370   unsigned char *src = source;
2371   unsigned char *src_end = source + src_bytes;
2372   unsigned char *dst = destination;
2373   unsigned char *dst_end = destination + dst_bytes;
2374   /* SRC_BASE remembers the start position in source in each loop.
2375      The loop will be exited when there's not enough source code
2376      (within macro ONE_MORE_BYTE), or when there's not enough
2377      destination area to produce a character (within macro
2378      EMIT_CHAR).  */
2379   unsigned char *src_base;
2380   Lisp_Object translation_table;
2381
       /* Choose the translation table: none when character translation
          is disabled, otherwise the one of CODING, falling back to the
          standard table for decoding.  */
2382   if (NILP (Venable_character_translation))
2383     translation_table = Qnil;
2384   else
2385     {
2386       translation_table = coding->translation_table_for_decode;
2387       if (NILP (translation_table))
2388         translation_table = Vstandard_translation_table_for_decode;
2389     }
2390
2391   coding->produced_char = 0;
2392   while (1)
2393     {
2394       int c, charset, c1, c2;
2395
2396       src_base = src;
2397       ONE_MORE_BYTE (c1);
2398
2399       if (c1 < 0x80)
2400         {
               /* A byte below 0x80 is ASCII in both SJIS and BIG5; only
                  the end-of-line bytes need special care.  */
2401           charset = CHARSET_ASCII;
2402           if (c1 < 0x20)
2403             {
2404               if (c1 == '\r')
2405                 {
2406                   if (coding->eol_type == CODING_EOL_CRLF)
2407                     {
2408                       ONE_MORE_BYTE (c2);
2409                       if (c2 == '\n')
2410                         c1 = c2;
2411                       else if (coding->mode
2412                                & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2413                         {
2414                           coding->result = CODING_FINISH_INCONSISTENT_EOL;
2415                           goto label_end_of_loop;
2416                         }
2417                       else
2418                         /* To process C2 again, SRC is subtracted by 1.  */
2419                         src--;
2420                     }
2421                   else if (coding->eol_type == CODING_EOL_CR)
2422                     c1 = '\n';
2423                 }
2424               else if (c1 == '\n'
2425                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2426                        && (coding->eol_type == CODING_EOL_CR
2427                            || coding->eol_type == CODING_EOL_CRLF))
2428                 {
                       /* A bare '\n' is inconsistent with an end-of-line
                          format of CR or CRLF.  */
2429                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2430                   goto label_end_of_loop;
2431                 }
2432             }
2433         }
2434       else
2435         {
2436           if (sjis_p)
2437             {
2438               if (c1 >= 0xF0)
2439                 goto label_invalid_code;
2440               if (c1 < 0xA0 || c1 >= 0xE0)
2441                 {
2442                   /* SJIS -> JISX0208 */
2443                   ONE_MORE_BYTE (c2);
2444                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2445                     goto label_invalid_code;
2446                   DECODE_SJIS (c1, c2, c1, c2);
2447                   charset = charset_jisx0208;
2448                 }
2449               else
2450                 /* SJIS -> JISX0201-Kana */
2451                 charset = charset_katakana_jisx0201;
2452             }
2453           else
2454             {
2455               /* BIG5 -> Big5 */
2456               if (c1 < 0xA1 || c1 > 0xFE)
2457                 goto label_invalid_code;
2458               ONE_MORE_BYTE (c2);
2459               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2460                 goto label_invalid_code;
2461               DECODE_BIG5 (c1, c2, charset, c1, c2);
2462             }
2463         }
2464
           /* NOTE(review): C2 has not been assigned on the paths that
              read a single byte; presumably DECODE_ISO_CHARACTER does
              not use it for those charsets -- confirm.  */
2465       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2466       EMIT_CHAR (c);
2467       continue;
2468
2469     label_invalid_code:
           /* Count the error, go back to the start of this sequence and
              emit only its first byte; the bytes after it are examined
              again by the next iteration.  */
2470       coding->errors++;
2471       src = src_base;
2472       c = *src++;
2473       EMIT_CHAR (c);
2474     }
2475
2476  label_end_of_loop:
       /* SRC_BASE, not SRC, is reported, so a sequence that was only
          partly read is not counted as consumed.  */
2477   coding->consumed = coding->consumed_char = src_base - source;
2478   coding->produced = dst - destination;
2479   return;
2480 }
2481
2482 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2483    This function can encode charsets `ascii', `katakana-jisx0201',
2484    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2485    are sure that all these charsets are registered as official charset
2486    (i.e. do not have extended leading-codes).  Characters of other
2487    charsets are produced without any encoding.  If SJIS_P is 1, encode
2488    SJIS text, else encode BIG5 text.  */
2489
2490 static void
2491 encode_coding_sjis_big5 (coding, source, destination,
2492                          src_bytes, dst_bytes, sjis_p)
2493      struct coding_system *coding;
2494      unsigned char *source, *destination;
2495      int src_bytes, dst_bytes;
2496      int sjis_p;
2497 {
2498   unsigned char *src = source;
2499   unsigned char *src_end = source + src_bytes;
2500   unsigned char *dst = destination;
2501   unsigned char *dst_end = destination + dst_bytes;
2502   /* SRC_BASE remembers the start position in source in each loop.
2503      The loop will be exited when there's not enough source text to
2504      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2505      there's not enough destination area to produce encoded codes
2506      (within macro EMIT_BYTES).  */
2507   unsigned char *src_base;
2508   Lisp_Object translation_table;
2509
2510   if (NILP (Venable_character_translation))
2511     translation_table = Qnil;
2512   else
2513     {
2514       translation_table = coding->translation_table_for_decode;
2515       if (NILP (translation_table))
2516         translation_table = Vstandard_translation_table_for_decode;
2517     }
2518
2519   while (1)
2520     {
2521       int c, charset, c1, c2;
2522
2523       src_base = src;
2524       ONE_MORE_CHAR (c);
2525
2526       /* Now encode the character C.  */
2527       if (SINGLE_BYTE_CHAR_P (c))
2528         {
2529           switch (c)
2530             {
2531             case '\r':
2532               if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2533                 {
2534                   EMIT_ONE_BYTE (c);
2535                   break;
2536                 }
2537               c = '\n';
2538             case '\n':
2539               if (coding->eol_type == CODING_EOL_CRLF)
2540                 {
2541                   EMIT_TWO_BYTES ('\r', c);
2542                   break;
2543                 }
2544               else if (coding->eol_type == CODING_EOL_CR)
2545                 c = '\r';
2546             default:
2547               EMIT_ONE_BYTE (c);
2548             }
2549         }
2550       else
2551         {
2552           SPLIT_CHAR (c, charset, c1, c2);
2553           if (sjis_p)
2554             {
2555               if (charset == charset_jisx0208
2556                   || charset == charset_jisx0208_1978)
2557                 {
2558                   ENCODE_SJIS (c1, c2, c1, c2);
2559                   EMIT_TWO_BYTES (c1, c2);
2560                 }
2561               else if (charset == charset_latin_jisx0201)
2562                 EMIT_ONE_BYTE (c1);
2563               else
2564                 /* There's no way other than producing the internal
2565                    codes as is.  */
2566                 EMIT_BYTES (src_base, src);
2567             }
2568           else
2569             {
2570               if (charset == charset_big5_1 || charset == charset_big5_2)
2571                 {
2572                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
2573                   EMIT_TWO_BYTES (c1, c2);
2574                 }
2575               else
2576                 /* There's no way other than producing the internal
2577                    codes as is.  */
2578                 EMIT_BYTES (src_base, src);
2579             }
2580         }
2581       coding->consumed_char++;
2582     }
2583
2584  label_end_of_loop:
2585   coding->consumed = src_base - source;
2586   coding->produced = coding->produced_char = dst - destination;
2587 }
2588
2589 \f
2590 /*** 5. CCL handlers ***/
2591
2592 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2593    Check if a text is encoded in a coding system of which
2594    encoder/decoder are written in CCL program.  If it is, return
2595    CODING_CATEGORY_MASK_CCL, else return 0.  */
2596
2597 int
2598 detect_coding_ccl (src, src_end)
2599      unsigned char *src, *src_end;
2600 {
2601   unsigned char *valid;
2602   int c;
2603   /* Dummy for ONE_MORE_BYTE.  */
2604   struct coding_system dummy_coding;
2605   struct coding_system *coding = &dummy_coding;
2606
2607   /* No coding system is assigned to coding-category-ccl.  */
2608   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2609     return 0;
2610
2611   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2612   while (1)
2613     {
2614       ONE_MORE_BYTE (c);
2615       if (! valid[c])
2616         return 0;
2617     }
2618  label_end_of_loop:
2619   return CODING_CATEGORY_MASK_CCL;
2620 }
2621
2622 \f
2623 /*** 6. End-of-line handlers ***/
2624
2625 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2626
2627 static void
2628 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2629      struct coding_system *coding;
2630      unsigned char *source, *destination;
2631      int src_bytes, dst_bytes;
2632 {
2633   unsigned char *src = source;
2634   unsigned char *dst = destination;
2635   unsigned char *src_end = src + src_bytes;
2636   unsigned char *dst_end = dst + dst_bytes;
2637   Lisp_Object translation_table;
2638   /* SRC_BASE remembers the start position in source in each loop.
2639      The loop will be exited when there's not enough source code
2640      (within macro ONE_MORE_BYTE), or when there's not enough
2641      destination area to produce a character (within macro
2642      EMIT_CHAR).  */
2643   unsigned char *src_base;
2644   int c;
2645
2646   translation_table = Qnil;
2647   switch (coding->eol_type)
2648     {
2649     case CODING_EOL_CRLF:
2650       while (1)
2651         {
2652           src_base = src;
2653           ONE_MORE_BYTE (c);
2654           if (c == '\r')
2655             {
2656               ONE_MORE_BYTE (c);
2657               if (c != '\n')
2658                 {
2659                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2660                     {
2661                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
2662                       goto label_end_of_loop;
2663                     }
2664                   src--;
2665                   c = '\r';
2666                 }
2667             }
2668           else if (c == '\n'
2669                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2670             {
2671               coding->result = CODING_FINISH_INCONSISTENT_EOL;
2672               goto label_end_of_loop;
2673             }
2674           EMIT_CHAR (c);
2675         }
2676       break;
2677
2678     case CODING_EOL_CR:
2679       while (1)
2680         {
2681           src_base = src;
2682           ONE_MORE_BYTE (c);
2683           if (c == '\n')
2684             {
2685               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2686                 {
2687                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2688                   goto label_end_of_loop;
2689                 }
2690             }
2691           else if (c == '\r')
2692             c = '\n';
2693           EMIT_CHAR (c);
2694         }
2695       break;
2696
2697     default:                    /* no need for EOL handling */
2698       while (1)
2699         {
2700           src_base = src;
2701           ONE_MORE_BYTE (c);
2702           EMIT_CHAR (c);
2703         }
2704     }
2705
2706  label_end_of_loop:
2707   coding->consumed = coding->consumed_char = src_base - source;
2708   coding->produced = dst - destination;
2709   return;
2710 }
2711
2712 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2713    format of end-of-line according to `coding->eol_type'.  It also
2714    convert multibyte form 8-bit characers to unibyte if
2715    CODING->src_multibyte is nonzero.  If `coding->mode &
2716    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2717    also means end-of-line.  */
2718
2719 static void
2720 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2721      struct coding_system *coding;
2722      unsigned char *source, *destination;
2723      int src_bytes, dst_bytes;
2724 {
2725   unsigned char *src = source;
2726   unsigned char *dst = destination;
2727   unsigned char *src_end = src + src_bytes;
2728   unsigned char *dst_end = dst + dst_bytes;
2729   Lisp_Object translation_table;
2730   /* SRC_BASE remembers the start position in source in each loop.
2731      The loop will be exited when there's not enough source text to
2732      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2733      there's not enough destination area to produce encoded codes
2734      (within macro EMIT_BYTES).  */
2735   unsigned char *src_base;
2736   int c;
2737   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2738
2739   translation_table = Qnil;
2740   if (coding->src_multibyte
2741       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2742     {
2743       src_end--;
2744       src_bytes--;
2745       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2746     }
2747
2748   if (coding->eol_type == CODING_EOL_CRLF)
2749     {
2750       while (src < src_end)
2751         {
2752           src_base = src;
2753           c = *src++;
2754           if (c >= 0x20)
2755             EMIT_ONE_BYTE (c);
2756           else if (c == '\n' || (c == '\r' && selective_display))
2757             EMIT_TWO_BYTES ('\r', '\n');
2758           else
2759             EMIT_ONE_BYTE (c);
2760         }
2761       src_base = src;
2762     label_end_of_loop:
2763       ;
2764     }
2765   else
2766     {
2767       if (src_bytes <= dst_bytes)
2768         {
2769           safe_bcopy (src, dst, src_bytes);
2770           src_base = src_end;
2771           dst += src_bytes;
2772         }
2773       else
2774         {
2775           if (coding->src_multibyte
2776               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2777             dst_bytes--;
2778           safe_bcopy (src, dst, dst_bytes);
2779           src_base = src + dst_bytes;
2780           dst = destination + dst_bytes;
2781           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2782         }
2783       if (coding->eol_type == CODING_EOL_CR)
2784         {
2785           for (src = destination; src < dst; src++)
2786             if (*src == '\n') *src = '\r';
2787         }
2788       else if (selective_display)
2789         {
2790           for (src = destination; src < dst; src++)
2791             if (*src == '\r') *src = '\n';
2792         }
2793     }
2794   if (coding->src_multibyte)
2795     dst = destination + str_as_unibyte (destination, dst - destination);
2796
2797   coding->consumed = src_base - source;
2798   coding->produced = dst - destination;
2799 }
2800
2801 \f
2802 /*** 7. C library functions ***/
2803
2804 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2805    has a property `coding-system'.  The value of this property is a
2806    vector of length 5 (called as coding-vector).  Among elements of
2807    this vector, the first (element[0]) and the fifth (element[4])
2808    carry important information for decoding/encoding.  Before
2809    decoding/encoding, this information should be set in fields of a
2810    structure of type `coding_system'.
2811
2812    A value of property `coding-system' can be a symbol of another
2813    subsidiary coding-system.  In that case, Emacs gets coding-vector
2814    from that symbol.
2815
2816    `element[0]' contains information to be set in `coding->type'.  The
2817    value and its meaning is as follows:
2818
2819    0 -- coding_type_emacs_mule
2820    1 -- coding_type_sjis
2821    2 -- coding_type_iso2022
2822    3 -- coding_type_big5
2823    4 -- coding_type_ccl encoder/decoder written in CCL
2824    nil -- coding_type_no_conversion
2825    t -- coding_type_undecided (automatic conversion on decoding,
2826                                no-conversion on encoding)
2827
2828    `element[4]' contains information to be set in `coding->flags' and
2829    `coding->spec'.  The meaning varies by `coding->type'.
2830
2831    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2832    of length 32 (of which the first 13 sub-elements are used now).
2833    Meanings of these sub-elements are:
2834
2835    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2836         If the value is an integer of valid charset, the charset is
2837         assumed to be designated to graphic register N initially.
2838
2839         If the value is minus, it is a minus value of charset which
2840         reserves graphic register N, which means that the charset is
2841         not designated initially but should be designated to graphic
2842         register N just before encoding a character in that charset.
2843
2844         If the value is nil, graphic register N is never used on
2845         encoding.
2846
2847    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2848         Each value takes t or nil.  See the section ISO2022 of
2849         `coding.h' for more information.
2850
2851    If `coding->type' is `coding_type_big5', element[4] is t to denote
2852    BIG5-ETen or nil to denote BIG5-HKU.  If it is `coding_type_ccl',
2853    element[4] is a cons of CCL programs (DECODER . ENCODER).
2854    If `coding->type' takes the other value, element[4] is ignored.
2855
2856    Emacs Lisp's coding system also carries information about format of
2857    end-of-line in a value of property `eol-type'.  If the value is
2858    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2859    means CODING_EOL_CR.  If it is not integer, it should be a vector
2860    of subsidiary coding systems of which property `eol-type' has one
2861    of above values.
2862
2863 */
2864
2865 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2866    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2867    is setup so that no conversion is necessary and return -1, else
2868    return 0.  */
2869
2870 int
2871 setup_coding_system (coding_system, coding)
2872      Lisp_Object coding_system;
2873      struct coding_system *coding;
2874 {
2875   Lisp_Object coding_spec, coding_type, eol_type, plist;
2876   Lisp_Object val;
2877   int i;
2878
2879   /* Initialize some fields required for all kinds of coding systems.  */
2880   coding->symbol = coding_system;
2881   coding->common_flags = 0;
2882   coding->mode = 0;
2883   coding->heading_ascii = -1;
2884   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2885   coding->composing = COMPOSITION_DISABLED;
2886   coding->cmp_data = NULL;
2887
2888   if (NILP (coding_system))
2889     goto label_invalid_coding_system;
2890
2891   coding_spec = Fget (coding_system, Qcoding_system);
2892
2893   if (!VECTORP (coding_spec)
2894       || XVECTOR (coding_spec)->size != 5
2895       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2896     goto label_invalid_coding_system;
2897
2898   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2899   if (VECTORP (eol_type))
2900     {
2901       coding->eol_type = CODING_EOL_UNDECIDED;
2902       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2903     }
2904   else if (XFASTINT (eol_type) == 1)
2905     {
2906       coding->eol_type = CODING_EOL_CRLF;
2907       coding->common_flags
2908         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2909     }
2910   else if (XFASTINT (eol_type) == 2)
2911     {
2912       coding->eol_type = CODING_EOL_CR;
2913       coding->common_flags
2914         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2915     }
2916   else
2917     coding->eol_type = CODING_EOL_LF;
2918
2919   coding_type = XVECTOR (coding_spec)->contents[0];
2920   /* Try short cut.  */
2921   if (SYMBOLP (coding_type))
2922     {
2923       if (EQ (coding_type, Qt))
2924         {
2925           coding->type = coding_type_undecided;
2926           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2927         }
2928       else
2929         coding->type = coding_type_no_conversion;
2930       return 0;
2931     }
2932
2933   /* Get values of coding system properties:
2934      `post-read-conversion', `pre-write-conversion',
2935      `translation-table-for-decode', `translation-table-for-encode'.  */
2936   plist = XVECTOR (coding_spec)->contents[3];
2937   /* Pre & post conversion functions should be disabled if
2938      inhibit_eol_conversion is nozero.  This is the case that a code
2939      conversion function is called while those functions are running.  */
2940   if (! inhibit_pre_post_conversion)
2941     {
2942       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2943       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2944     }
2945   val = Fplist_get (plist, Qtranslation_table_for_decode);
2946   if (SYMBOLP (val))
2947     val = Fget (val, Qtranslation_table_for_decode);
2948   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2949   val = Fplist_get (plist, Qtranslation_table_for_encode);
2950   if (SYMBOLP (val))
2951     val = Fget (val, Qtranslation_table_for_encode);
2952   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2953   val = Fplist_get (plist, Qcoding_category);
2954   if (!NILP (val))
2955     {
2956       val = Fget (val, Qcoding_category_index);
2957       if (INTEGERP (val))
2958         coding->category_idx = XINT (val);
2959       else
2960         goto label_invalid_coding_system;
2961     }
2962   else
2963     goto label_invalid_coding_system;
2964
2965   val = Fplist_get (plist, Qsafe_charsets);
2966   if (EQ (val, Qt))
2967     {
2968       for (i = 0; i <= MAX_CHARSET; i++)
2969         coding->safe_charsets[i] = 1;
2970     }
2971   else
2972     {
2973       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2974       while (CONSP (val))
2975         {
2976           if ((i = get_charset_id (XCAR (val))) >= 0)
2977             coding->safe_charsets[i] = 1;
2978           val = XCDR (val);
2979         }
2980     }
2981
2982   /* If the coding system has non-nil `composition' property, enable
2983      composition handling.  */
2984   val = Fplist_get (plist, Qcomposition);
2985   if (!NILP (val))
2986     coding->composing = COMPOSITION_NO;
2987
2988   switch (XFASTINT (coding_type))
2989     {
2990     case 0:
2991       coding->type = coding_type_emacs_mule;
2992       if (!NILP (coding->post_read_conversion))
2993         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2994       if (!NILP (coding->pre_write_conversion))
2995         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2996       break;
2997
2998     case 1:
2999       coding->type = coding_type_sjis;
3000       coding->common_flags
3001         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3002       break;
3003
3004     case 2:
3005       coding->type = coding_type_iso2022;
3006       coding->common_flags
3007         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3008       {
3009         Lisp_Object val, temp;
3010         Lisp_Object *flags;
3011         int i, charset, reg_bits = 0;
3012
3013         val = XVECTOR (coding_spec)->contents[4];
3014
3015         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3016           goto label_invalid_coding_system;
3017
3018         flags = XVECTOR (val)->contents;
3019         coding->flags
3020           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3021              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3022              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3023              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3024              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3025              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3026              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3027              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3028              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3029              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3030              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3031              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3032              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3033              );
3034
3035         /* Invoke graphic register 0 to plane 0.  */
3036         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3037         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3038         CODING_SPEC_ISO_INVOCATION (coding, 1)
3039           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3040         /* Not single shifting at first.  */
3041         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3042         /* Beginning of buffer should also be regarded as bol. */
3043         CODING_SPEC_ISO_BOL (coding) = 1;
3044
3045         for (charset = 0; charset <= MAX_CHARSET; charset++)
3046           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3047         val = Vcharset_revision_alist;
3048         while (CONSP (val))
3049           {
3050             charset = get_charset_id (Fcar_safe (XCAR (val)));
3051             if (charset >= 0
3052                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3053                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3054               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3055             val = XCDR (val);
3056           }
3057
3058         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3059            FLAGS[REG] can be one of below:
3060                 integer CHARSET: CHARSET occupies register I,
3061                 t: designate nothing to REG initially, but can be used
3062                   by any charsets,
3063                 list of integer, nil, or t: designate the first
3064                   element (if integer) to REG initially, the remaining
3065                   elements (if integer) is designated to REG on request,
3066                   if an element is t, REG can be used by any charsets,
3067                 nil: REG is never used.  */
3068         for (charset = 0; charset <= MAX_CHARSET; charset++)
3069           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3070             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3071         for (i = 0; i < 4; i++)
3072           {
3073             if (INTEGERP (flags[i])
3074                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3075                 || (charset = get_charset_id (flags[i])) >= 0)
3076               {
3077                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3078                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3079               }
3080             else if (EQ (flags[i], Qt))
3081               {
3082                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3083                 reg_bits |= 1 << i;
3084                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3085               }
3086             else if (CONSP (flags[i]))
3087               {
3088                 Lisp_Object tail;
3089                 tail = flags[i];
3090
3091                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3092                 if (INTEGERP (XCAR (tail))
3093                     && (charset = XINT (XCAR (tail)),
3094                         CHARSET_VALID_P (charset))
3095                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3096                   {
3097                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3098                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3099                   }
3100                 else
3101                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3102                 tail = XCDR (tail);
3103                 while (CONSP (tail))
3104                   {
3105                     if (INTEGERP (XCAR (tail))
3106                         && (charset = XINT (XCAR (tail)),
3107                             CHARSET_VALID_P (charset))
3108                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3109                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3110                         = i;
3111                     else if (EQ (XCAR (tail), Qt))
3112                       reg_bits |= 1 << i;
3113                     tail = XCDR (tail);
3114                   }
3115               }
3116             else
3117               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3118
3119             CODING_SPEC_ISO_DESIGNATION (coding, i)
3120               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3121           }
3122
3123         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3124           {
3125             /* REG 1 can be used only by locking shift in 7-bit env.  */
3126             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3127               reg_bits &= ~2;
3128             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3129               /* Without any shifting, only REG 0 and 1 can be used.  */
3130               reg_bits &= 3;
3131           }
3132
3133         if (reg_bits)
3134           for (charset = 0; charset <= MAX_CHARSET; charset++)
3135             {
3136               if (CHARSET_VALID_P (charset))
3137                 {
3138                   /* There exist some default graphic registers to be
3139                      used CHARSET.  */
3140
3141                   /* We had better avoid designating a charset of
3142                      CHARS96 to REG 0 as far as possible.  */
3143                   if (CHARSET_CHARS (charset) == 96)
3144                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3145                       = (reg_bits & 2
3146                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3147                   else
3148                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3149                       = (reg_bits & 1
3150                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3151                 }
3152             }
3153       }
3154       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3155       coding->spec.iso2022.last_invalid_designation_register = -1;
3156       break;
3157
3158     case 3:
3159       coding->type = coding_type_big5;
3160       coding->common_flags
3161         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3162       coding->flags
3163         = (NILP (XVECTOR (coding_spec)->contents[4])
3164            ? CODING_FLAG_BIG5_HKU
3165            : CODING_FLAG_BIG5_ETEN);
3166       break;
3167
3168     case 4:
3169       coding->type = coding_type_ccl;
3170       coding->common_flags
3171         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3172       {
3173         val = XVECTOR (coding_spec)->contents[4];
3174         if (! CONSP (val)
3175             || setup_ccl_program (&(coding->spec.ccl.decoder),
3176                                   XCAR (val)) < 0
3177             || setup_ccl_program (&(coding->spec.ccl.encoder),
3178                                   XCDR (val)) < 0)
3179           goto label_invalid_coding_system;
3180
3181         bzero (coding->spec.ccl.valid_codes, 256);
3182         val = Fplist_get (plist, Qvalid_codes);
3183         if (CONSP (val))
3184           {
3185             Lisp_Object this;
3186
3187             for (; CONSP (val); val = XCDR (val))
3188               {
3189                 this = XCAR (val);
3190                 if (INTEGERP (this)
3191                     && XINT (this) >= 0 && XINT (this) < 256)
3192                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3193                 else if (CONSP (this)
3194                          && INTEGERP (XCAR (this))
3195                          && INTEGERP (XCDR (this)))
3196                   {
3197                     int start = XINT (XCAR (this));
3198                     int end = XINT (XCDR (this));
3199
3200                     if (start >= 0 && start <= end && end < 256)
3201                       while (start <= end)
3202                         coding->spec.ccl.valid_codes[start++] = 1;
3203                   }
3204               }
3205           }
3206       }
3207       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3208       break;
3209
3210     case 5:
3211       coding->type = coding_type_raw_text;
3212       break;
3213
3214     default:
3215       goto label_invalid_coding_system;
3216     }
3217   return 0;
3218
3219  label_invalid_coding_system:
3220   coding->type = coding_type_no_conversion;
3221   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3222   coding->common_flags = 0;
3223   coding->eol_type = CODING_EOL_LF;
3224   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3225   return -1;
3226 }
3227
3228 /* Free memory blocks allocated for storing composition information.  */
3229
3230 void
3231 coding_free_composition_data (coding)
3232      struct coding_system *coding;
3233 {
3234   struct composition_data *cmp_data = coding->cmp_data, *next;
3235
3236   if (!cmp_data)
3237     return;
3238   /* Memory blocks are chained.  At first, rewind to the first, then,
3239      free blocks one by one.  */
3240   while (cmp_data->prev)
3241     cmp_data = cmp_data->prev;
3242   while (cmp_data)
3243     {
3244       next = cmp_data->next;
3245       xfree (cmp_data);
3246       cmp_data = next;
3247     }
3248   coding->cmp_data = NULL;
3249 }
3250
3251 /* Set `char_offset' member of all memory blocks pointed by
3252    coding->cmp_data to POS.  */
3253
3254 void
3255 coding_adjust_composition_offset (coding, pos)
3256      struct coding_system *coding;
3257      int pos;
3258 {
3259   struct composition_data *cmp_data;
3260
3261   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3262     cmp_data->char_offset = pos;
3263 }
3264
3265 /* Setup raw-text or one of its subsidiaries in the structure
3266    coding_system CODING according to the already setup value eol_type
3267    in CODING.  CODING should be setup for some coding system in
3268    advance.  */
3269
3270 void
3271 setup_raw_text_coding_system (coding)
3272      struct coding_system *coding;
3273 {
3274   if (coding->type != coding_type_raw_text)
3275     {
3276       coding->symbol = Qraw_text;
3277       coding->type = coding_type_raw_text;
3278       if (coding->eol_type != CODING_EOL_UNDECIDED)
3279         {
3280           Lisp_Object subsidiaries;
3281           subsidiaries = Fget (Qraw_text, Qeol_type);
3282
3283           if (VECTORP (subsidiaries)
3284               && XVECTOR (subsidiaries)->size == 3)
3285             coding->symbol
3286               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3287         }
3288       setup_coding_system (coding->symbol, coding);
3289     }
3290   return;
3291 }
3292
3293 /* Emacs has a mechanism to automatically detect a coding system if it
3294    is one of Emacs' internal format, ISO2022, SJIS, BIG5, UTF-8, and
3295    UTF-16.  But, it's impossible to distinguish some coding systems
3296    accurately because they use the same range of codes.  So, at first,
3297    coding systems are grouped into the following categories:
3298
3299    o coding-category-emacs-mule
3300
3301         The category for a coding system which has the same code range
3302         as Emacs' internal format.  Assigned the coding-system (Lisp
3303         symbol) `emacs-mule' by default.
3304
3305    o coding-category-sjis
3306
3307         The category for a coding system which has the same code range
3308         as SJIS.  Assigned the coding-system (Lisp
3309         symbol) `japanese-shift-jis' by default.
3310
3311    o coding-category-iso-7
3312
3313         The category for a coding system which has the same code range
3314         as ISO2022 of 7-bit environment.  This doesn't use any locking
3315         shift and single shift functions.  This can encode/decode all
3316         charsets.  Assigned the coding-system (Lisp symbol)
3317         `iso-2022-7bit' by default.
3318
3319    o coding-category-iso-7-tight
3320
3321         Same as coding-category-iso-7 except that this can
3322         encode/decode only the specified charsets.
3323
3324    o coding-category-iso-8-1
3325
3326         The category for a coding system which has the same code range
3327         as ISO2022 of 8-bit environment and graphic plane 1 used only
3328         for DIMENSION1 charset.  This doesn't use any locking shift
3329         and single shift functions.  Assigned the coding-system (Lisp
3330         symbol) `iso-latin-1' by default.
3331
3332    o coding-category-iso-8-2
3333
3334         The category for a coding system which has the same code range
3335         as ISO2022 of 8-bit environment and graphic plane 1 used only
3336         for DIMENSION2 charset.  This doesn't use any locking shift
3337         and single shift functions.  Assigned the coding-system (Lisp
3338         symbol) `japanese-iso-8bit' by default.
3339
3340    o coding-category-iso-7-else
3341
3342         The category for a coding system which has the same code range
3343         as ISO2022 of 7-bit environment but uses locking shift or
3344         single shift functions.  Assigned the coding-system (Lisp
3345         symbol) `iso-2022-7bit-lock' by default.
3346
3347    o coding-category-iso-8-else
3348
3349         The category for a coding system which has the same code range
3350         as ISO2022 of 8-bit environment but uses locking shift or
3351         single shift functions.  Assigned the coding-system (Lisp
3352         symbol) `iso-2022-8bit-ss2' by default.
3353
3354    o coding-category-big5
3355
3356         The category for a coding system which has the same code range
3357         as BIG5.  Assigned the coding-system (Lisp symbol)
3358         `cn-big5' by default.
3359
3360    o coding-category-utf-8
3361
3362         The category for a coding system which has the same code range
3363         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3364         symbol) `utf-8' by default.
3365
3366    o coding-category-utf-16-be
3367
3368         The category for a coding system in which a text has an
3369         Unicode signature (cf. Unicode Standard) in the order of BIG
3370         endian at the head.  Assigned the coding-system (Lisp symbol)
3371         `utf-16-be' by default.
3372
3373    o coding-category-utf-16-le
3374
3375         The category for a coding system in which a text has an
3376         Unicode signature (cf. Unicode Standard) in the order of
3377         LITTLE endian at the head.  Assigned the coding-system (Lisp
3378         symbol) `utf-16-le' by default.
3379
3380    o coding-category-ccl
3381
3382         The category for a coding system of which encoder/decoder is
3383         written in CCL programs.  The default value is nil, i.e., no
3384         coding system is assigned.
3385
3386    o coding-category-binary
3387
3388         The category for a coding system not categorized in any of the
3389         above.  Assigned the coding-system (Lisp symbol)
3390         `no-conversion' by default.
3391
3392    Each of them is a Lisp symbol whose value is an actual
3393    `coding-system' (also a Lisp symbol) assigned by a user.
3394    What Emacs actually does is detect the category of a coding system
3395    and then use the `coding-system' assigned to it.  If Emacs can't
3396    narrow the choice down to a single category, it selects the one of
3397    the highest priority.  Priorities of categories are also specified
3398    by a user in the Lisp variable `coding-category-list'.
3399
3400 */
3401
/* Table indexed by a byte value and consulted by detect_coding_mask:
   a byte whose entry is nonzero is skipped while looking for the
   first byte that is relevant to detection.  detect_coding_mask
   itself rewrites the entries for ISO_CODE_SO, ISO_CODE_SI, and
   ISO_CODE_ESC.  */
static int ascii_skip_code[256];
3404
3405 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3406    If it detects possible coding systems, return an integer in which
3407    appropriate flag bits are set.  Flag bits are defined by macros
3408    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3409    it should point the table `coding_priorities'.  In that case, only
3410    the flag bit for a coding system of the highest priority is set in
3411    the returned value.
3412
3413    How many ASCII characters are at the head is returned as *SKIP.  */
3414
3415 static int
3416 detect_coding_mask (source, src_bytes, priorities, skip)
3417      unsigned char *source;
3418      int src_bytes, *priorities, *skip;
3419 {
3420   register unsigned char c;
3421   unsigned char *src = source, *src_end = source + src_bytes;
3422   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3423   int i, idx;
3424
3425   /* At first, skip all ASCII characters and control characters except
3426      for three ISO2022 specific control characters.  */
3427   ascii_skip_code[ISO_CODE_SO] = 0;
3428   ascii_skip_code[ISO_CODE_SI] = 0;
3429   ascii_skip_code[ISO_CODE_ESC] = 0;
3430
3431  label_loop_detect_coding:
3432   while (src < src_end && ascii_skip_code[*src]) src++;
3433   *skip = src - source;
3434
3435   if (src >= src_end)
3436     /* We found nothing other than ASCII.  There's nothing to do.  */
3437     return 0;
3438
3439   c = *src;
3440   /* The text seems to be encoded in some multilingual coding system.
3441      Now, try to find in which coding system the text is encoded.  */
3442   if (c < 0x80)
3443     {
3444       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3445       /* C is an ISO2022 specific control code of C0.  */
3446       mask = detect_coding_iso2022 (src, src_end);
3447       if (mask == 0)
3448         {
3449           /* No valid ISO2022 code follows C.  Try again.  */
3450           src++;
3451           if (c == ISO_CODE_ESC)
3452             ascii_skip_code[ISO_CODE_ESC] = 1;
3453           else
3454             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3455           goto label_loop_detect_coding;
3456         }
3457       if (priorities)
3458         {
3459           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3460             {
3461               if (mask & priorities[i])
3462                 return priorities[i];
3463             }
3464           return CODING_CATEGORY_MASK_RAW_TEXT;
3465         }
3466     }
3467   else
3468     {
3469       int try;
3470
3471       if (c < 0xA0)
3472         {
3473           /* C is the first byte of SJIS character code,
3474              or a leading-code of Emacs' internal format (emacs-mule),
3475              or the first byte of UTF-16.  */
3476           try = (CODING_CATEGORY_MASK_SJIS
3477                   | CODING_CATEGORY_MASK_EMACS_MULE
3478                   | CODING_CATEGORY_MASK_UTF_16_BE
3479                   | CODING_CATEGORY_MASK_UTF_16_LE);
3480
3481           /* Or, if C is a special latin extra code,
3482              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3483              or is an ISO2022 control-sequence-introducer (CSI),
3484              we should also consider the possibility of ISO2022 codings.  */
3485           if ((VECTORP (Vlatin_extra_code_table)
3486                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3487               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3488               || (c == ISO_CODE_CSI
3489                   && (src < src_end
3490                       && (*src == ']'
3491                           || ((*src == '0' || *src == '1' || *src == '2')
3492                               && src + 1 < src_end
3493                               && src[1] == ']')))))
3494             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3495                      | CODING_CATEGORY_MASK_ISO_8BIT);
3496         }
3497       else
3498         /* C is a character of ISO2022 in graphic plane right,
3499            or a SJIS's 1-byte character code (i.e. JISX0201),
3500            or the first byte of BIG5's 2-byte code,
3501            or the first byte of UTF-8/16.  */
3502         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3503                 | CODING_CATEGORY_MASK_ISO_8BIT
3504                 | CODING_CATEGORY_MASK_SJIS
3505                 | CODING_CATEGORY_MASK_BIG5
3506                 | CODING_CATEGORY_MASK_UTF_8
3507                 | CODING_CATEGORY_MASK_UTF_16_BE
3508                 | CODING_CATEGORY_MASK_UTF_16_LE);
3509
3510       /* Or, we may have to consider the possibility of CCL.  */
3511       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3512           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3513               ->spec.ccl.valid_codes)[c])
3514         try |= CODING_CATEGORY_MASK_CCL;
3515
3516       mask = 0;
3517       utf16_examined_p = iso2022_examined_p = 0;
3518       if (priorities)
3519         {
3520           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3521             {
3522               if (!iso2022_examined_p
3523                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3524                 {
3525                   mask |= detect_coding_iso2022 (src, src_end);
3526                   iso2022_examined_p = 1;
3527                 }
3528               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3529                 mask |= detect_coding_sjis (src, src_end);
3530               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3531                 mask |= detect_coding_utf_8 (src, src_end);
3532               else if (!utf16_examined_p
3533                        && (priorities[i] & try &
3534                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
3535                 {
3536                   mask |= detect_coding_utf_16 (src, src_end);
3537                   utf16_examined_p = 1;
3538                 }
3539               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3540                 mask |= detect_coding_big5 (src, src_end);
3541               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3542                 mask |= detect_coding_emacs_mule (src, src_end);
3543               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3544                 mask |= detect_coding_ccl (src, src_end);
3545               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3546                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3547               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3548                 mask |= CODING_CATEGORY_MASK_BINARY;
3549               if (mask & priorities[i])
3550                 return priorities[i];
3551             }
3552           return CODING_CATEGORY_MASK_RAW_TEXT;
3553         }
3554       if (try & CODING_CATEGORY_MASK_ISO)
3555         mask |= detect_coding_iso2022 (src, src_end);
3556       if (try & CODING_CATEGORY_MASK_SJIS)
3557         mask |= detect_coding_sjis (src, src_end);
3558       if (try & CODING_CATEGORY_MASK_BIG5)
3559         mask |= detect_coding_big5 (src, src_end);
3560       if (try & CODING_CATEGORY_MASK_UTF_8)
3561         mask |= detect_coding_utf_8 (src, src_end);
3562       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3563         mask |= detect_coding_utf_16 (src, src_end);
3564       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3565         mask |= detect_coding_emacs_mule (src, src_end);
3566       if (try & CODING_CATEGORY_MASK_CCL)
3567         mask |= detect_coding_ccl (src, src_end);
3568     }
3569   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3570 }
3571
3572 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3573    The information of the detected coding system is set in CODING.  */
3574
3575 void
3576 detect_coding (coding, src, src_bytes)
3577      struct coding_system *coding;
3578      unsigned char *src;
3579      int src_bytes;
3580 {
3581   unsigned int idx;
3582   int skip, mask, i;
3583   Lisp_Object val;
3584
3585   val = Vcoding_category_list;
3586   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3587   coding->heading_ascii = skip;
3588
3589   if (!mask) return;
3590
3591   /* We found a single coding system of the highest priority in MASK.  */
3592   idx = 0;
3593   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3594   if (! mask)
3595     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3596
3597   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3598
3599   if (coding->eol_type != CODING_EOL_UNDECIDED)
3600     {
3601       Lisp_Object tmp;
3602
3603       tmp = Fget (val, Qeol_type);
3604       if (VECTORP (tmp))
3605         val = XVECTOR (tmp)->contents[coding->eol_type];
3606     }
3607
3608   /* Setup this new coding system while preserving some slots.  */
3609   {
3610     int src_multibyte = coding->src_multibyte;
3611     int dst_multibyte = coding->dst_multibyte;
3612
3613     setup_coding_system (val, coding);
3614     coding->src_multibyte = src_multibyte;
3615     coding->dst_multibyte = dst_multibyte;
3616     coding->heading_ascii = skip;
3617   }
3618 }
3619
3620 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3621    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3622    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3623
3624    How many non-eol characters are at the head is returned as *SKIP.  */
3625
3626 #define MAX_EOL_CHECK_COUNT 3
3627
3628 static int
3629 detect_eol_type (source, src_bytes, skip)
3630      unsigned char *source;
3631      int src_bytes, *skip;
3632 {
3633   unsigned char *src = source, *src_end = src + src_bytes;
3634   unsigned char c;
3635   int total = 0;                /* How many end-of-lines are found so far.  */
3636   int eol_type = CODING_EOL_UNDECIDED;
3637   int this_eol_type;
3638
3639   *skip = 0;
3640
3641   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3642     {
3643       c = *src++;
3644       if (c == '\n' || c == '\r')
3645         {
3646           if (*skip == 0)
3647             *skip = src - 1 - source;
3648           total++;
3649           if (c == '\n')
3650             this_eol_type = CODING_EOL_LF;
3651           else if (src >= src_end || *src != '\n')
3652             this_eol_type = CODING_EOL_CR;
3653           else
3654             this_eol_type = CODING_EOL_CRLF, src++;
3655
3656           if (eol_type == CODING_EOL_UNDECIDED)
3657             /* This is the first end-of-line.  */
3658             eol_type = this_eol_type;
3659           else if (eol_type != this_eol_type)
3660             {
3661               /* The found type is different from what found before.  */
3662               eol_type = CODING_EOL_INCONSISTENT;
3663               break;
3664             }
3665         }
3666     }
3667
3668   if (*skip == 0)
3669     *skip = src_end - source;
3670   return eol_type;
3671 }
3672
3673 /* Like detect_eol_type, but detect EOL type in 2-octet
3674    big-endian/little-endian format for coding systems utf-16-be and
3675    utf-16-le.  */
3676
3677 static int
3678 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3679      unsigned char *source;
3680      int src_bytes, *skip;
3681 {
3682   unsigned char *src = source, *src_end = src + src_bytes;
3683   unsigned int c1, c2;
3684   int total = 0;                /* How many end-of-lines are found so far.  */
3685   int eol_type = CODING_EOL_UNDECIDED;
3686   int this_eol_type;
3687   int msb, lsb;
3688
3689   if (big_endian_p)
3690     msb = 0, lsb = 1;
3691   else
3692     msb = 1, lsb = 0;
3693
3694   *skip = 0;
3695
3696   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3697     {
3698       c1 = (src[msb] << 8) | (src[lsb]);
3699       src += 2;
3700
3701       if (c1 == '\n' || c1 == '\r')
3702         {
3703           if (*skip == 0)
3704             *skip = src - 2 - source;
3705           total++;
3706           if (c1 == '\n')
3707             {
3708               this_eol_type = CODING_EOL_LF;
3709             }
3710           else
3711             {
3712               if ((src + 1) >= src_end)
3713                 {
3714                   this_eol_type = CODING_EOL_CR;
3715                 }
3716               else
3717                 {
3718                   c2 = (src[msb] << 8) | (src[lsb]);
3719                   if (c2 == '\n')
3720                     this_eol_type = CODING_EOL_CRLF, src += 2;
3721                   else
3722                     this_eol_type = CODING_EOL_CR;
3723                 }
3724             }
3725
3726           if (eol_type == CODING_EOL_UNDECIDED)
3727             /* This is the first end-of-line.  */
3728             eol_type = this_eol_type;
3729           else if (eol_type != this_eol_type)
3730             {
3731               /* The found type is different from what found before.  */
3732               eol_type = CODING_EOL_INCONSISTENT;
3733               break;
3734             }
3735         }
3736     }
3737
3738   if (*skip == 0)
3739     *skip = src_end - source;
3740   return eol_type;
3741 }
3742
3743 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3744    is encoded.  If it detects an appropriate format of end-of-line, it
3745    sets the information in *CODING.  */
3746
3747 void
3748 detect_eol (coding, src, src_bytes)
3749      struct coding_system *coding;
3750      unsigned char *src;
3751      int src_bytes;
3752 {
3753   Lisp_Object val;
3754   int skip;
3755   int eol_type;
3756
3757   switch (coding->category_idx)
3758     {
3759     case CODING_CATEGORY_IDX_UTF_16_BE:
3760       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3761       break;
3762     case CODING_CATEGORY_IDX_UTF_16_LE:
3763       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3764       break;
3765     default:
3766       eol_type = detect_eol_type (src, src_bytes, &skip);
3767       break;
3768     }
3769
3770   if (coding->heading_ascii > skip)
3771     coding->heading_ascii = skip;
3772   else
3773     skip = coding->heading_ascii;
3774
3775   if (eol_type == CODING_EOL_UNDECIDED)
3776     return;
3777   if (eol_type == CODING_EOL_INCONSISTENT)
3778     {
3779 #if 0
3780       /* This code is suppressed until we find a better way to
3781          distinguish raw text file and binary file.  */
3782
3783       /* If we have already detected that the coding is raw-text, the
3784          coding should actually be no-conversion.  */
3785       if (coding->type == coding_type_raw_text)
3786         {
3787           setup_coding_system (Qno_conversion, coding);
3788           return;
3789         }
3790       /* Else, let's decode only text code anyway.  */
3791 #endif /* 0 */
3792       eol_type = CODING_EOL_LF;
3793     }
3794
3795   val = Fget (coding->symbol, Qeol_type);
3796   if (VECTORP (val) && XVECTOR (val)->size == 3)
3797     {
3798       int src_multibyte = coding->src_multibyte;
3799       int dst_multibyte = coding->dst_multibyte;
3800
3801       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3802       coding->src_multibyte = src_multibyte;
3803       coding->dst_multibyte = dst_multibyte;
3804       coding->heading_ascii = skip;
3805     }
3806 }
3807
3808 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3809
3810 #define DECODING_BUFFER_MAG(coding)                     \
3811   (coding->type == coding_type_iso2022                  \
3812    ? 3                                                  \
3813    : (coding->type == coding_type_ccl                   \
3814       ? coding->spec.ccl.decoder.buf_magnification      \
3815       : 2))
3816
3817 /* Return maximum size (bytes) of a buffer enough for decoding
3818    SRC_BYTES of text encoded in CODING.  */
3819
3820 int
3821 decoding_buffer_size (coding, src_bytes)
3822      struct coding_system *coding;
3823      int src_bytes;
3824 {
3825   return (src_bytes * DECODING_BUFFER_MAG (coding)
3826           + CONVERSION_BUFFER_EXTRA_ROOM);
3827 }
3828
3829 /* Return maximum size (bytes) of a buffer enough for encoding
3830    SRC_BYTES of text to CODING.  */
3831
3832 int
3833 encoding_buffer_size (coding, src_bytes)
3834      struct coding_system *coding;
3835      int src_bytes;
3836 {
3837   int magnification;
3838
3839   if (coding->type == coding_type_ccl)
3840     magnification = coding->spec.ccl.encoder.buf_magnification;
3841   else if (CODING_REQUIRE_ENCODING (coding))
3842     magnification = 3;
3843   else
3844     magnification = 1;
3845
3846   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3847 }
3848
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer for code conversion and its current size in
   bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.

   The buffer is the shared conversion_buffer.  When it is too small
   a new block is allocated with xmalloc and the old one is released
   with xfree, so the previous contents are NOT preserved.  The new
   size is obtained by doubling the current size until it reaches
   SIZE; if the current size is not positive (buffer not set up yet),
   growth starts from MINIMUM_CONVERSION_BUFFER_SIZE instead, since
   doubling zero would never terminate.

   NOTE(review): this function does not itself check the result of
   xmalloc; out-of-memory handling is presumably done inside xmalloc
   -- confirm against its definition.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = (conversion_buffer_size > 0
                       ? conversion_buffer_size * 2
                       : MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < size)
        real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3877
3878 int
3879 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3880      struct coding_system *coding;
3881      unsigned char *source, *destination;
3882      int src_bytes, dst_bytes, encodep;
3883 {
3884   struct ccl_program *ccl
3885     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3886   int result;
3887
3888   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3889
3890   coding->produced = ccl_driver (ccl, source, destination,
3891                                  src_bytes, dst_bytes, &(coding->consumed));
3892   if (encodep)
3893     coding->produced_char = coding->produced;
3894   else
3895     {
3896       int bytes
3897         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3898       coding->produced = str_as_multibyte (destination, bytes,
3899                                            coding->produced,
3900                                            &(coding->produced_char));
3901     }
3902
3903   switch (ccl->status)
3904     {
3905     case CCL_STAT_SUSPEND_BY_SRC:
3906       result = CODING_FINISH_INSUFFICIENT_SRC;
3907       break;
3908     case CCL_STAT_SUSPEND_BY_DST:
3909       result = CODING_FINISH_INSUFFICIENT_DST;
3910       break;
3911     case CCL_STAT_QUIT:
3912     case CCL_STAT_INVALID_CMD:
3913       result = CODING_FINISH_INTERRUPT;
3914       break;
3915     default:
3916       result = CODING_FINISH_NORMAL;
3917       break;
3918     }
3919   return result;
3920 }
3921
3922 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
3923    decoding, it may detect coding system and format of end-of-line if
3924    those are not yet decided.  The source should be unibyte, the
3925    result is multibyte if CODING->dst_multibyte is nonzero, else
3926    unibyte.  */
3927
3928 int
3929 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3930      struct coding_system *coding;
3931      unsigned char *source, *destination;
3932      int src_bytes, dst_bytes;
3933 {
3934   if (coding->type == coding_type_undecided)
3935     detect_coding (coding, source, src_bytes);
3936
3937   if (coding->eol_type == CODING_EOL_UNDECIDED)
3938     detect_eol (coding, source, src_bytes);
3939
3940   coding->produced = coding->produced_char = 0;
3941   coding->consumed = coding->consumed_char = 0;
3942   coding->errors = 0;
3943   coding->result = CODING_FINISH_NORMAL;
3944
3945   switch (coding->type)
3946     {
3947     case coding_type_sjis:
3948       decode_coding_sjis_big5 (coding, source, destination,
3949                                src_bytes, dst_bytes, 1);
3950       break;
3951
3952     case coding_type_iso2022:
3953       decode_coding_iso2022 (coding, source, destination,
3954                              src_bytes, dst_bytes);
3955       break;
3956
3957     case coding_type_big5:
3958       decode_coding_sjis_big5 (coding, source, destination,
3959                                src_bytes, dst_bytes, 0);
3960       break;
3961
3962     case coding_type_emacs_mule:
3963       decode_coding_emacs_mule (coding, source, destination,
3964                                 src_bytes, dst_bytes);
3965       break;
3966
3967     case coding_type_ccl:
3968       ccl_coding_driver (coding, source, destination,
3969                          src_bytes, dst_bytes, 0);
3970       break;
3971
3972     default:
3973       decode_eol (coding, source, destination, src_bytes, dst_bytes);
3974     }
3975
3976   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
3977       && coding->consumed == src_bytes)
3978     coding->result = CODING_FINISH_NORMAL;
3979
3980   if (coding->mode & CODING_MODE_LAST_BLOCK
3981       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
3982     {
3983       unsigned char *src = source + coding->consumed;
3984       unsigned char *dst = destination + coding->produced;
3985
3986       src_bytes -= coding->consumed;
3987      coding->errors++;
3988       if (COMPOSING_P (coding))
3989         DECODE_COMPOSITION_END ('1');
3990       while (src_bytes--)
3991         {
3992           int c = *src++;
3993           dst += CHAR_STRING (c, dst);
3994           coding->produced_char++;
3995         }
3996       coding->consumed = coding->consumed_char = src - source;
3997       coding->produced = dst - destination;
3998     }
3999
4000   if (!coding->dst_multibyte)
4001     {
4002       coding->produced = str_as_unibyte (destination, coding->produced);
4003       coding->produced_char = coding->produced;
4004     }
4005
4006   return coding->result;
4007 }
4008
4009 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4010    multibyteness of the source is CODING->src_multibyte, the
4011    multibyteness of the result is always unibyte.  */
4012
4013 int
4014 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4015      struct coding_system *coding;
4016      unsigned char *source, *destination;
4017      int src_bytes, dst_bytes;
4018 {
4019   coding->produced = coding->produced_char = 0;
4020   coding->consumed = coding->consumed_char = 0;
4021   coding->errors = 0;
4022   coding->result = CODING_FINISH_NORMAL;
4023
4024   switch (coding->type)
4025     {
4026     case coding_type_sjis:
4027       encode_coding_sjis_big5 (coding, source, destination,
4028                                src_bytes, dst_bytes, 1);
4029       break;
4030
4031     case coding_type_iso2022:
4032       encode_coding_iso2022 (coding, source, destination,
4033                              src_bytes, dst_bytes);
4034       break;
4035
4036     case coding_type_big5:
4037       encode_coding_sjis_big5 (coding, source, destination,
4038                                src_bytes, dst_bytes, 0);
4039       break;
4040
4041     case coding_type_emacs_mule:
4042       encode_coding_emacs_mule (coding, source, destination,
4043                                 src_bytes, dst_bytes);
4044       break;
4045
4046     case coding_type_ccl:
4047       ccl_coding_driver (coding, source, destination,
4048                          src_bytes, dst_bytes, 1);
4049       break;
4050
4051     default:
4052       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4053     }
4054
4055   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4056       && coding->consumed == src_bytes)
4057     coding->result = CODING_FINISH_NORMAL;
4058
4059   if (coding->mode & CODING_MODE_LAST_BLOCK)
4060     {
4061       unsigned char *src = source + coding->consumed;
4062       unsigned char *src_end = src + src_bytes;
4063       unsigned char *dst = destination + coding->produced;
4064
4065       if (coding->type == coding_type_iso2022)
4066         ENCODE_RESET_PLANE_AND_REGISTER;
4067       if (COMPOSING_P (coding))
4068         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4069       if (coding->consumed < src_bytes)
4070         {
4071           int len = src_bytes - coding->consumed;
4072
4073           BCOPY_SHORT (source + coding->consumed, dst, len);
4074           if (coding->src_multibyte)
4075             len = str_as_unibyte (dst, len);
4076           dst += len;
4077           coding->consumed = src_bytes;
4078         }
4079       coding->produced = coding->produced_char = dst - destination;
4080     }
4081
4082   return coding->result;
4083 }
4084
4085 /* Scan text in the region between *BEG and *END (byte positions),
4086    skip characters which we don't have to decode by coding system
4087    CODING at the head and tail, then set *BEG and *END to the region
4088    of the text we actually have to convert.  The caller should move
4089    the gap out of the region in advance if the region is from a
4090    buffer.
4091
4092    If STR is not NULL, *BEG and *END are indices into STR.  */
4093
4094 static void
4095 shrink_decoding_region (beg, end, coding, str)
4096      int *beg, *end;
4097      struct coding_system *coding;
4098      unsigned char *str;
4099 {
4100   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4101   int eol_conversion;
4102   Lisp_Object translation_table;
4103
4104   if (coding->type == coding_type_ccl
4105       || coding->type == coding_type_undecided
4106       || coding->eol_type != CODING_EOL_LF
4107       || !NILP (coding->post_read_conversion)
4108       || coding->composing != COMPOSITION_DISABLED)
4109     {
4110       /* We can't skip any data.  */
4111       return;
4112     }
4113   if (coding->type == coding_type_no_conversion
4114       || coding->type == coding_type_raw_text
4115       || coding->type == coding_type_emacs_mule)
4116     {
4117       /* We need no conversion, but don't have to skip any data here.
4118          Decoding routine handles them effectively anyway.  */
4119       return;
4120     }
4121
4122   translation_table = coding->translation_table_for_decode;
4123   if (NILP (translation_table) && !NILP (Venable_character_translation))
4124     translation_table = Vstandard_translation_table_for_decode;
4125   if (CHAR_TABLE_P (translation_table))
4126     {
4127       int i;
4128       for (i = 0; i < 128; i++)
4129         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4130           break;
4131       if (i < 128)
4132         /* Some ASCII character should be translated.  We give up
4133            shrinking.  */
4134         return;
4135     }
4136
4137   if (coding->heading_ascii >= 0)
4138     /* Detection routine has already found how much we can skip at the
4139        head.  */
4140     *beg += coding->heading_ascii;
4141
4142   if (str)
4143     {
4144       begp_orig = begp = str + *beg;
4145       endp_orig = endp = str + *end;
4146     }
4147   else
4148     {
4149       begp_orig = begp = BYTE_POS_ADDR (*beg);
4150       endp_orig = endp = begp + *end - *beg;
4151     }
4152
4153   eol_conversion = (coding->eol_type == CODING_EOL_CR
4154                     || coding->eol_type == CODING_EOL_CRLF);
4155
4156   switch (coding->type)
4157     {
4158     case coding_type_sjis:
4159     case coding_type_big5:
4160       /* We can skip all ASCII characters at the head.  */
4161       if (coding->heading_ascii < 0)
4162         {
4163           if (eol_conversion)
4164             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4165           else
4166             while (begp < endp && *begp < 0x80) begp++;
4167         }
4168       /* We can skip all ASCII characters at the tail except for the
4169          second byte of SJIS or BIG5 code.  */
4170       if (eol_conversion)
4171         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4172       else
4173         while (begp < endp && endp[-1] < 0x80) endp--;
4174       /* Do not consider LF as ascii if preceded by CR, since that
4175          confuses eol decoding. */
4176       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4177         endp++;
4178       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4179         endp++;
4180       break;
4181
4182     case coding_type_iso2022:
4183       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4184         /* We can't skip any data.  */
4185         break;
4186       if (coding->heading_ascii < 0)
4187         {
4188           /* We can skip all ASCII characters at the head except for a
4189              few control codes.  */
4190           while (begp < endp && (c = *begp) < 0x80
4191                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4192                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4193                  && (!eol_conversion || c != ISO_CODE_LF))
4194             begp++;
4195         }
4196       switch (coding->category_idx)
4197         {
4198         case CODING_CATEGORY_IDX_ISO_8_1:
4199         case CODING_CATEGORY_IDX_ISO_8_2:
4200           /* We can skip all ASCII characters at the tail.  */
4201           if (eol_conversion)
4202             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4203           else
4204             while (begp < endp && endp[-1] < 0x80) endp--;
4205           /* Do not consider LF as ascii if preceded by CR, since that
4206              confuses eol decoding. */
4207           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4208             endp++;
4209           break;
4210
4211         case CODING_CATEGORY_IDX_ISO_7:
4212         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4213           {
4214             /* We can skip all charactes at the tail except for 8-bit
4215                codes and ESC and the following 2-byte at the tail.  */
4216             unsigned char *eight_bit = NULL;
4217
4218             if (eol_conversion)
4219               while (begp < endp
4220                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4221                 {
4222                   if (!eight_bit && c & 0x80) eight_bit = endp;
4223                   endp--;
4224                 }
4225             else
4226               while (begp < endp
4227                      && (c = endp[-1]) != ISO_CODE_ESC)
4228                 {
4229                   if (!eight_bit && c & 0x80) eight_bit = endp;
4230                   endp--;
4231                 }
4232             /* Do not consider LF as ascii if preceded by CR, since that
4233                confuses eol decoding. */
4234             if (begp < endp && endp < endp_orig
4235                 && endp[-1] == '\r' && endp[0] == '\n')
4236               endp++;
4237             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4238               {
4239                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4240                   /* This is an ASCII designation sequence.  We can
4241                      surely skip the tail.  But, if we have
4242                      encountered an 8-bit code, skip only the codes
4243                      after that.  */
4244                   endp = eight_bit ? eight_bit : endp + 2;
4245                 else
4246                   /* Hmmm, we can't skip the tail.  */
4247                   endp = endp_orig;
4248               }
4249             else if (eight_bit)
4250               endp = eight_bit;
4251           }
4252         }
4253       break;
4254
4255     default:
4256       abort ();
4257     }
4258   *beg += begp - begp_orig;
4259   *end += endp - endp_orig;
4260   return;
4261 }
4262
4263 /* Like shrink_decoding_region but for encoding.  */
4264
4265 static void
4266 shrink_encoding_region (beg, end, coding, str)
4267      int *beg, *end;
4268      struct coding_system *coding;
4269      unsigned char *str;
4270 {
4271   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4272   int eol_conversion;
4273   Lisp_Object translation_table;
4274
4275   if (coding->type == coding_type_ccl
4276       || coding->eol_type == CODING_EOL_CRLF
4277       || coding->eol_type == CODING_EOL_CR
4278       || coding->cmp_data && coding->cmp_data->used > 0)
4279     {
4280       /* We can't skip any data.  */
4281       return;
4282     }
4283   if (coding->type == coding_type_no_conversion
4284       || coding->type == coding_type_raw_text
4285       || coding->type == coding_type_emacs_mule
4286       || coding->type == coding_type_undecided)
4287     {
4288       /* We need no conversion, but don't have to skip any data here.
4289          Encoding routine handles them effectively anyway.  */
4290       return;
4291     }
4292
4293   translation_table = coding->translation_table_for_encode;
4294   if (NILP (translation_table) && !NILP (Venable_character_translation))
4295     translation_table = Vstandard_translation_table_for_encode;
4296   if (CHAR_TABLE_P (translation_table))
4297     {
4298       int i;
4299       for (i = 0; i < 128; i++)
4300         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4301           break;
4302       if (i < 128)
4303         /* Some ASCII character should be tranlsated.  We give up
4304            shrinking.  */
4305         return;
4306     }
4307
4308   if (str)
4309     {
4310       begp_orig = begp = str + *beg;
4311       endp_orig = endp = str + *end;
4312     }
4313   else
4314     {
4315       begp_orig = begp = BYTE_POS_ADDR (*beg);
4316       endp_orig = endp = begp + *end - *beg;
4317     }
4318
4319   eol_conversion = (coding->eol_type == CODING_EOL_CR
4320                     || coding->eol_type == CODING_EOL_CRLF);
4321
4322   /* Here, we don't have to check coding->pre_write_conversion because
4323      the caller is expected to have handled it already.  */
4324   switch (coding->type)
4325     {
4326     case coding_type_iso2022:
4327       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4328         /* We can't skip any data.  */
4329         break;
4330       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4331         {
4332           unsigned char *bol = begp;
4333           while (begp < endp && *begp < 0x80)
4334             {
4335               begp++;
4336               if (begp[-1] == '\n')
4337                 bol = begp;
4338             }
4339           begp = bol;
4340           goto label_skip_tail;
4341         }
4342       /* fall down ... */
4343
4344     case coding_type_sjis:
4345     case coding_type_big5:
4346       /* We can skip all ASCII characters at the head and tail.  */
4347       if (eol_conversion)
4348         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4349       else
4350         while (begp < endp && *begp < 0x80) begp++;
4351     label_skip_tail:
4352       if (eol_conversion)
4353         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4354       else
4355         while (begp < endp && *(endp - 1) < 0x80) endp--;
4356       break;
4357
4358     default:
4359       abort ();
4360     }
4361
4362   *beg += begp - begp_orig;
4363   *end += endp - endp_orig;
4364   return;
4365 }
4366
/* Shrinking the conversion region has some overhead of its own, so
   it is not attempted unless the length of the region is greater
   than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region from *BEG to *END that is to be converted by
   CODING, with shrink_encoding_region if ENCODEP is nonzero and with
   shrink_decoding_region otherwise.  STR is passed on to them.
   Nothing is done for a region not longer than
   shrink_conversion_region_threshhold.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
4380
4381 static Lisp_Object
4382 code_convert_region_unwind (dummy)
4383      Lisp_Object dummy;
4384 {
4385   inhibit_pre_post_conversion = 0;
4386   return Qnil;
4387 }
4388
4389 /* Store information about all compositions in the range FROM and TO
4390    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
4391    buffer or a string, defaults to the current buffer.  */
4392
4393 void
4394 coding_save_composition (coding, from, to, obj)
4395      struct coding_system *coding;
4396      int from, to;
4397      Lisp_Object obj;
4398 {
4399   Lisp_Object prop;
4400   int start, end;
4401
4402   if (coding->composing == COMPOSITION_DISABLED)
4403     return;
4404   if (!coding->cmp_data)
4405     coding_allocate_composition_data (coding, from);
4406   if (!find_composition (from, to, &start, &end, &prop, obj)
4407       || end > to)
4408     return;
4409   if (start < from
4410       && (!find_composition (end, to, &start, &end, &prop, obj)
4411           || end > to))
4412     return;
4413   coding->composing = COMPOSITION_NO;
4414   do
4415     {
4416       if (COMPOSITION_VALID_P (start, end, prop))
4417         {
4418           enum composition_method method = COMPOSITION_METHOD (prop);
4419           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4420               >= COMPOSITION_DATA_SIZE)
4421             coding_allocate_composition_data (coding, from);
4422           /* For relative composition, we remember start and end
4423              positions, for the other compositions, we also remember
4424              components.  */
4425           CODING_ADD_COMPOSITION_START (coding, start - from, method);
4426           if (method != COMPOSITION_RELATIVE)
4427             {
4428               /* We must store a*/
4429               Lisp_Object val, ch;
4430
4431               val = COMPOSITION_COMPONENTS (prop);
4432               if (CONSP (val))
4433                 while (CONSP (val))
4434                   {
4435                     ch = XCAR (val), val = XCDR (val);
4436                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4437                   }
4438               else if (VECTORP (val) || STRINGP (val))
4439                 {
4440                   int len = (VECTORP (val)
4441                              ? XVECTOR (val)->size : XSTRING (val)->size);
4442                   int i;
4443                   for (i = 0; i < len; i++)
4444                     {
4445                       ch = (STRINGP (val)
4446                             ? Faref (val, make_number (i))
4447                             : XVECTOR (val)->contents[i]);
4448                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4449                     }
4450                 }
4451               else              /* INTEGERP (val) */
4452                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4453             }
4454           CODING_ADD_COMPOSITION_END (coding, end - from);
4455         }
4456       start = end;
4457     }
4458   while (start < to
4459          && find_composition (start, to, &start, &end, &prop, obj)
4460          && end <= to);
4461
4462   /* Make coding->cmp_data point to the first memory block.  */
4463   while (coding->cmp_data->prev)
4464     coding->cmp_data = coding->cmp_data->prev;
4465   coding->cmp_data_start = 0;
4466 }
4467
4468 /* Reflect the saved information about compositions to OBJ.
4469    CODING->cmp_data points to a memory block for the informaiton.  OBJ
4470    is a buffer or a string, defaults to the current buffer.  */
4471
4472 void
4473 coding_restore_composition (coding, obj)
4474      struct coding_system *coding;
4475      Lisp_Object obj;
4476 {
4477   struct composition_data *cmp_data = coding->cmp_data;
4478
4479   if (!cmp_data)
4480     return;
4481
4482   while (cmp_data->prev)
4483     cmp_data = cmp_data->prev;
4484
4485   while (cmp_data)
4486     {
4487       int i;
4488
4489       for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4490         {
4491           int *data = cmp_data->data + i;
4492           enum composition_method method = (enum composition_method) data[3];
4493           Lisp_Object components;
4494
4495           if (method == COMPOSITION_RELATIVE)
4496             components = Qnil;
4497           else
4498             {
4499               int len = data[0] - 4, j;
4500               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4501
4502               for (j = 0; j < len; j++)
4503                 args[j] = make_number (data[4 + j]);
4504               components = (method == COMPOSITION_WITH_ALTCHARS
4505                             ? Fstring (len, args) : Fvector (len, args));
4506             }
4507           compose_text (data[1], data[2], components, Qnil, obj);
4508         }
4509       cmp_data = cmp_data->next;
4510     }
4511 }
4512
4513 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4514    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4515    coding system CODING, and return the status code of code conversion
4516    (currently, this value has no meaning).
4517
4518    How many characters (and bytes) are converted to how many
4519    characters (and bytes) are recorded in members of the structure
4520    CODING.
4521
4522    If REPLACE is nonzero, we do various things as if the original text
4523    is deleted and a new text is inserted.  See the comments in
4524    replace_range (insdel.c) to know what we are doing.
4525
4526    If REPLACE is zero, it is assumed that the source text is unibyte.
4527    Otherwise, it is assumed to be multibyte iff the current buffer is.  */
4528
4529 int
4530 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4531      int from, from_byte, to, to_byte, encodep, replace;
4532      struct coding_system *coding;
4533 {
4534   int len = to - from, len_byte = to_byte - from_byte;
4535   int require, inserted, inserted_byte;
4536   int head_skip, tail_skip, total_skip = 0;
4537   Lisp_Object saved_coding_symbol;
4538   int first = 1;
4539   unsigned char *src, *dst;
4540   Lisp_Object deletion;
4541   int orig_point = PT, orig_len = len;
4542   int prev_Z;
4543   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4544
4545   coding->src_multibyte = replace && multibyte_p;
4546   coding->dst_multibyte = multibyte_p;
4547
4548   deletion = Qnil;
4549   saved_coding_symbol = Qnil;
4550
       /* If point is strictly inside the region, move it to FROM and
          remember that position as the one to restore at the end.  */
4551   if (from < PT && PT < to)
4552     {
4553       TEMP_SET_PT_BOTH (from, from_byte);
4554       orig_point = from;
4555     }
4556
4557   if (replace)
4558     {
4559       int saved_from = from;
4560
4561       prepare_to_modify_buffer (from, to, &from);
           /* prepare_to_modify_buffer may have changed FROM; if so,
              recompute the other positions from it.  */
4562       if (saved_from != from)
4563         {
4564           to = from + len;
4565           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4566           len_byte = to_byte - from_byte;
4567         }
4568     }
4569
4570   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4571     {
4572       /* We must detect encoding of text and eol format.  */
4573
4574       if (from < GPT && to > GPT)
4575         move_gap_both (from, from_byte);
4576       if (coding->type == coding_type_undecided)
4577         {
4578           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4579           if (coding->type == coding_type_undecided)
4580             /* It seems that the text contains only ASCII, but we
4581                should not leave it undecided because the deeper
4582                decoding routine (decode_coding) tries to detect the
4583                encodings again in vain.  */
4584             coding->type = coding_type_emacs_mule;
4585         }
4586       if (coding->eol_type == CODING_EOL_UNDECIDED)
4587         {
4588           saved_coding_symbol = coding->symbol;
4589           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4590           if (coding->eol_type == CODING_EOL_UNDECIDED)
4591             coding->eol_type = CODING_EOL_LF;
4592           /* We had better recover the original eol format if we
4593              encounter an inconsistent eol format while decoding.  */
4594           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4595         }
4596     }
4597
4598   /* Now we convert the text.  */
4599
4600   /* For encoding, we must process pre-write-conversion in advance.  */
4601   if (! inhibit_pre_post_conversion
4602       && encodep
4603       && SYMBOLP (coding->pre_write_conversion)
4604       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4605     {
4606       /* The function in pre-write-conversion may put a new text in a
4607          new buffer.  */
4608       struct buffer *prev = current_buffer;
4609       Lisp_Object new;
4610       int count = specpdl_ptr - specpdl;
4611
4612       record_unwind_protect (code_convert_region_unwind, Qnil);
4613       /* We should not call any more pre-write/post-read-conversion
4614          functions while this pre-write-conversion is running.  */
4615       inhibit_pre_post_conversion = 1;
4616       call2 (coding->pre_write_conversion,
4617              make_number (from), make_number (to));
4618       inhibit_pre_post_conversion = 0;
4619       /* Discard the unwind protect.  */
4620       specpdl_ptr--;
4621
4622       if (current_buffer != prev)
4623         {
               /* The function left us in another buffer.  Replace the
                  region in the original buffer with the accessible
                  text of that buffer, kill that buffer, and recompute
                  all positions for the new text.  */
4624           len = ZV - BEGV;
4625           new = Fcurrent_buffer ();
4626           set_buffer_internal_1 (prev);
4627           del_range_2 (from, from_byte, to, to_byte, 0);
4628           TEMP_SET_PT_BOTH (from, from_byte);
4629           insert_from_buffer (XBUFFER (new), 1, len, 0);
4630           Fkill_buffer (new);
4631           if (orig_point >= to)
4632             orig_point += len - orig_len;
4633           else if (orig_point > from)
4634             orig_point = from;
4635           orig_len = len;
4636           to = from + len;
4637           from_byte = CHAR_TO_BYTE (from);
4638           to_byte = CHAR_TO_BYTE (to);
4639           len_byte = to_byte - from_byte;
4640           TEMP_SET_PT_BOTH (from, from_byte);
4641         }
4642     }
4643
       /* Keep a copy of the text being replaced; it is handed to
          adjust_after_replace below.  */
4644   if (replace)
4645     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4646
4647   if (coding->composing != COMPOSITION_DISABLED)
4648     {
4649       if (encodep)
4650         coding_save_composition (coding, from, to, Fcurrent_buffer ());
4651       else
4652         coding_allocate_composition_data (coding, from);
4653     }
4654
4655   /* Try to skip the heading and trailing ASCIIs.  */
4656   {
4657     int from_byte_orig = from_byte, to_byte_orig = to_byte;
4658
4659     if (from < GPT && GPT < to)
4660       move_gap_both (from, from_byte);
4661     SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4662     if (from_byte == to_byte
4663         && (encodep || NILP (coding->post_read_conversion))
4664         && ! CODING_REQUIRE_FLUSHING (coding))
4665       {
             /* Nothing is left to convert after the shrinking.
                NOTE(review): this return does not call
                coding_free_composition_data, unlike the normal exit
                below; confirm whether composition data set up above
                is leaked here.  */
4666         coding->produced = len_byte;
4667         coding->produced_char = len;
4668         if (!replace)
4669           /* We must record and adjust for this new text now.  */
4670           adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4671         return 0;
4672       }
4673
         /* The skipped bytes are added to and subtracted from the
            character positions as well, i.e. each skipped byte is
            counted as one character.  */
4674     head_skip = from_byte - from_byte_orig;
4675     tail_skip = to_byte_orig - to_byte;
4676     total_skip = head_skip + tail_skip;
4677     from += head_skip;
4678     to -= tail_skip;
4679     len -= total_skip; len_byte -= total_skip;
4680   }
4681
4682   /* The code conversion routine can not preserve text properties for
4683      now.  So, we must remove all text properties in the region.
4684      Here, we must suppress all modification hooks.  */
4685   if (replace)
4686     {
4687       int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4688       inhibit_modification_hooks = 1;
4689       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4690       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4691     }
4692
4693   /* For conversion, we must put the gap before the text in addition to
4694      making the gap larger for efficient decoding.  The required gap
4695      size starts from 2000 which is the magic number used in make_gap.
4696      But, after one batch of conversion, it will be incremented if we
4697      find that it is not enough.  */
4698   require = 2000;
4699
4700   if (GAP_SIZE  < require)
4701     make_gap (require - GAP_SIZE);
4702   move_gap_both (from, from_byte);
4703
4704   inserted = inserted_byte = 0;
4705
       /* Temporarily take the source text out of the buffer contents by
          counting it as the tail of the gap (see the memory diagrams in
          the loop below).  */
4706   GAP_SIZE += len_byte;
4707   ZV -= len;
4708   Z -= len;
4709   ZV_BYTE -= len_byte;
4710   Z_BYTE -= len_byte;
4711
4712   if (GPT - BEG < BEG_UNCHANGED)
4713     BEG_UNCHANGED = GPT - BEG;
4714   if (Z - GPT < END_UNCHANGED)
4715     END_UNCHANGED = Z - GPT;
4716
4717   if (!encodep && coding->src_multibyte)
4718     {
4719       /* Decoding routines expects that the source text is unibyte.
4720          We must convert 8-bit characters of multibyte form to
4721          unibyte.  */
4722       int len_byte_orig = len_byte;
4723       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
           /* If the text got shorter, move it so that it still ends at
              the end of the gap.  */
4724       if (len_byte < len_byte_orig)
4725         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4726                     len_byte);
4727       coding->src_multibyte = 0;
4728     }
4729
4730   for (;;)
4731     {
4732       int result;
4733
4734       /* The buffer memory is now:
4735          +--------+converted-text+---------+-------original-text-------+---+
4736          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4737                   |<---------------------- GAP ----------------------->|  */
4738       src = GAP_END_ADDR - len_byte;
4739       dst = GPT_ADDR + inserted_byte;
4740
4741       if (encodep)
4742         result = encode_coding (coding, src, dst, len_byte, 0);
4743       else
4744         result = decode_coding (coding, src, dst, len_byte, 0);
4745
4746       /* The buffer memory is now:
4747          +--------+-------converted-text----+--+------original-text----+---+
4748          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4749                   |<---------------------- GAP ----------------------->|  */
4750
4751       inserted += coding->produced_char;
4752       inserted_byte += coding->produced;
4753       len_byte -= coding->consumed;
4754
4755       if (result == CODING_FINISH_INSUFFICIENT_CMP)
4756         {
               /* Get more room for composition data and go on
                  converting the rest.  */
4757           coding_allocate_composition_data (coding, from + inserted);
4758           continue;
4759         }
4760
4761       src += coding->consumed;
4762       dst += coding->produced;
4763
4764       if (result == CODING_FINISH_NORMAL)
4765         {
4766           src += len_byte;
4767           break;
4768         }
4769       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4770         {
4771           unsigned char *pend = dst, *p = pend - inserted_byte;
4772           Lisp_Object eol_type;
4773
4774           /* Encode LFs back to the original eol format (CR or CRLF).  */
4775           if (coding->eol_type == CODING_EOL_CR)
4776             {
4777               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4778             }
4779           else
4780             {
4781               int count = 0;
4782
                   /* Each LF needs one more byte for the CR.  */
4783               while (p < pend) if (*p++ == '\n') count++;
4784               if (src - dst < count)
4785                 {
4786                   /* We don't have sufficient room for encoding LFs
4787                      back to CRLF.  We must record converted and
4788                      not-yet-converted text back to the buffer
4789                      content, enlarge the gap, then record them out of
4790                      the buffer contents again.  */
4791                   int add = len_byte + inserted_byte;
4792
4793                   GAP_SIZE -= add;
4794                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4795                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
4796                   make_gap (count - GAP_SIZE);
4797                   GAP_SIZE += add;
4798                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4799                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4800                   /* Don't forget to update SRC, DST, and PEND.  */
4801                   src = GAP_END_ADDR - len_byte;
4802                   dst = GPT_ADDR + inserted_byte;
4803                   pend = dst;
4804                 }
4805               inserted += count;
4806               inserted_byte += count;
4807               coding->produced += count;
4808               p = dst = pend + count;
                   /* Copy the converted text backward from its end,
                      putting a CR before each LF, until all COUNT LFs
                      are done.  */
4809               while (count)
4810                 {
4811                   *--p = *--pend;
4812                   if (*p == '\n') count--, *--p = '\r';
4813                 }
4814             }
4815
4816           /* Suppress eol-format conversion in the further conversion.  */
4817           coding->eol_type = CODING_EOL_LF;
4818
4819           /* Set the coding system symbol to that for Unix-like EOL.  */
4820           eol_type = Fget (saved_coding_symbol, Qeol_type);
4821           if (VECTORP (eol_type)
4822               && XVECTOR (eol_type)->size == 3
4823               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4824             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4825           else
4826             coding->symbol = saved_coding_symbol;
4827
4828           continue;
4829         }
4830       if (len_byte <= 0)
4831         {
               /* All source bytes are consumed.  A CCL coding system
                  gets one more iteration with CODING_MODE_LAST_BLOCK
                  set before we stop.  */
4832           if (coding->type != coding_type_ccl
4833               || coding->mode & CODING_MODE_LAST_BLOCK)
4834             break;
4835           coding->mode |= CODING_MODE_LAST_BLOCK;
4836           continue;
4837         }
4838       if (result == CODING_FINISH_INSUFFICIENT_SRC)
4839         {
4840           /* The source text ends in invalid codes.  Let's just
4841              make them valid buffer contents, and finish conversion.  */
4842           inserted += len_byte;
4843           inserted_byte += len_byte;
4844           while (len_byte--)
4845             *dst++ = *src++;
4846           break;
4847         }
4848       if (result == CODING_FINISH_INTERRUPT)
4849         {
4850           /* The conversion procedure was interrupted by a user.  */
4851           break;
4852         }
4853       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
4854       if (coding->consumed < 1)
4855         {
4856           /* It's quite strange to require more memory without
4857              consuming any bytes.  Perhaps CCL program bug.  */
4858           break;
4859         }
4860       if (first)
4861         {
4862           /* We have just done the first batch of conversion which was
4863              stopped because of insufficient gap.  Let's reconsider the
4864              required gap size (i.e. SRC - DST) now.
4865
4866              We have converted ORIG bytes (== coding->consumed) into
4867              NEW bytes (coding->produced).  To convert the remaining
4868              LEN bytes, we may need REQUIRE bytes of gap, where:
4869                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4870                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4871              Here, we are sure that NEW >= ORIG.  */
4872           float ratio = coding->produced - coding->consumed;
4873           ratio /= coding->consumed;
4874           require = len_byte * ratio;
4875           first = 0;
4876         }
4877       if ((src - dst) < (require + 2000))
4878         {
4879           /* See the comment above the previous call of make_gap.  */
4880           int add = len_byte + inserted_byte;
4881
4882           GAP_SIZE -= add;
4883           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4884           GPT += inserted_byte; GPT_BYTE += inserted_byte;
4885           make_gap (require + 2000);
4886           GAP_SIZE += add;
4887           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4888           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4889         }
4890     }
4891   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
4892
4893   if (encodep && coding->dst_multibyte)
4894     {
4895       /* The output is unibyte.  We must convert 8-bit characters to
4896          multibyte form.  */
4897       if (inserted_byte * 2 > GAP_SIZE)
4898         {
               /* Enlarge the gap with the converted text temporarily
                  counted as buffer contents, as in the loop above.  */
4899           GAP_SIZE -= inserted_byte;
4900           ZV += inserted_byte; Z += inserted_byte;
4901           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
4902           GPT += inserted_byte; GPT_BYTE += inserted_byte;
4903           make_gap (inserted_byte - GAP_SIZE);
4904           GAP_SIZE += inserted_byte;
4905           ZV -= inserted_byte; Z -= inserted_byte;
4906           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
4907           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4908         }
4909       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
4910     }
4911
4912   /* If we have shrunk the conversion area, adjust it now.  */
4913   if (total_skip > 0)
4914     {
           /* Put the skipped tail right after the converted text, then
              count the skipped head and tail as converted text too.  */
4915       if (tail_skip > 0)
4916         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4917       inserted += total_skip; inserted_byte += total_skip;
4918       GAP_SIZE += total_skip;
4919       GPT -= head_skip; GPT_BYTE -= head_skip;
4920       ZV -= total_skip; ZV_BYTE -= total_skip;
4921       Z -= total_skip; Z_BYTE -= total_skip;
4922       from -= head_skip; from_byte -= head_skip;
4923       to += tail_skip; to_byte += tail_skip;
4924     }
4925
       /* Take the number of inserted characters from the actual change
          of Z made by adjust_after_replace.  */
4926   prev_Z = Z;
4927   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4928   inserted = Z - prev_Z;
4929
4930   if (!encodep && coding->cmp_data && coding->cmp_data->used)
4931     coding_restore_composition (coding, Fcurrent_buffer ());
4932   coding_free_composition_data (coding);
4933
4934   if (! inhibit_pre_post_conversion
4935       && ! encodep && ! NILP (coding->post_read_conversion))
4936     {
4937       Lisp_Object val;
4938       int count = specpdl_ptr - specpdl;
4939
4940       if (from != PT)
4941         TEMP_SET_PT_BOTH (from, from_byte);
4942       prev_Z = Z;
4943       record_unwind_protect (code_convert_region_unwind, Qnil);
4944       /* We should not call any more pre-write/post-read-conversion
4945          functions while this post-read-conversion is running.  */
4946       inhibit_pre_post_conversion = 1;
4947       val = call1 (coding->post_read_conversion, make_number (inserted));
4948       inhibit_pre_post_conversion = 0;
4949       /* Discard the unwind protect.  */
4950       specpdl_ptr--;
4951       CHECK_NUMBER (val, 0);
           /* Account for any change of the buffer size made by the
              function.  */
4952       inserted += Z - prev_Z;
4953     }
4954
       /* Restore point: keep it at FROM if it was inside the original
          region, or shift it by the change in length if it was at or
          after the end of the region.  */
4955   if (orig_point >= from)
4956     {
4957       if (orig_point >= from + orig_len)
4958         orig_point += inserted - orig_len;
4959       else
4960         orig_point = from;
4961       TEMP_SET_PT (orig_point);
4962     }
4963
4964   if (replace)
4965     {
4966       signal_after_change (from, to - from, inserted);
4967       update_compositions (from, from + inserted, CHECK_BORDER);
4968     }
4969
       /* Record the totals for the whole region in CODING.  */
4970   {
4971     coding->consumed = to_byte - from_byte;
4972     coding->consumed_char = to - from;
4973     coding->produced = inserted_byte;
4974     coding->produced_char = inserted;
4975   }
4976
4977   return 0;
4978 }
4979
/* Run a conversion function of CODING on the text of STR and return
   the resulting text as a new string.  If ENCODEP is nonzero,
   CODING->pre_write_conversion is called with the beginning and end
   positions of the work buffer; otherwise CODING->post_read_conversion
   is called with the number of characters in the work buffer, with
   point at its beginning.

   The text is processed in the buffer " *code-converting-work*",
   whose multibyteness is set to that of STR.  The previously current
   buffer is restored by unbind_to on return.  */
4980 Lisp_Object
4981 run_pre_post_conversion_on_str (str, coding, encodep)
4982      Lisp_Object str;
4983      struct coding_system *coding;
4984      int encodep;
4985 {
4986   int count = specpdl_ptr - specpdl;
4987   struct gcpro gcpro1;
       /* NOTE(review): PREV is not referenced again in this function.  */
4988   struct buffer *prev = current_buffer;
4989   int multibyte = STRING_MULTIBYTE (str);
4990
       /* Both unwind forms are run by the unbind_to at the end.
          NOTE(review): code_convert_region_unwind presumably resets
          inhibit_pre_post_conversion after a non-local exit -- confirm
          against its definition.  */
4991   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4992   record_unwind_protect (code_convert_region_unwind, Qnil);
4993   GCPRO1 (str);
4994   temp_output_buffer_setup (" *code-converting-work*");
4995   set_buffer_internal (XBUFFER (Vstandard_output));
4996   /* We must insert the contents of STR as is without
4997      unibyte<->multibyte conversion.  For that, we adjust the
4998      multibyteness of the working buffer to that of STR.  */
4999   Ferase_buffer ();
5000   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5001   insert_from_string (str, 0, 0,
5002                       XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5003   UNGCPRO;
       /* Do not let the conversion function trigger other
          pre-write/post-read-conversion functions.  */
5004   inhibit_pre_post_conversion = 1;
5005   if (encodep)
5006     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5007   else
5008     {
5009       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5010       call1 (coding->post_read_conversion, make_number (Z - BEG));
5011     }
5012   inhibit_pre_post_conversion = 0;
       /* Take the whole text of whatever buffer is current now.  */
5013   str = make_buffer_string (BEG, Z, 0);
5014   return unbind_to (count, str);
5015 }
5016
5017 Lisp_Object
5018 decode_coding_string (str, coding, nocopy)
5019      Lisp_Object str;
5020      struct coding_system *coding;
5021      int nocopy;
5022 {
5023   int len;
5024   char *buf;
5025   int from, to, to_byte;
5026   struct gcpro gcpro1;
5027   Lisp_Object saved_coding_symbol;
5028   int result;
5029
5030   from = 0;
5031   to = XSTRING (str)->size;
5032   to_byte = STRING_BYTES (XSTRING (str));
5033
5034   saved_coding_symbol = Qnil;
5035   if (CODING_REQUIRE_DETECTION (coding))
5036     {
5037       /* See the comments in code_convert_region.  */
5038       if (coding->type == coding_type_undecided)
5039         {
5040           detect_coding (coding, XSTRING (str)->data, to_byte);
5041           if (coding->type == coding_type_undecided)
5042             coding->type = coding_type_emacs_mule;
5043         }
5044       if (coding->eol_type == CODING_EOL_UNDECIDED)
5045         {
5046           saved_coding_symbol = coding->symbol;
5047           detect_eol (coding, XSTRING (str)->data, to_byte);
5048           if (coding->eol_type == CODING_EOL_UNDECIDED)
5049             coding->eol_type = CODING_EOL_LF;
5050           /* We had better recover the original eol format if we
5051              encounter an inconsitent eol format while decoding.  */
5052           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5053         }
5054     }
5055
5056   if (! CODING_REQUIRE_DECODING (coding))
5057     {
5058       if (!STRING_MULTIBYTE (str))
5059         {
5060           str = Fstring_as_multibyte (str);
5061           nocopy = 1;
5062         }
5063       return (nocopy ? str : Fcopy_sequence (str));
5064     }
5065
5066   if (STRING_MULTIBYTE (str))
5067     {
5068       /* Decoding routines expect the source text to be unibyte.  */
5069       str = Fstring_as_unibyte (str);
5070       nocopy = 1;
5071       coding->src_multibyte = 0;
5072     }
5073   coding->dst_multibyte = 1;
5074
5075   if (coding->composing != COMPOSITION_DISABLED)
5076     coding_allocate_composition_data (coding, from);
5077
5078   /* Try to skip the heading and tailing ASCIIs.  */
5079   {
5080     int from_orig = from;
5081
5082     SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5083                               0);
5084     if (from == to_byte)
5085       return (nocopy ? str : Fcopy_sequence (str));
5086   }
5087
5088   len = decoding_buffer_size (coding, to_byte - from);
5089   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5090   GCPRO1 (str);
5091   buf = get_conversion_buffer (len);
5092   UNGCPRO;
5093
5094   if (from > 0)
5095     bcopy (XSTRING (str)->data, buf, from);
5096   result = decode_coding (coding, XSTRING (str)->data + from,
5097                          buf + from, to_byte - from, len);
5098   if (result == CODING_FINISH_INCONSISTENT_EOL)
5099     {
5100       /* We simply try to decode the whole string again but without
5101          eol-conversion this time.  */
5102       coding->eol_type = CODING_EOL_LF;
5103       coding->symbol = saved_coding_symbol;
5104       coding_free_composition_data (coding);
5105       return decode_coding_string (str, coding, nocopy);
5106     }
5107
5108   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5109          STRING_BYTES (XSTRING (str)) - to_byte);
5110
5111   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5112   str = make_multibyte_string (buf, len + coding->produced_char,
5113                                len + coding->produced);
5114
5115   if (coding->cmp_data && coding->cmp_data->used)
5116     coding_restore_composition (coding, str);
5117   coding_free_composition_data (coding);
5118
5119   if (SYMBOLP (coding->post_read_conversion)
5120       && !NILP (Ffboundp (coding->post_read_conversion)))
5121     str = run_pre_post_conversion_on_str (str, coding, 0);
5122
5123   return str;
5124 }
5125
5126 Lisp_Object
5127 encode_coding_string (str, coding, nocopy)
5128      Lisp_Object str;
5129      struct coding_system *coding;
5130      int nocopy;
5131 {
5132   int len;
5133   char *buf;
5134   int from, to, to_byte;
5135   struct gcpro gcpro1;
5136   Lisp_Object saved_coding_symbol;
5137   int result;
5138
5139   if (SYMBOLP (coding->pre_write_conversion)
5140       && !NILP (Ffboundp (coding->pre_write_conversion)))
5141     str = run_pre_post_conversion_on_str (str, coding, 1);
5142
5143   from = 0;
5144   to = XSTRING (str)->size;
5145   to_byte = STRING_BYTES (XSTRING (str));
5146
5147   saved_coding_symbol = Qnil;
5148   if (! CODING_REQUIRE_ENCODING (coding))
5149     {
5150       if (STRING_MULTIBYTE (str))
5151         {
5152           str = Fstring_as_unibyte (str);
5153           nocopy = 1;
5154         }
5155       return (nocopy ? str : Fcopy_sequence (str));
5156     }
5157
5158   /* Encoding routines determine the multibyteness of the source text
5159      by coding->src_multibyte.  */
5160   coding->src_multibyte = STRING_MULTIBYTE (str);
5161   coding->dst_multibyte = 0;
5162
5163   if (coding->composing != COMPOSITION_DISABLED)
5164     coding_save_composition (coding, from, to, str);
5165
5166   /* Try to skip the heading and tailing ASCIIs.  */
5167   {
5168     int from_orig = from;
5169
5170     SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5171                               1);
5172     if (from == to_byte)
5173       return (nocopy ? str : Fcopy_sequence (str));
5174   }
5175
5176   len = encoding_buffer_size (coding, to_byte - from);
5177   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5178   GCPRO1 (str);
5179   buf = get_conversion_buffer (len);
5180   UNGCPRO;
5181
5182   if (from > 0)
5183     bcopy (XSTRING (str)->data, buf, from);
5184   result = encode_coding (coding, XSTRING (str)->data + from,
5185                           buf + from, to_byte - from, len);
5186   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5187          STRING_BYTES (XSTRING (str)) - to_byte);
5188
5189   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5190   str = make_unibyte_string (buf, len + coding->produced);
5191   coding_free_composition_data (coding);
5192
5193   return str;
5194 }
5195
5196 \f
5197 #ifdef emacs
5198 /*** 8. Emacs Lisp library functions ***/
5199
5200 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5201   "Return t if OBJECT is nil or a coding-system.\n\
5202 See the documentation of `make-coding-system' for information\n\
5203 about coding-system objects.")
5204   (obj)
5205      Lisp_Object obj;
5206 {
5207   if (NILP (obj))
5208     return Qt;
5209   if (!SYMBOLP (obj))
5210     return Qnil;
5211   /* Get coding-spec vector for OBJ.  */
5212   obj = Fget (obj, Qcoding_system);
5213   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5214           ? Qt : Qnil);
5215 }
5216
5217 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5218        Sread_non_nil_coding_system, 1, 1, 0,
5219   "Read a coding system from the minibuffer, prompting with string PROMPT.")
5220   (prompt)
5221      Lisp_Object prompt;
5222 {
5223   Lisp_Object val;
5224   do
5225     {
5226       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5227                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5228     }
5229   while (XSTRING (val)->size == 0);
5230   return (Fintern (val, Qnil));
5231 }
5232
5233 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5234   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5235 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5236   (prompt, default_coding_system)
5237      Lisp_Object prompt, default_coding_system;
5238 {
5239   Lisp_Object val;
5240   if (SYMBOLP (default_coding_system))
5241     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5242   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5243                           Qt, Qnil, Qcoding_system_history,
5244                           default_coding_system, Qnil);
5245   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5246 }
5247
5248 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5249        1, 1, 0,
5250   "Check validity of CODING-SYSTEM.\n\
5251 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5252 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5253 The value of property should be a vector of length 5.")
5254   (coding_system)
5255      Lisp_Object coding_system;
5256 {
5257   CHECK_SYMBOL (coding_system, 0);
5258   if (!NILP (Fcoding_system_p (coding_system)))
5259     return coding_system;
5260   while (1)
5261     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5262 }
5263 \f
5264 Lisp_Object
5265 detect_coding_system (src, src_bytes, highest)
5266      unsigned char *src;
5267      int src_bytes, highest;
5268 {
5269   int coding_mask, eol_type;
5270   Lisp_Object val, tmp;
5271   int dummy;
5272
5273   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5274   eol_type  = detect_eol_type (src, src_bytes, &dummy);
5275   if (eol_type == CODING_EOL_INCONSISTENT)
5276     eol_type = CODING_EOL_UNDECIDED;
5277
5278   if (!coding_mask)
5279     {
5280       val = Qundecided;
5281       if (eol_type != CODING_EOL_UNDECIDED)
5282         {
5283           Lisp_Object val2;
5284           val2 = Fget (Qundecided, Qeol_type);
5285           if (VECTORP (val2))
5286             val = XVECTOR (val2)->contents[eol_type];
5287         }
5288       return (highest ? val : Fcons (val, Qnil));
5289     }
5290
5291   /* At first, gather possible coding systems in VAL.  */
5292   val = Qnil;
5293   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5294     {
5295       Lisp_Object category_val, category_index;
5296
5297       category_index = Fget (XCAR (tmp), Qcoding_category_index);
5298       category_val = Fsymbol_value (XCAR (tmp));
5299       if (!NILP (category_val)
5300           && NATNUMP (category_index)
5301           && (coding_mask & (1 << XFASTINT (category_index))))
5302         {
5303           val = Fcons (category_val, val);
5304           if (highest)
5305             break;
5306         }
5307     }
5308   if (!highest)
5309     val = Fnreverse (val);
5310
5311   /* Then, replace the elements with subsidiary coding systems.  */
5312   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5313     {
5314       if (eol_type != CODING_EOL_UNDECIDED
5315           && eol_type != CODING_EOL_INCONSISTENT)
5316         {
5317           Lisp_Object eol;
5318           eol = Fget (XCAR (tmp), Qeol_type);
5319           if (VECTORP (eol))
5320             XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5321         }
5322     }
5323   return (highest ? XCAR (val) : val);
5324 }
5325
5326 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5327        2, 3, 0,
5328   "Detect coding system of the text in the region between START and END.\n\
5329 Return a list of possible coding systems ordered by priority.\n\
5330 \n\
5331 If only ASCII characters are found, it returns a list of single element\n\
5332 `undecided' or its subsidiary coding system according to a detected\n\
5333 end-of-line format.\n\
5334 \n\
5335 If optional argument HIGHEST is non-nil, return the coding system of\n\
5336 highest priority.")
5337   (start, end, highest)
5338      Lisp_Object start, end, highest;
5339 {
5340   int from, to;
5341   int from_byte, to_byte;
5342
5343   CHECK_NUMBER_COERCE_MARKER (start, 0);
5344   CHECK_NUMBER_COERCE_MARKER (end, 1);
5345
5346   validate_region (&start, &end);
5347   from = XINT (start), to = XINT (end);
5348   from_byte = CHAR_TO_BYTE (from);
5349   to_byte = CHAR_TO_BYTE (to);
5350
5351   if (from < GPT && to >= GPT)
5352     move_gap_both (to, to_byte);
5353
5354   return detect_coding_system (BYTE_POS_ADDR (from_byte),
5355                                to_byte - from_byte,
5356                                !NILP (highest));
5357 }
5358
5359 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5360        1, 2, 0,
5361   "Detect coding system of the text in STRING.\n\
5362 Return a list of possible coding systems ordered by priority.\n\
5363 \n\
5364 If only ASCII characters are found, it returns a list of single element\n\
5365 `undecided' or its subsidiary coding system according to a detected\n\
5366 end-of-line format.\n\
5367 \n\
5368 If optional argument HIGHEST is non-nil, return the coding system of\n\
5369 highest priority.")
5370   (string, highest)
5371      Lisp_Object string, highest;
5372 {
5373   CHECK_STRING (string, 0);
5374
5375   return detect_coding_system (XSTRING (string)->data,
5376                                STRING_BYTES (XSTRING (string)),
5377                                !NILP (highest));
5378 }
5379
5380 Lisp_Object
5381 code_convert_region1 (start, end, coding_system, encodep)
5382      Lisp_Object start, end, coding_system;
5383      int encodep;
5384 {
5385   struct coding_system coding;
5386   int from, to, len;
5387
5388   CHECK_NUMBER_COERCE_MARKER (start, 0);
5389   CHECK_NUMBER_COERCE_MARKER (end, 1);
5390   CHECK_SYMBOL (coding_system, 2);
5391
5392   validate_region (&start, &end);
5393   from = XFASTINT (start);
5394   to = XFASTINT (end);
5395
5396   if (NILP (coding_system))
5397     return make_number (to - from);
5398
5399   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5400     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5401
5402   coding.mode |= CODING_MODE_LAST_BLOCK;
5403   coding.src_multibyte = coding.dst_multibyte
5404     = !NILP (current_buffer->enable_multibyte_characters);
5405   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5406                        &coding, encodep, 1);
5407   Vlast_coding_system_used = coding.symbol;
5408   return make_number (coding.produced_char);
5409 }
5410
/* Lisp entry point for decoding a buffer region.  All of the work is
   done by code_convert_region1, called here with ENCODEP set to 0.  */
DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
       3, 3, "r\nzCoding system: ",
  "Decode the current region by specified coding system.\n\
When called from a program, takes three arguments:\n\
START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
This function sets `last-coding-system-used' to the precise coding system\n\
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
not fully specified.)\n\
It returns the length of the decoded text.")
  (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  return code_convert_region1 (start, end, coding_system, 0);
}
5425
/* Lisp entry point for encoding a buffer region.  All of the work is
   done by code_convert_region1, called here with ENCODEP set to 1.  */
DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
       3, 3, "r\nzCoding system: ",
  "Encode the current region by specified coding system.\n\
When called from a program, takes three arguments:\n\
START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
This function sets `last-coding-system-used' to the precise coding system\n\
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
not fully specified.)\n\
It returns the length of the encoded text.")
  (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  return code_convert_region1 (start, end, coding_system, 1);
}
5440
5441 Lisp_Object
5442 code_convert_string1 (string, coding_system, nocopy, encodep)
5443      Lisp_Object string, coding_system, nocopy;
5444      int encodep;
5445 {
5446   struct coding_system coding;
5447
5448   CHECK_STRING (string, 0);
5449   CHECK_SYMBOL (coding_system, 1);
5450
5451   if (NILP (coding_system))
5452     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5453
5454   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5455     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5456
5457   coding.mode |= CODING_MODE_LAST_BLOCK;
5458   string = (encodep
5459             ? encode_coding_string (string, &coding, !NILP (nocopy))
5460             : decode_coding_string (string, &coding, !NILP (nocopy)));
5461   Vlast_coding_system_used = coding.symbol;
5462
5463   return string;
5464 }
5465
/* Lisp entry point for decoding a string.  All of the work is done by
   code_convert_string1, called here with ENCODEP set to 0.  */
DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
       2, 3, 0,
  "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
if the decoding operation is trivial.\n\
This function sets `last-coding-system-used' to the precise coding system\n\
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
not fully specified.)")
  (string, coding_system, nocopy)
     Lisp_Object string, coding_system, nocopy;
{
  return code_convert_string1 (string, coding_system, nocopy, 0);
}
5479
/* Lisp entry point for encoding a string.  All of the work is done by
   code_convert_string1, called here with ENCODEP set to 1.  */
DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
       2, 3, 0,
  "Encode STRING to CODING-SYSTEM, and return the result.\n\
Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
if the encoding operation is trivial.\n\
This function sets `last-coding-system-used' to the precise coding system\n\
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
not fully specified.)")
  (string, coding_system, nocopy)
     Lisp_Object string, coding_system, nocopy;
{
  return code_convert_string1 (string, coding_system, nocopy, 1);
}
5493
5494 /* Encode or decode STRING according to CODING_SYSTEM.
5495    Do not set Vlast_coding_system_used.
5496
5497    This function is called only from macros DECODE_FILE and
5498    ENCODE_FILE, thus we ignore character composition.  */
5499
5500 Lisp_Object
5501 code_convert_string_norecord (string, coding_system, encodep)
5502      Lisp_Object string, coding_system;
5503      int encodep;
5504 {
5505   struct coding_system coding;
5506
5507   CHECK_STRING (string, 0);
5508   CHECK_SYMBOL (coding_system, 1);
5509
5510   if (NILP (coding_system))
5511     return string;
5512
5513   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5514     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5515
5516   coding.composing = COMPOSITION_DISABLED;
5517   coding.mode |= CODING_MODE_LAST_BLOCK;
5518   return (encodep
5519           ? encode_coding_string (string, &coding, 1)
5520           : decode_coding_string (string, &coding, 1));
5521 }
5522 \f
5523 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5524   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5525 Return the corresponding character.")
5526   (code)
5527      Lisp_Object code;
5528 {
5529   unsigned char c1, c2, s1, s2;
5530   Lisp_Object val;
5531
5532   CHECK_NUMBER (code, 0);
5533   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5534   if (s1 == 0)
5535     {
5536       if (s2 < 0x80)
5537         XSETFASTINT (val, s2);
5538       else if (s2 >= 0xA0 || s2 <= 0xDF)
5539         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5540       else
5541         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5542     }
5543   else
5544     {
5545       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5546           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5547         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5548       DECODE_SJIS (s1, s2, c1, c2);
5549       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5550     }
5551   return val;
5552 }
5553
5554 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5555   "Encode a Japanese character CHAR to shift_jis encoding.\n\
5556 Return the corresponding code in SJIS.")
5557   (ch)
5558      Lisp_Object ch;
5559 {
5560   int charset, c1, c2, s1, s2;
5561   Lisp_Object val;
5562
5563   CHECK_NUMBER (ch, 0);
5564   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5565   if (charset == CHARSET_ASCII)
5566     {
5567       val = ch;
5568     }
5569   else if (charset == charset_jisx0208
5570            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5571     {
5572       ENCODE_SJIS (c1, c2, s1, s2);
5573       XSETFASTINT (val, (s1 << 8) | s2);
5574     }
5575   else if (charset == charset_katakana_jisx0201
5576            && c1 > 0x20 && c2 < 0xE0)
5577     {
5578       XSETFASTINT (val, c1 | 0x80);
5579     }
5580   else
5581     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5582   return val;
5583 }
5584
5585 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5586   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5587 Return the corresponding character.")
5588   (code)
5589      Lisp_Object code;
5590 {
5591   int charset;
5592   unsigned char b1, b2, c1, c2;
5593   Lisp_Object val;
5594
5595   CHECK_NUMBER (code, 0);
5596   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5597   if (b1 == 0)
5598     {
5599       if (b2 >= 0x80)
5600         error ("Invalid BIG5 code: %x", XFASTINT (code));
5601       val = code;
5602     }
5603   else
5604     {
5605       if ((b1 < 0xA1 || b1 > 0xFE)
5606           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5607         error ("Invalid BIG5 code: %x", XFASTINT (code));
5608       DECODE_BIG5 (b1, b2, charset, c1, c2);
5609       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5610     }
5611   return val;
5612 }
5613
5614 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5615   "Encode the Big5 character CHAR to BIG5 coding system.\n\
5616 Return the corresponding character code in Big5.")
5617   (ch)
5618      Lisp_Object ch;
5619 {
5620   int charset, c1, c2, b1, b2;
5621   Lisp_Object val;
5622
5623   CHECK_NUMBER (ch, 0);
5624   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5625   if (charset == CHARSET_ASCII)
5626     {
5627       val = ch;
5628     }
5629   else if ((charset == charset_big5_1
5630             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5631            || (charset == charset_big5_2
5632                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5633     {
5634       ENCODE_BIG5 (charset, c1, c2, b1, b2);
5635       XSETFASTINT (val, (b1 << 8) | b2);
5636     }
5637   else
5638     error ("Can't encode to Big5: %d", XFASTINT (ch));
5639   return val;
5640 }
5641 \f
5642 DEFUN ("set-terminal-coding-system-internal",
5643        Fset_terminal_coding_system_internal,
5644        Sset_terminal_coding_system_internal, 1, 1, 0, "")
5645   (coding_system)
5646      Lisp_Object coding_system;
5647 {
5648   CHECK_SYMBOL (coding_system, 0);
5649   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5650   /* We had better not send unsafe characters to terminal.  */
5651   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5652   /* Characer composition should be disabled.  */
5653   terminal_coding.composing = COMPOSITION_DISABLED;
5654   terminal_coding.src_multibyte = 1;
5655   terminal_coding.dst_multibyte = 0;
5656   return Qnil;
5657 }
5658
5659 DEFUN ("set-safe-terminal-coding-system-internal",
5660        Fset_safe_terminal_coding_system_internal,
5661        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5662   (coding_system)
5663      Lisp_Object coding_system;
5664 {
5665   CHECK_SYMBOL (coding_system, 0);
5666   setup_coding_system (Fcheck_coding_system (coding_system),
5667                        &safe_terminal_coding);
5668   /* Characer composition should be disabled.  */
5669   safe_terminal_coding.composing = COMPOSITION_DISABLED;
5670   safe_terminal_coding.src_multibyte = 1;
5671   safe_terminal_coding.dst_multibyte = 0;
5672   return Qnil;
5673 }
5674
/* Accessor: return the `symbol' member of the terminal_coding
   structure.  */
DEFUN ("terminal-coding-system",
       Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
  "Return coding system specified for terminal output.")
  ()
{
  return terminal_coding.symbol;
}
5682
5683 DEFUN ("set-keyboard-coding-system-internal",
5684        Fset_keyboard_coding_system_internal,
5685        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5686   (coding_system)
5687      Lisp_Object coding_system;
5688 {
5689   CHECK_SYMBOL (coding_system, 0);
5690   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5691   /* Characer composition should be disabled.  */
5692   keyboard_coding.composing = COMPOSITION_DISABLED;
5693   return Qnil;
5694 }
5695
/* Accessor: return the `symbol' member of the keyboard_coding
   structure.  */
DEFUN ("keyboard-coding-system",
       Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
  "Return coding system specified for decoding keyboard input.")
  ()
{
  return keyboard_coding.symbol;
}
5703
5704 \f
5705 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5706        Sfind_operation_coding_system,  1, MANY, 0,
5707   "Choose a coding system for an operation based on the target name.\n\
5708 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5709 DECODING-SYSTEM is the coding system to use for decoding\n\
5710 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5711 for encoding (in case OPERATION does encoding).\n\
5712 \n\
5713 The first argument OPERATION specifies an I/O primitive:\n\
5714   For file I/O, `insert-file-contents' or `write-region'.\n\
5715   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5716   For network I/O, `open-network-stream'.\n\
5717 \n\
5718 The remaining arguments should be the same arguments that were passed\n\
5719 to the primitive.  Depending on which primitive, one of those arguments\n\
5720 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
5721 whichever argument specifies the file name is TARGET.\n\
5722 \n\
5723 TARGET has a meaning which depends on OPERATION:\n\
5724   For file I/O, TARGET is a file name.\n\
5725   For process I/O, TARGET is a process name.\n\
5726   For network I/O, TARGET is a service name or a port number\n\
5727 \n\
5728 This function looks up what specified for TARGET in,\n\
5729 `file-coding-system-alist', `process-coding-system-alist',\n\
5730 or `network-coding-system-alist' depending on OPERATION.\n\
5731 They may specify a coding system, a cons of coding systems,\n\
5732 or a function symbol to call.\n\
5733 In the last case, we call the function with one argument,\n\
5734 which is a list of all the arguments given to this function.")
5735   (nargs, args)
5736      int nargs;
5737      Lisp_Object *args;
5738 {
5739   Lisp_Object operation, target_idx, target, val;
5740   register Lisp_Object chain;
5741
5742   if (nargs < 2)
5743     error ("Too few arguments");
5744   operation = args[0];
5745   if (!SYMBOLP (operation)
5746       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5747     error ("Invalid first arguement");
5748   if (nargs < 1 + XINT (target_idx))
5749     error ("Too few arguments for operation: %s",
5750            XSYMBOL (operation)->name->data);
5751   target = args[XINT (target_idx) + 1];
5752   if (!(STRINGP (target)
5753         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5754     error ("Invalid %dth argument", XINT (target_idx) + 1);
5755
5756   chain = ((EQ (operation, Qinsert_file_contents)
5757             || EQ (operation, Qwrite_region))
5758            ? Vfile_coding_system_alist
5759            : (EQ (operation, Qopen_network_stream)
5760               ? Vnetwork_coding_system_alist
5761               : Vprocess_coding_system_alist));
5762   if (NILP (chain))
5763     return Qnil;
5764
5765   for (; CONSP (chain); chain = XCDR (chain))
5766     {
5767       Lisp_Object elt;
5768       elt = XCAR (chain);
5769
5770       if (CONSP (elt)
5771           && ((STRINGP (target)
5772                && STRINGP (XCAR (elt))
5773                && fast_string_match (XCAR (elt), target) >= 0)
5774               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5775         {
5776           val = XCDR (elt);
5777           /* Here, if VAL is both a valid coding system and a valid
5778              function symbol, we return VAL as a coding system.  */
5779           if (CONSP (val))
5780             return val;
5781           if (! SYMBOLP (val))
5782             return Qnil;
5783           if (! NILP (Fcoding_system_p (val)))
5784             return Fcons (val, val);
5785           if (! NILP (Ffboundp (val)))
5786             {
5787               val = call1 (val, Flist (nargs, args));
5788               if (CONSP (val))
5789                 return val;
5790               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5791                 return Fcons (val, val);
5792             }
5793           return Qnil;
5794         }
5795     }
5796   return Qnil;
5797 }
5798
5799 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
5800        Supdate_coding_systems_internal, 0, 0, 0,
5801   "Update internal database for ISO2022 and CCL based coding systems.\n\
5802 When values of any coding categories are changed, you must\n\
5803 call this function")
5804   ()
5805 {
5806   int i;
5807
5808   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5809     {
5810       Lisp_Object val;
5811
5812       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5813       if (!NILP (val))
5814         {
5815           if (! coding_system_table[i])
5816             coding_system_table[i] = ((struct coding_system *)
5817                                       xmalloc (sizeof (struct coding_system)));
5818           setup_coding_system (val, coding_system_table[i]);
5819         }
5820       else if (coding_system_table[i])
5821         {
5822           xfree (coding_system_table[i]);
5823           coding_system_table[i] = NULL;
5824         }
5825     }
5826
5827   return Qnil;
5828 }
5829
5830 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5831        Sset_coding_priority_internal, 0, 0, 0,
5832   "Update internal database for the current value of `coding-category-list'.\n\
5833 This function is internal use only.")
5834   ()
5835 {
5836   int i = 0, idx;
5837   Lisp_Object val;
5838
5839   val = Vcoding_category_list;
5840
5841   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5842     {
5843       if (! SYMBOLP (XCAR (val)))
5844         break;
5845       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5846       if (idx >= CODING_CATEGORY_IDX_MAX)
5847         break;
5848       coding_priorities[i++] = (1 << idx);
5849       val = XCDR (val);
5850     }
5851   /* If coding-category-list is valid and contains all coding
5852      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
5853      the following code saves Emacs from crashing.  */
5854   while (i < CODING_CATEGORY_IDX_MAX)
5855     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5856
5857   return Qnil;
5858 }
5859
5860 #endif /* emacs */
5861
5862 \f
5863 /*** 9. Post-amble ***/
5864
/* Allocate conversion_buffer with an initial size of
   MINIMUM_CONVERSION_BUFFER_SIZE bytes.  */
void
init_coding ()
{
  conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
}
5870
/* Initialize the static state of this file: the byte-class tables
   emacs_code_class and iso_code_class, the conversion buffer size,
   the default coding structures, coding_system_table,
   ascii_skip_code, system_eol_type and inhibit_pre_post_conversion.  */
void
init_coding_once ()
{
  int i;

  /* Emacs' internal format specific initialize routine.  */
  /* Broad byte ranges are filled first and individual entries are
     overridden afterwards, so the order of these statements matters.  */
  for (i = 0; i <= 0x20; i++)
    emacs_code_class[i] = EMACS_control_code;
  emacs_code_class[0x0A] = EMACS_linefeed_code;
  emacs_code_class[0x0D] = EMACS_carriage_return_code;
  for (i = 0x21 ; i < 0x7F; i++)
    emacs_code_class[i] = EMACS_ascii_code;
  emacs_code_class[0x7F] = EMACS_control_code;
  /* NOTE(review): this loop stops at 0xFE, so entry 0xFF is never
     assigned in this function -- confirm that is intended.  */
  for (i = 0x80; i < 0xFF; i++)
    emacs_code_class[i] = EMACS_invalid_code;
  emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
  emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
  emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
  emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;

  /* ISO2022 specific initialize routine.  */
  /* The four range loops deliberately skip 0x20, 0x7F, 0xA0 and 0xFF,
     which get their own classes just below; after that, specific
     control codes override the range defaults.  */
  for (i = 0; i < 0x20; i++)
    iso_code_class[i] = ISO_control_0;
  for (i = 0x21; i < 0x7F; i++)
    iso_code_class[i] = ISO_graphic_plane_0;
  for (i = 0x80; i < 0xA0; i++)
    iso_code_class[i] = ISO_control_1;
  for (i = 0xA1; i < 0xFF; i++)
    iso_code_class[i] = ISO_graphic_plane_1;
  iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
  iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
  iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
  iso_code_class[ISO_CODE_SO] = ISO_shift_out;
  iso_code_class[ISO_CODE_SI] = ISO_shift_in;
  iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
  iso_code_class[ISO_CODE_ESC] = ISO_escape;
  iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
  iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
  iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;

  conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;

  /* Set up each global coding structure from the nil coding system.  */
  setup_coding_system (Qnil, &keyboard_coding);
  setup_coding_system (Qnil, &terminal_coding);
  setup_coding_system (Qnil, &safe_terminal_coding);
  setup_coding_system (Qnil, &default_buffer_file_coding);

  /* Start with no per-category coding structures.  */
  bzero (coding_system_table, sizeof coding_system_table);

  /* Flag the 128 ASCII byte values (0..127); every other entry of
     ascii_skip_code stays zero.  */
  bzero (ascii_skip_code, sizeof ascii_skip_code);
  for (i = 0; i < 128; i++)
    ascii_skip_code[i] = 1;

  /* The system end-of-line convention is CRLF on MSDOS and WINDOWSNT
     builds, LF everywhere else.  */
#if defined (MSDOS) || defined (WINDOWSNT)
  system_eol_type = CODING_EOL_CRLF;
#else
  system_eol_type = CODING_EOL_LF;
#endif

  inhibit_pre_post_conversion = 0;
}
5932
5933 #ifdef emacs
5934
5935 void
5936 syms_of_coding ()
5937 {
5938   Qtarget_idx = intern ("target-idx");
5939   staticpro (&Qtarget_idx);
5940
5941   Qcoding_system_history = intern ("coding-system-history");
5942   staticpro (&Qcoding_system_history);
5943   Fset (Qcoding_system_history, Qnil);
5944
5945   /* Target FILENAME is the first argument.  */
5946   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5947   /* Target FILENAME is the third argument.  */
5948   Fput (Qwrite_region, Qtarget_idx, make_number (2));
5949
5950   Qcall_process = intern ("call-process");
5951   staticpro (&Qcall_process);
5952   /* Target PROGRAM is the first argument.  */
5953   Fput (Qcall_process, Qtarget_idx, make_number (0));
5954
5955   Qcall_process_region = intern ("call-process-region");
5956   staticpro (&Qcall_process_region);
5957   /* Target PROGRAM is the third argument.  */
5958   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5959
5960   Qstart_process = intern ("start-process");
5961   staticpro (&Qstart_process);
5962   /* Target PROGRAM is the third argument.  */
5963   Fput (Qstart_process, Qtarget_idx, make_number (2));
5964
5965   Qopen_network_stream = intern ("open-network-stream");
5966   staticpro (&Qopen_network_stream);
5967   /* Target SERVICE is the fourth argument.  */
5968   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5969
5970   Qcoding_system = intern ("coding-system");
5971   staticpro (&Qcoding_system);
5972
5973   Qeol_type = intern ("eol-type");
5974   staticpro (&Qeol_type);
5975
5976   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5977   staticpro (&Qbuffer_file_coding_system);
5978
5979   Qpost_read_conversion = intern ("post-read-conversion");
5980   staticpro (&Qpost_read_conversion);
5981
5982   Qpre_write_conversion = intern ("pre-write-conversion");
5983   staticpro (&Qpre_write_conversion);
5984
5985   Qno_conversion = intern ("no-conversion");
5986   staticpro (&Qno_conversion);
5987
5988   Qundecided = intern ("undecided");
5989   staticpro (&Qundecided);
5990
5991   Qcoding_system_p = intern ("coding-system-p");
5992   staticpro (&Qcoding_system_p);
5993
5994   Qcoding_system_error = intern ("coding-system-error");
5995   staticpro (&Qcoding_system_error);
5996
5997   Fput (Qcoding_system_error, Qerror_conditions,
5998         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5999   Fput (Qcoding_system_error, Qerror_message,
6000         build_string ("Invalid coding system"));
6001
6002   Qcoding_category = intern ("coding-category");
6003   staticpro (&Qcoding_category);
6004   Qcoding_category_index = intern ("coding-category-index");
6005   staticpro (&Qcoding_category_index);
6006
6007   Vcoding_category_table
6008     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6009   staticpro (&Vcoding_category_table);
6010   {
6011     int i;
6012     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6013       {
6014         XVECTOR (Vcoding_category_table)->contents[i]
6015           = intern (coding_category_name[i]);
6016         Fput (XVECTOR (Vcoding_category_table)->contents[i],
6017               Qcoding_category_index, make_number (i));
6018       }
6019   }
6020
6021   Qtranslation_table = intern ("translation-table");
6022   staticpro (&Qtranslation_table);
6023   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6024
6025   Qtranslation_table_id = intern ("translation-table-id");
6026   staticpro (&Qtranslation_table_id);
6027
6028   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6029   staticpro (&Qtranslation_table_for_decode);
6030
6031   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6032   staticpro (&Qtranslation_table_for_encode);
6033
6034   Qsafe_charsets = intern ("safe-charsets");
6035   staticpro (&Qsafe_charsets);
6036
6037   Qvalid_codes = intern ("valid-codes");
6038   staticpro (&Qvalid_codes);
6039
6040   Qemacs_mule = intern ("emacs-mule");
6041   staticpro (&Qemacs_mule);
6042
6043   Qraw_text = intern ("raw-text");
6044   staticpro (&Qraw_text);
6045
6046   defsubr (&Scoding_system_p);
6047   defsubr (&Sread_coding_system);
6048   defsubr (&Sread_non_nil_coding_system);
6049   defsubr (&Scheck_coding_system);
6050   defsubr (&Sdetect_coding_region);
6051   defsubr (&Sdetect_coding_string);
6052   defsubr (&Sdecode_coding_region);
6053   defsubr (&Sencode_coding_region);
6054   defsubr (&Sdecode_coding_string);
6055   defsubr (&Sencode_coding_string);
6056   defsubr (&Sdecode_sjis_char);
6057   defsubr (&Sencode_sjis_char);
6058   defsubr (&Sdecode_big5_char);
6059   defsubr (&Sencode_big5_char);
6060   defsubr (&Sset_terminal_coding_system_internal);
6061   defsubr (&Sset_safe_terminal_coding_system_internal);
6062   defsubr (&Sterminal_coding_system);
6063   defsubr (&Sset_keyboard_coding_system_internal);
6064   defsubr (&Skeyboard_coding_system);
6065   defsubr (&Sfind_operation_coding_system);
6066   defsubr (&Supdate_coding_systems_internal);
6067   defsubr (&Sset_coding_priority_internal);
6068
6069   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6070     "List of coding systems.\n\
6071 \n\
6072 Do not alter the value of this variable manually.  This variable should be\n\
6073 updated by the functions `make-coding-system' and\n\
6074 `define-coding-system-alias'.");
6075   Vcoding_system_list = Qnil;
6076
6077   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6078     "Alist of coding system names.\n\
6079 Each element is one element list of coding system name.\n\
6080 This variable is given to `completing-read' as TABLE argument.\n\
6081 \n\
6082 Do not alter the value of this variable manually.  This variable should be\n\
6083 updated by the functions `make-coding-system' and\n\
6084 `define-coding-system-alias'.");
6085   Vcoding_system_alist = Qnil;
6086
6087   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6088     "List of coding-categories (symbols) ordered by priority.");
6089   {
6090     int i;
6091
6092     Vcoding_category_list = Qnil;
6093     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6094       Vcoding_category_list
6095         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6096                  Vcoding_category_list);
6097   }
6098
6099   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6100     "Specify the coding system for read operations.\n\
6101 It is useful to bind this variable with `let', but do not set it globally.\n\
6102 If the value is a coding system, it is used for decoding on read operation.\n\
6103 If not, an appropriate element is used from one of the coding system alists:\n\
6104 There are three such tables, `file-coding-system-alist',\n\
6105 `process-coding-system-alist', and `network-coding-system-alist'.");
6106   Vcoding_system_for_read = Qnil;
6107
6108   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6109     "Specify the coding system for write operations.\n\
6110 Programs bind this variable with `let', but you should not set it globally.\n\
6111 If the value is a coding system, it is used for encoding of output,\n\
6112 when writing it to a file and when sending it to a file or subprocess.\n\
6113 \n\
6114 If this does not specify a coding system, an appropriate element\n\
6115 is used from one of the coding system alists:\n\
6116 There are three such tables, `file-coding-system-alist',\n\
6117 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6118 For output to files, if the above procedure does not specify a coding system,\n\
6119 the value of `buffer-file-coding-system' is used.");
6120   Vcoding_system_for_write = Qnil;
6121
6122   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6123     "Coding system used in the latest file or process I/O.");
6124   Vlast_coding_system_used = Qnil;
6125
6126   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6127     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6128 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6129 such conversion.");
6130   inhibit_eol_conversion = 0;
6131
6132   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6133     "Non-nil means process buffer inherits coding system of process output.\n\
6134 Bind it to t if the process output is to be treated as if it were a file\n\
6135 read from some filesystem.");
6136   inherit_process_coding_system = 0;
6137
6138   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6139     "Alist to decide a coding system to use for a file I/O operation.\n\
6140 The format is ((PATTERN . VAL) ...),\n\
6141 where PATTERN is a regular expression matching a file name,\n\
6142 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6143 If VAL is a coding system, it is used for both decoding and encoding\n\
6144 the file contents.\n\
6145 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6146 and the cdr part is used for encoding.\n\
6147 If VAL is a function symbol, the function must return a coding system\n\
6148 or a cons of coding systems which are used as above.\n\
6149 \n\
6150 See also the function `find-operation-coding-system'\n\
6151 and the variable `auto-coding-alist'.");
6152   Vfile_coding_system_alist = Qnil;
6153
6154   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6155     "Alist to decide a coding system to use for a process I/O operation.\n\
6156 The format is ((PATTERN . VAL) ...),\n\
6157 where PATTERN is a regular expression matching a program name,\n\
6158 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6159 If VAL is a coding system, it is used for both decoding what received\n\
6160 from the program and encoding what sent to the program.\n\
6161 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6162 and the cdr part is used for encoding.\n\
6163 If VAL is a function symbol, the function must return a coding system\n\
6164 or a cons of coding systems which are used as above.\n\
6165 \n\
6166 See also the function `find-operation-coding-system'.");
6167   Vprocess_coding_system_alist = Qnil;
6168
6169   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6170     "Alist to decide a coding system to use for a network I/O operation.\n\
6171 The format is ((PATTERN . VAL) ...),\n\
6172 where PATTERN is a regular expression matching a network service name\n\
6173 or is a port number to connect to,\n\
6174 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6175 If VAL is a coding system, it is used for both decoding what received\n\
6176 from the network stream and encoding what sent to the network stream.\n\
6177 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6178 and the cdr part is used for encoding.\n\
6179 If VAL is a function symbol, the function must return a coding system\n\
6180 or a cons of coding systems which are used as above.\n\
6181 \n\
6182 See also the function `find-operation-coding-system'.");
6183   Vnetwork_coding_system_alist = Qnil;
6184
6185   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6186     "Coding system to use with system messages.");
6187   Vlocale_coding_system = Qnil;
6188
6189   /* The eol mnemonics are reset in startup.el system-dependently.  */
6190   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6191     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6192   eol_mnemonic_unix = build_string (":");
6193
6194   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6195     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6196   eol_mnemonic_dos = build_string ("\\");
6197
6198   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6199     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6200   eol_mnemonic_mac = build_string ("/");
6201
6202   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6203     "*String displayed in mode line when end-of-line format is not yet determined.");
6204   eol_mnemonic_undecided = build_string (":");
6205
6206   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6207     "*Non-nil enables character translation while encoding and decoding.");
6208   Venable_character_translation = Qt;
6209
6210   DEFVAR_LISP ("standard-translation-table-for-decode",
6211     &Vstandard_translation_table_for_decode,
6212     "Table for translating characters while decoding.");
6213   Vstandard_translation_table_for_decode = Qnil;
6214
6215   DEFVAR_LISP ("standard-translation-table-for-encode",
6216     &Vstandard_translation_table_for_encode,
6217     "Table for translationg characters while encoding.");
6218   Vstandard_translation_table_for_encode = Qnil;
6219
6220   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6221     "Alist of charsets vs revision numbers.\n\
6222 While encoding, if a charset (car part of an element) is found,\n\
6223 designate it with the escape sequence identifing revision (cdr part of the element).");
6224   Vcharset_revision_alist = Qnil;
6225
6226   DEFVAR_LISP ("default-process-coding-system",
6227                &Vdefault_process_coding_system,
6228     "Cons of coding systems used for process I/O by default.\n\
6229 The car part is used for decoding a process output,\n\
6230 the cdr part is used for encoding a text to be sent to a process.");
6231   Vdefault_process_coding_system = Qnil;
6232
6233   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6234     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6235 This is a vector of length 256.\n\
6236 If Nth element is non-nil, the existence of code N in a file\n\
6237 \(or output of subprocess) doesn't prevent it to be detected as\n\
6238 a coding system of ISO 2022 variant which has a flag\n\
6239 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6240 or reading output of a subprocess.\n\
6241 Only 128th through 159th elements has a meaning.");
6242   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6243
6244   DEFVAR_LISP ("select-safe-coding-system-function",
6245                &Vselect_safe_coding_system_function,
6246     "Function to call to select safe coding system for encoding a text.\n\
6247 \n\
6248 If set, this function is called to force a user to select a proper\n\
6249 coding system which can encode the text in the case that a default\n\
6250 coding system used in each operation can't encode the text.\n\
6251 \n\
6252 The default value is `select-safe-coding-system' (which see).");
6253   Vselect_safe_coding_system_function = Qnil;
6254
6255 }
6256
6257 char *
6258 emacs_strerror (error_number)
6259      int error_number;
6260 {
6261   char *str;
6262
6263   synchronize_system_messages_locale ();
6264   str = strerror (error_number);
6265
6266   if (! NILP (Vlocale_coding_system))
6267     {
6268       Lisp_Object dec = code_convert_string_norecord (build_string (str),
6269                                                       Vlocale_coding_system,
6270                                                       0);
6271       str = (char *) XSTRING (dec)->data;
6272     }
6273
6274   return str;
6275 }
6276
6277 #endif /* emacs */
6278