src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. CCL handlers
  29   6. End-of-line handlers
  30   7. C library functions
  31   8. Emacs Lisp library functions
  32   9. Post-amble
  33
  34 */
  35
  36 /*** GENERAL NOTE on CODING SYSTEM ***
  37
  38   Coding system is an encoding mechanism of one or more character
  39   sets.  Here's a list of coding systems which Emacs can handle.  When
  40   we say "decode", it means converting some other coding system to
  41   Emacs' internal format (emacs-mule), and when we say "encode",
  42   it means converting the coding system emacs-mule to some other
  43   coding system.
  44
  45   0. Emacs' internal format (emacs-mule)
  46
  47   Emacs itself holds a multi-lingual character in a buffer and a string
  48   in a special format.  Details are described in section 2.
  49
  50   1. ISO2022
  51
  52   The most famous coding system for multiple character sets.  X's
  53   Compound Text, various EUCs (Extended Unix Code), and coding
  54   systems used in Internet communication such as ISO-2022-JP are
  55   all variants of ISO2022.  Details are described in section 3.
  56
  57   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  58
  59   A coding system to encode character sets: ASCII, JISX0201, and
  60   JISX0208.  Widely used for PC's in Japan.  Details are described in
  61   section 4.
  62
  63   3. BIG5
  64
  65   A coding system to encode character sets: ASCII and Big5.  Widely
  66   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  67   described in section 4.  In this file, when we write "BIG5"
  68   (all uppercase), we mean the coding system, and when we write
  69   "Big5" (capitalized), we mean the character set.
  70
  71   4. Raw text
  72
  73   A coding system for a text containing random 8-bit code.  Emacs does
  74   no code conversion on such a text except for end-of-line format.
  75
  76   5. Other
  77
  78   If a user wants to read/write a text encoded in a coding system not
  79   listed above, he can supply a decoder and an encoder for it in CCL
  80   (Code Conversion Language) programs.  Emacs executes the CCL program
  81   while reading/writing.
  82
  83   Emacs represents a coding system by a Lisp symbol that has a property
  84   `coding-system'.  But, before actually using the coding system, the
  85   information about it is set in a structure of type `struct
  86   coding_system' for rapid processing.  See section 6 for more details.
  87
  88 */
  89
  90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  91
  92   How end-of-line of a text is encoded depends on a system.  For
  93   instance, Unix's format is just one byte of `line-feed' code,
  94   whereas DOS's format is two-byte sequence of `carriage-return' and
  95   `line-feed' codes.  MacOS's format is usually one byte of
  96   `carriage-return'.
  97
  98   Since text characters encoding and end-of-line encoding are
  99   independent, any coding system described above can take
 100   any format of end-of-line.  So, Emacs has information of format of
 101   end-of-line in each coding-system.  See section 6 for more details.
 102
 103 */
 104
 105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 106
 107   These functions check if a text between SRC and SRC_END is encoded
 108   in the coding system category XXX.  Each returns an integer value in
 109   which appropriate flag bits for the category XXX are set.  The flag
 110   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 111   template of these functions.  */
 112 #if 0
 113 int
 114 detect_coding_emacs_mule (src, src_end)
 115      unsigned char *src, *src_end;
 116 {
 117   ...
 118 }
 119 #endif
 120
 121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 122
 123   These functions decode SRC_BYTES length text at SOURCE encoded in
 124   CODING to Emacs' internal format (emacs-mule).  The resulting text
 125   goes to a place pointed to by DESTINATION, the length of which
 126   should not exceed DST_BYTES.  These functions set the information of
 127   original and decoded texts in the members produced, produced_char,
 128   consumed, and consumed_char of the structure *CODING.
 129
 130   The return value is an integer (CODING_FINISH_XXX) indicating how
 131   the decoding finished.
 132
 133   DST_BYTES zero means that source area and destination area are
 134   overlapped, which means that we can produce a decoded text until it
 135   reaches the head of not-yet-decoded source text.
 136
 137   Below is a template of these functions.  */
 138 #if 0
 139 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 140      struct coding_system *coding;
 141      unsigned char *source, *destination;
 142      int src_bytes, dst_bytes;
 143 {
 144   ...
 145 }
 146 #endif
 147
 148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 149
 150   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 151   internal format (emacs-mule) to CODING.  The resulting text goes to
 152   a place pointed to by DESTINATION, the length of which should not
 153   exceed DST_BYTES.  These functions set the information of
 154   original and encoded texts in the members produced, produced_char,
 155   consumed, and consumed_char of the structure *CODING.
 156
 157   The return value is an integer (CODING_FINISH_XXX) indicating how
 158   the encoding finished.
 159
 160   DST_BYTES zero means that source area and destination area are
 161   overlapped, which means that we can produce an encoded text until it
 162   reaches the head of not-yet-encoded source text.
 163
 164   Below is a template of these functions.  */
 165 #if 0
 166 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 167      struct coding_system *coding;
 168      unsigned char *source, *destination;
 169      int src_bytes, dst_bytes;
 170 {
 171   ...
 172 }
 173 #endif
 174
 175 /*** COMMONLY USED MACROS ***/
 176
 177 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 178    THREE_MORE_BYTES safely get one, two, and three bytes from the
 179    source text respectively.  If there are not enough bytes in the
 180    source, they jump to `label_end_of_loop'.  The caller should set
 181    variables `src' and `src_end' to appropriate areas in advance.  */
 182
 183 #define ONE_MORE_BYTE(c1)       \
 184   do {                          \
 185     if (src < src_end)          \
 186       c1 = *src++;              \
 187     else                        \
 188       goto label_end_of_loop;   \
 189   } while (0)
 190
 191 #define TWO_MORE_BYTES(c1, c2)  \
 192   do {                          \
 193     if (src + 1 < src_end)      \
 194       c1 = *src++, c2 = *src++; \
 195     else                        \
 196       goto label_end_of_loop;   \
 197   } while (0)
 198
 199 #define THREE_MORE_BYTES(c1, c2, c3)            \
 200   do {                                          \
 201     if (src + 2 < src_end)                      \
 202       c1 = *src++, c2 = *src++, c3 = *src++;    \
 203     else                                        \
 204       goto label_end_of_loop;                   \
 205   } while (0)
 206
 207 /* The following three macros DECODE_CHARACTER_ASCII,
 208    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 209    the multi-byte form of a character of each class at the place
 210    pointed by `dst'.  The caller should set the variable `dst' to
 211    point to an appropriate area and the variable `coding' to point to
 212    the coding-system of the currently decoding text in advance.  */
 213
 214 /* Decode one ASCII character C.  */
 215
 216 #define DECODE_CHARACTER_ASCII(c)               \
 217   do {                                          \
 218     if (COMPOSING_P (coding->composing))        \
 219       {                                         \
 220         *dst++ = 0xA0, *dst++ = (c) | 0x80;     \
 221         coding->composed_chars++;               \
 222         if (((c) | 0x80) < 0xA0)                \
 223           coding->fake_multibyte = 1;           \
 224       }                                         \
 225     else                                        \
 226       {                                         \
 227         *dst++ = (c);                           \
 228         coding->produced_char++;                \
 229         if ((c) >= 0x80)                        \
 230           coding->fake_multibyte = 1;           \
 231       }                                         \
 232   } while (0)
 233
 234 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 235    position-code is C.  */
 236
 237 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 238   do {                                                                  \
 239     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 240     if (COMPOSING_P (coding->composing))                                \
 241       {                                                                 \
 242         *dst++ = leading_code + 0x20;                                   \
 243         coding->composed_chars++;                                       \
 244       }                                                                 \
 245     else                                                                \
 246       {                                                                 \
 247         *dst++ = leading_code;                                          \
 248         coding->produced_char++;                                        \
 249       }                                                                 \
 250     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 251       *dst++ = leading_code;                                            \
 252     *dst++ = (c) | 0x80;                                                \
 253     if (((c) | 0x80)  < 0xA0)                                           \
 254       coding->fake_multibyte = 1;                                       \
 255   } while (0)
 256
 257 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 258    position-codes are C1 and C2.  */
 259
 260 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 261   do {                                                  \
 262     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 263     *dst++ = (c2) | 0x80;                               \
 264     if (((c2) | 0x80) < 0xA0)                           \
 265       coding->fake_multibyte = 1;                       \
 266   } while (0)
 267
 268 \f
 269 /*** 1. Preamble ***/
 270
 271 #include <stdio.h>
 272
 273 #ifdef emacs
 274
 275 #include <config.h>
 276 #include "lisp.h"
 277 #include "buffer.h"
 278 #include "charset.h"
 279 #include "ccl.h"
 280 #include "coding.h"
 281 #include "window.h"
 282
 283 #else  /* not emacs */
 284
 285 #include "mulelib.h"
 286
 287 #endif /* not emacs */
 288
 289 Lisp_Object Qcoding_system, Qeol_type;
 290 Lisp_Object Qbuffer_file_coding_system;
 291 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 292 Lisp_Object Qno_conversion, Qundecided;
 293 Lisp_Object Qcoding_system_history;
 294 Lisp_Object Qsafe_charsets;
 295 Lisp_Object Qvalid_codes;
 296
 297 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 298 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 299 Lisp_Object Qstart_process, Qopen_network_stream;
 300 Lisp_Object Qtarget_idx;
 301
 302 Lisp_Object Vselect_safe_coding_system_function;
 303
 304 /* Mnemonic character of each format of end-of-line.  */
 305 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 306 /* Mnemonic character to indicate format of end-of-line is not yet
 307    decided.  */
 308 int eol_mnemonic_undecided;
 309
 310 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 311    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 312 int system_eol_type;
 313
 314 #ifdef emacs
 315
 316 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 317
 318 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 319
 320 /* Coding system emacs-mule and raw-text are for converting only
 321    end-of-line format.  */
 322 Lisp_Object Qemacs_mule, Qraw_text;
 323
 324 /* Coding-systems are handed between Emacs Lisp programs and C internal
 325    routines by the following three variables.  */
 326 /* Coding-system for reading files and receiving data from process.  */
 327 Lisp_Object Vcoding_system_for_read;
 328 /* Coding-system for writing files and sending data to process.  */
 329 Lisp_Object Vcoding_system_for_write;
 330 /* Coding-system actually used in the latest I/O.  */
 331 Lisp_Object Vlast_coding_system_used;
 332
 333 /* A vector of length 256 which contains information about special
 334    Latin codes (especially for dealing with Microsoft codes).  */
 335 Lisp_Object Vlatin_extra_code_table;
 336
 337 /* Flag to inhibit code conversion of end-of-line format.  */
 338 int inhibit_eol_conversion;
 339
 340 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 341 int inherit_process_coding_system;
 342
 343 /* Coding system to be used to encode text for terminal display.  */
 344 struct coding_system terminal_coding;
 345
 346 /* Coding system to be used to encode text for terminal display when
 347    terminal coding system is nil.  */
 348 struct coding_system safe_terminal_coding;
 349
 350 /* Coding system of what is sent from terminal keyboard.  */
 351 struct coding_system keyboard_coding;
 352
 353 /* Default coding system to be used to write a file.  */
 354 struct coding_system default_buffer_file_coding;
 355
 356 Lisp_Object Vfile_coding_system_alist;
 357 Lisp_Object Vprocess_coding_system_alist;
 358 Lisp_Object Vnetwork_coding_system_alist;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qcoding_category, Qcoding_category_index;
 363
 364 /* List of symbols `coding-category-xxx' ordered by priority.  */
 365 Lisp_Object Vcoding_category_list;
 366
 367 /* Table of coding categories (Lisp symbols).  */
 368 Lisp_Object Vcoding_category_table;
 369
 370 /* Table of names of symbol for each coding-category.  */
 371 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 372   "coding-category-emacs-mule",
 373   "coding-category-sjis",
 374   "coding-category-iso-7",
 375   "coding-category-iso-7-tight",
 376   "coding-category-iso-8-1",
 377   "coding-category-iso-8-2",
 378   "coding-category-iso-7-else",
 379   "coding-category-iso-8-else",
 380   "coding-category-ccl",
 381   "coding-category-big5",
 382   "coding-category-raw-text",
 383   "coding-category-binary"
 384 };
 385
 386 /* Table of pointers to coding systems corresponding to each coding
 387    categories.  */
 388 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 389
 390 /* Table of coding category masks.  Nth element is a mask for a coding
 391    cateogry of which priority is Nth.  */
 392 static
 393 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 394
 395 /* Flag to tell if we look up translation table on character code
 396    conversion.  */
 397 Lisp_Object Venable_character_translation;
 398 /* Standard translation table to look up on decoding (reading).  */
 399 Lisp_Object Vstandard_translation_table_for_decode;
 400 /* Standard translation table to look up on encoding (writing).  */
 401 Lisp_Object Vstandard_translation_table_for_encode;
 402
 403 Lisp_Object Qtranslation_table;
 404 Lisp_Object Qtranslation_table_id;
 405 Lisp_Object Qtranslation_table_for_decode;
 406 Lisp_Object Qtranslation_table_for_encode;
 407
 408 /* Alist of charsets vs revision number.  */
 409 Lisp_Object Vcharset_revision_alist;
 410
 411 /* Default coding systems used for process I/O.  */
 412 Lisp_Object Vdefault_process_coding_system;
 413
 414 \f
 415 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 416
 417 /* Emacs' internal format for encoding multiple character sets is a
 418    kind of multi-byte encoding, i.e. characters are encoded by
 419    variable-length sequences of one-byte codes.  ASCII characters
 420    and control characters (e.g. `tab', `newline') are represented by
 421    one-byte sequences which are their ASCII codes, in the range 0x00
 422    through 0x7F.  The other characters are represented by a sequence
 423    of `base leading-code', optional `extended leading-code', and one
 424    or two `position-code's.  The length of the sequence is determined
 425    by the base leading-code.  Leading-code takes the range 0x80
 426    through 0x9F, whereas extended leading-code and position-code take
 427    the range 0xA0 through 0xFF.  See `charset.h' for more details
 428    about leading-code and position-code.
 429
 430    There's one exception to this rule.  Special leading-code
 431    `leading-code-composition' denotes that the following several
 432    characters should be composed into one character.  Leading-codes of
 433    components (except for ASCII) have 0x20 added.  An ASCII character
 434    component is represented by a 2-byte sequence of `0xA0' and
 435    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 436    details of composite character.  Hence, we can summarize the code
 437    range as follows:
 438
 439    --- CODE RANGE of Emacs' internal format ---
 440    (character set)      (range)
 441    ASCII                0x00 .. 0x7F
 442    ELSE (1st byte)      0x80 .. 0x9F
 443         (rest bytes)    0xA0 .. 0xFF
 444    ---------------------------------------------
 445
 446   */
 447
 /* Class of each of the 256 possible byte values in Emacs' internal
    format; detect_coding_emacs_mule classifies a byte C by looking up
    emacs_code_class[C].  */
 448 enum emacs_code_class_type emacs_code_class[256];
 449
 450 /* Go to the next statement only if *SRC is accessible and the code is
 451    greater than 0xA0.  */
 452 #define CHECK_CODE_RANGE_A0_FF  \
 453   do {                          \
 454     if (src >= src_end)         \
 455       goto label_end_of_switch; \
 456     else if (*src++ < 0xA0)     \
 457       return 0;                 \
 458   } while (0)
 459
 460 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 461    Check if a text is encoded in Emacs' internal format.  If it is,
 462    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 463
 464 int
 465 detect_coding_emacs_mule (src, src_end)
 466      unsigned char *src, *src_end;
 467 {
 468   unsigned char c;
 469   int composing = 0;
 470
 471   while (src < src_end)
 472     {
 473       c = *src++;
 474
 475       if (composing)
 476         {
 477           if (c < 0xA0)
 478             composing = 0;
 479           else
 480             c -= 0x20;
 481         }
 482
 483       switch (emacs_code_class[c])
 484         {
 485         case EMACS_ascii_code:
 486         case EMACS_linefeed_code:
 487           break;
 488
 489         case EMACS_control_code:
 490           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 491             return 0;
 492           break;
 493
 494         case EMACS_invalid_code:
 495           return 0;
 496
 497         case EMACS_leading_code_composition: /* c == 0x80 */
 498           if (composing)
 499             CHECK_CODE_RANGE_A0_FF;
 500           else
 501             composing = 1;
 502           break;
 503
 504         case EMACS_leading_code_4:
 505           CHECK_CODE_RANGE_A0_FF;
 506           /* fall down to check it two more times ...  */
 507
 508         case EMACS_leading_code_3:
 509           CHECK_CODE_RANGE_A0_FF;
 510           /* fall down to check it one more time ...  */
 511
 512         case EMACS_leading_code_2:
 513           CHECK_CODE_RANGE_A0_FF;
 514           break;
 515
 516         default:
 517         label_end_of_switch:
 518           break;
 519         }
 520     }
 521   return CODING_CATEGORY_MASK_EMACS_MULE;
 522 }
 523
 524 \f
 525 /*** 3. ISO2022 handlers ***/
 526
 527 /* The following note describes the coding system ISO2022 briefly.
 528    Since the intention of this note is to help in understanding of
 529    the programs in this file, some parts are NOT ACCURATE or OVERLY
 530    SIMPLIFIED.  For the thorough understanding, please refer to the
 531    original document of ISO2022.
 532
 533    ISO2022 provides many mechanisms to encode several character sets
 534    in 7-bit and 8-bit environment.  If one chooses 7-bit environment,
 535    all text is encoded by codes of less than 128.  This may make the
 536    encoded text a little bit longer, but the text gets more stability
 537    to pass through several gateways (some of them strip off the MSB).
 538
 539    There are two kinds of character set: control character set and
 540    graphic character set.  The former contains control characters such
 541    as `newline' and `escape' to provide control functions (control
 542    functions are provided also by escape sequences).  The latter
 543    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 544    two control character sets and many graphic character sets.
 545
 546    Graphic character sets are classified into one of the following
 547    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 548    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 549    bytes (DIMENSION) and the number of characters in one dimension
 550    (CHARS) of the set.  In addition, each character set is assigned an
 551    identification tag (called "final character" and denoted as <F>
 552    here after) which is unique in each class.  <F> of each character
 553    set is decided by ECMA(*) when it is registered in ISO.  Code range
 554    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 555
 556    Note (*): ECMA = European Computer Manufacturers Association
 557
 558    Here are examples of graphic character set [NAME(<F>)]:
 559         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 560         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 561         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 562         o DIMENSION2_CHARS96 -- none for the moment
 563
 564    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 565         C0 [0x00..0x1F] -- control character plane 0
 566         GL [0x20..0x7F] -- graphic character plane 0
 567         C1 [0x80..0x9F] -- control character plane 1
 568         GR [0xA0..0xFF] -- graphic character plane 1
 569
 570    A control character set is directly designated and invoked to C0 or
 571    C1 by an escape sequence.  The most common case is that ISO646's
 572    control character set is designated/invoked to C0 and ISO6429's
 573    control character set is designated/invoked to C1, and usually
 574    these designations/invocations are omitted in a coded text.  With
 575    7-bit environment, only C0 can be used, and a control character for
 576    C1 is encoded by an appropriate escape sequence to fit in the
 577    environment.  Every control character of C1 has a corresponding
 578    escape sequence defined.
 579
 580    A graphic character set is at first designated to one of four
 581    graphic registers (G0 through G3), then these graphic registers are
 582    invoked to GL or GR.  These designations and invocations can be
 583    done independently.  The most common case is that G0 is invoked to
 584    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 585    these invocations and designations are omitted in a coded text.
 586    With 7-bit environment, only GL can be used.
 587
 588    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 589    and 0x7F of GL area work as control characters SPACE and DEL
 590    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 591
 592    There are two ways of invocation: locking-shift and single-shift.
 593    With locking-shift, the invocation lasts until the next different
 594    invocation, whereas with single-shift, the invocation works only
 595    for the following character and doesn't affect locking-shift.
 596    Invocations are done by the following control characters or escape
 597    sequences.
 598
 599    ----------------------------------------------------------------------
 600    function             control char    escape sequence description
 601    ----------------------------------------------------------------------
 602    SI  (shift-in)               0x0F    none            invoke G0 to GL
 603    SO  (shift-out)              0x0E    none            invoke G1 to GL
 604    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 605    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 606    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 607    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 608    ----------------------------------------------------------------------
 609    The first four are for locking-shift.  Control characters for these
 610    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 611
 612    Designations are done by the following escape sequences.
 613    ----------------------------------------------------------------------
 614    escape sequence      description
 615    ----------------------------------------------------------------------
 616    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 617    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 618    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 619    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 620    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 621    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 622    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 623    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 624    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 625    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 626    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 627    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 628    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 629    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 630    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 631    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 632    ----------------------------------------------------------------------
 633
 634    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 635    of dimension 1, chars 94, and final character <F>, and etc.
 636
 637    Note (*): Although these designations are not allowed in ISO2022,
 638    Emacs accepts them on decoding, and produces them on encoding
 639    CHARS96 character set in a coding system which is characterized as
 640    7-bit environment, non-locking-shift, and non-single-shift.
 641
 642    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 643    '(' can be omitted.  We call this as "short-form" here after.
 644
 645    Now you may notice that there are a lot of ways for encoding the
 646    same multilingual text in ISO2022.  Actually, there exists many
 647    coding systems such as Compound Text (used in X's inter client
 648    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 649    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 650    localized platforms), and all of these are variants of ISO2022.
 651
 652    In addition to the above, Emacs handles two more kinds of escape
 653    sequences: ISO6429's direction specification and Emacs' private
 654    sequence for specifying character composition.
 655
 656    ISO6429's direction specification takes the following format:
 657         o CSI ']'      -- end of the current direction
 658         o CSI '0' ']'  -- end of the current direction
 659         o CSI '1' ']'  -- start of left-to-right text
 660         o CSI '2' ']'  -- start of right-to-left text
 661    The control character CSI (0x9B: control sequence introducer) is
 662    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 663
 664    Character composition specification takes the following format:
 665         o ESC '0' -- start character composition
 666         o ESC '1' -- end character composition
 667    Since these are not standard escape sequences of any ISO, the use
 668    of them with these meanings is restricted to Emacs only.  */
 669
 /* Table of 256 `enum iso_code_class_type' entries.
    NOTE(review): presumably indexed by byte value, like
    emacs_code_class -- confirm where it is filled in and used.  */
 670 enum iso_code_class_type iso_code_class[256];
 671
 672 #define CHARSET_OK(idx, charset)                                \
 673   (coding_system_table[idx]                                     \
 674    && (coding_system_table[idx]->safe_charsets[charset]         \
 675        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
 676             (coding_system_table[idx], charset)                 \
 677            != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
 678
 679 #define SHIFT_OUT_OK(idx) \
 680   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 681
 682 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 683    Check if a text is encoded in ISO2022.  If it is, returns an
 684    integer in which appropriate flag bits any of:
 685         CODING_CATEGORY_MASK_ISO_7
 686         CODING_CATEGORY_MASK_ISO_7_TIGHT
 687         CODING_CATEGORY_MASK_ISO_8_1
 688         CODING_CATEGORY_MASK_ISO_8_2
 689         CODING_CATEGORY_MASK_ISO_7_ELSE
 690         CODING_CATEGORY_MASK_ISO_8_ELSE
 691    are set.  If a code which should never appear in ISO2022 is found,
 692    returns 0.  */
 693
 694 int
 695 detect_coding_iso2022 (src, src_end)
 696      unsigned char *src, *src_end;
 697 {
 698   int mask = CODING_CATEGORY_MASK_ISO;
 699   int mask_found = 0;
 700   int reg[4], shift_out = 0, single_shifting = 0;
 701   int c, c1, i, charset;
 702
 703   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 704   while (mask && src < src_end)
 705     {
 706       c = *src++;
 707       switch (c)
 708         {
 709         case ISO_CODE_ESC:
 710           single_shifting = 0;
 711           if (src >= src_end)
 712             break;
 713           c = *src++;
 714           if (c >= '(' && c <= '/')
 715             {
 716               /* Designation sequence for a charset of dimension 1.  */
 717               if (src >= src_end)
 718                 break;
 719               c1 = *src++;
 720               if (c1 < ' ' || c1 >= 0x80
 721                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 722                 /* Invalid designation sequence.  Just ignore.  */
 723                 break;
 724               reg[(c - '(') % 4] = charset;
 725             }
 726           else if (c == '$')
 727             {
 728               /* Designation sequence for a charset of dimension 2.  */
 729               if (src >= src_end)
 730                 break;
 731               c = *src++;
 732               if (c >= '@' && c <= 'B')
 733                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 734                 reg[0] = charset = iso_charset_table[1][0][c];
 735               else if (c >= '(' && c <= '/')
 736                 {
 737                   if (src >= src_end)
 738                     break;
 739                   c1 = *src++;
 740                   if (c1 < ' ' || c1 >= 0x80
 741                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 742                     /* Invalid designation sequence.  Just ignore.  */
 743                     break;
 744                   reg[(c - '(') % 4] = charset;
 745                 }
 746               else
 747                 /* Invalid designation sequence.  Just ignore.  */
 748                 break;
 749             }
 750           else if (c == 'N' || c == 'O')
 751             {
 752               /* ESC <Fe> for SS2 or SS3.  */
 753               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 754               break;
 755             }
 756           else if (c == '0' || c == '1' || c == '2')
 757             /* ESC <Fp> for start/end composition.  Just ignore.  */
 758             break;
 759           else
 760             /* Invalid escape sequence.  Just ignore.  */
 761             break;
 762
 763           /* We found a valid designation sequence for CHARSET.  */
 764           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 765           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 766             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 767           else
 768             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 769           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 770             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 771           else
 772             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 773           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 774             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 775           else
 776             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 777           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 778             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 779           else
 780             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 781           break;
 782
 783         case ISO_CODE_SO:
 784           single_shifting = 0;
 785           if (shift_out == 0
 786               && (reg[1] >= 0
 787                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 788                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 789             {
 790               /* Locking shift out.  */
 791               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 792               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 793             }
 794           break;
 795
 796         case ISO_CODE_SI:
 797           single_shifting = 0;
 798           if (shift_out == 1)
 799             {
 800               /* Locking shift in.  */
 801               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 802               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 803             }
 804           break;
 805
 806         case ISO_CODE_CSI:
 807           single_shifting = 0;
 808         case ISO_CODE_SS2:
 809         case ISO_CODE_SS3:
 810           {
 811             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 812
 813             if (c != ISO_CODE_CSI)
 814               {
 815                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 816                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 817                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 818                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 819                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 820                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 821                 single_shifting = 1;
 822               }
 823             if (VECTORP (Vlatin_extra_code_table)
 824                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 825               {
 826                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 827                     & CODING_FLAG_ISO_LATIN_EXTRA)
 828                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 829                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 830                     & CODING_FLAG_ISO_LATIN_EXTRA)
 831                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 832               }
 833             mask &= newmask;
 834             mask_found |= newmask;
 835           }
 836           break;
 837
 838         default:
 839           if (c < 0x80)
 840             {
 841               single_shifting = 0;
 842               break;
 843             }
 844           else if (c < 0xA0)
 845             {
 846               single_shifting = 0;
 847               if (VECTORP (Vlatin_extra_code_table)
 848                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 849                 {
 850                   int newmask = 0;
 851
 852                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 853                       & CODING_FLAG_ISO_LATIN_EXTRA)
 854                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 855                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 856                       & CODING_FLAG_ISO_LATIN_EXTRA)
 857                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 858                   mask &= newmask;
 859                   mask_found |= newmask;
 860                 }
 861               else
 862                 return 0;
 863             }
 864           else
 865             {
 866               unsigned char *src_begin = src;
 867
 868               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
 869                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 870               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
 871               /* Check the length of succeeding codes of the range
 872                  0xA0..0FF.  If the byte length is odd, we exclude
 873                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
 874                  when we are not single shifting.  */
 875               if (!single_shifting)
 876                 {
 877                   while (src < src_end && *src >= 0xA0)
 878                     src++;
 879                   if ((src - src_begin - 1) & 1 && src < src_end)
 880                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 881                   else
 882                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
 883                 }
 884             }
 885           break;
 886         }
 887     }
 888
 889   return (mask & mask_found);
 890 }
 891
 892 /* Decode a character of which charset is CHARSET and the 1st position
 893    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 894    fetched from SRC and set to C2.  If CHARSET is negative, it means
 895    that we are decoding ill formed text, and what we can do is just to
 896    read C1 as is.  */
 897
 898 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 899   do {                                                                  \
 900     int c_alt, charset_alt = (charset);                                 \
 901     if (COMPOSING_HEAD_P (coding->composing))                           \
 902       {                                                                 \
 903         *dst++ = LEADING_CODE_COMPOSITION;                              \
 904         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 905           /* To tell composition rules are embeded.  */                 \
 906           *dst++ = 0xFF;                                                \
 907         coding->composing += 2;                                         \
 908       }                                                                 \
 909     if (charset_alt >= 0)                                               \
 910       {                                                                 \
 911         if (CHARSET_DIMENSION (charset_alt) == 2)                       \
 912           {                                                             \
 913             ONE_MORE_BYTE (c2);                                         \
 914             if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F         \
 915                 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0)  \
 916               {                                                         \
 917                 src--;                                                  \
 918                 charset_alt = CHARSET_ASCII;                            \
 919               }                                                         \
 920           }                                                             \
 921         if (!NILP (translation_table)                                   \
 922             && ((c_alt = translate_char (translation_table,             \
 923                                          -1, charset_alt, c1, c2)) >= 0)) \
 924           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 925       }                                                                 \
 926     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 927       DECODE_CHARACTER_ASCII (c1);                                      \
 928     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 929       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 930     else                                                                \
 931       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 932     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 933       /* To tell a composition rule follows.  */                        \
 934       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 935   } while (0)
 936
 937 /* Set designation state into CODING.  */
 938 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
 939   do {                                                                     \
 940     int charset = ISO_CHARSET_TABLE (make_number (dimension),              \
 941                                      make_number (chars),                  \
 942                                      make_number (final_char));            \
 943     if (charset >= 0                                                       \
 944         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
 945             || coding->safe_charsets[charset]))                            \
 946       {                                                                    \
 947         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
 948             && reg == 0                                                    \
 949             && charset == CHARSET_ASCII)                                   \
 950           {                                                                \
 951             /* We should insert this designation sequence as is so         \
 952                that it is surely written back to a file.  */               \
 953             coding->spec.iso2022.last_invalid_designation_register = -1;   \
 954             goto label_invalid_code;                                       \
 955           }                                                                \
 956         coding->spec.iso2022.last_invalid_designation_register = -1;       \
 957         if ((coding->mode & CODING_MODE_DIRECTION)                         \
 958             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
 959           charset = CHARSET_REVERSE_CHARSET (charset);                     \
 960         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
 961       }                                                                    \
 962     else                                                                   \
 963       {                                                                    \
 964         coding->spec.iso2022.last_invalid_designation_register = reg;      \
 965         goto label_invalid_code;                                           \
 966       }                                                                    \
 967   } while (0)
 968
 969 /* Return 0 if there's a valid composing sequence starting at SRC and
 970    ending before SRC_END, else return -1.  */
 971
 972 int
 973 check_composing_code (coding, src, src_end)
 974      struct coding_system *coding;
 975      unsigned char *src, *src_end;
 976 {
 977   int charset, c, c1, dim;
 978
 979   while (src < src_end)
 980     {
 981       c = *src++;
 982       if (c >= 0x20)
 983         continue;
 984       if (c != ISO_CODE_ESC || src >= src_end)
 985         return -1;
 986       c = *src++;
 987       if (c == '1') /* end of compsition */
 988         return 0;
 989       if (src + 2 >= src_end
 990           || !coding->flags & CODING_FLAG_ISO_DESIGNATION)
 991         return -1;
 992
 993       dim = (c == '$');
 994       if (dim == 1)
 995         c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
 996       if (c >= '(' && c <= '/')
 997         {
 998           c1 = *src++;
 999           if ((c1 < ' ' || c1 >= 0x80)
1000               || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1001               || ! coding->safe_charsets[charset]
1002               || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1003                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1004             return -1;
1005         }
1006       else
1007         return -1;
1008     }
1009
1010   /* We have not found the sequence "ESC 1".  */
1011   return -1;
1012 }
1013
1014 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1015
1016 int
1017 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1018      struct coding_system *coding;
1019      unsigned char *source, *destination;
1020      int src_bytes, dst_bytes;
1021 {
1022   unsigned char *src = source;
1023   unsigned char *src_end = source + src_bytes;
1024   unsigned char *dst = destination;
1025   unsigned char *dst_end = destination + dst_bytes;
1026   /* Since the maximum bytes produced by each loop is 7, we subtract 6
1027      from DST_END to assure that overflow checking is necessary only
1028      at the head of loop.  */
1029   unsigned char *adjusted_dst_end = dst_end - 6;
1030   int charset;
1031   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1032   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1033   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1034   Lisp_Object translation_table
1035     = coding->translation_table_for_decode;
1036   int result = CODING_FINISH_NORMAL;
1037
1038   if (!NILP (Venable_character_translation) && NILP (translation_table))
1039     translation_table = Vstandard_translation_table_for_decode;
1040
1041   coding->produced_char = 0;
1042   coding->composed_chars = 0;
1043   coding->fake_multibyte = 0;
1044   while (src < src_end && (dst_bytes
1045                            ? (dst < adjusted_dst_end)
1046                            : (dst < src - 6)))
1047     {
1048       /* SRC_BASE remembers the start position in source in each loop.
1049          The loop will be exited when there's not enough source text
1050          to analyze long escape sequence or 2-byte code (within macros
1051          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
1052          to SRC_BASE before exiting.  */
1053       unsigned char *src_base = src;
1054       int c1 = *src++, c2;
1055
1056       switch (iso_code_class [c1])
1057         {
1058         case ISO_0x20_or_0x7F:
1059           if (!coding->composing
1060               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1061             {
1062               /* This is SPACE or DEL.  */
1063               *dst++ = c1;
1064               coding->produced_char++;
1065               break;
1066             }
1067           /* This is a graphic character, we fall down ...  */
1068
1069         case ISO_graphic_plane_0:
1070           if (coding->composing == COMPOSING_WITH_RULE_RULE)
1071             {
1072               /* This is a composition rule.  */
1073               *dst++ = c1 | 0x80;
1074               coding->composing = COMPOSING_WITH_RULE_TAIL;
1075             }
1076           else
1077             DECODE_ISO_CHARACTER (charset0, c1);
1078           break;
1079
1080         case ISO_0xA0_or_0xFF:
1081           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1082               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1083             goto label_invalid_code;
1084           /* This is a graphic character, we fall down ... */
1085
1086         case ISO_graphic_plane_1:
1087           if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1088             goto label_invalid_code;
1089           else
1090             DECODE_ISO_CHARACTER (charset1, c1);
1091           break;
1092
1093         case ISO_control_code:
1094           /* All ISO2022 control characters in this class have the
1095              same representation in Emacs internal format.  */
1096           if (c1 == '\n'
1097               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1098               && (coding->eol_type == CODING_EOL_CR
1099                   || coding->eol_type == CODING_EOL_CRLF))
1100             {
1101               result = CODING_FINISH_INCONSISTENT_EOL;
1102               goto label_end_of_loop_2;
1103             }
1104           *dst++ = c1;
1105           coding->produced_char++;
1106           if (c1 >= 0x80)
1107             coding->fake_multibyte = 1;
1108           break;
1109
1110         case ISO_carriage_return:
1111           if (coding->eol_type == CODING_EOL_CR)
1112             *dst++ = '\n';
1113           else if (coding->eol_type == CODING_EOL_CRLF)
1114             {
1115               ONE_MORE_BYTE (c1);
1116               if (c1 == ISO_CODE_LF)
1117                 *dst++ = '\n';
1118               else
1119                 {
1120                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1121                     {
1122                       result = CODING_FINISH_INCONSISTENT_EOL;
1123                       goto label_end_of_loop_2;
1124                     }
1125                   src--;
1126                   *dst++ = '\r';
1127                 }
1128             }
1129           else
1130             *dst++ = c1;
1131           coding->produced_char++;
1132           break;
1133
1134         case ISO_shift_out:
1135           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1136               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1137             goto label_invalid_code;
1138           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1139           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1140           break;
1141
1142         case ISO_shift_in:
1143           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1144             goto label_invalid_code;
1145           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1146           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1147           break;
1148
1149         case ISO_single_shift_2_7:
1150         case ISO_single_shift_2:
1151           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1152             goto label_invalid_code;
1153           /* SS2 is handled as an escape sequence of ESC 'N' */
1154           c1 = 'N';
1155           goto label_escape_sequence;
1156
1157         case ISO_single_shift_3:
1158           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1159             goto label_invalid_code;
1160           /* SS2 is handled as an escape sequence of ESC 'O' */
1161           c1 = 'O';
1162           goto label_escape_sequence;
1163
1164         case ISO_control_sequence_introducer:
1165           /* CSI is handled as an escape sequence of ESC '[' ...  */
1166           c1 = '[';
1167           goto label_escape_sequence;
1168
1169         case ISO_escape:
1170           ONE_MORE_BYTE (c1);
1171         label_escape_sequence:
1172           /* Escape sequences handled by Emacs are invocation,
1173              designation, direction specification, and character
1174              composition specification.  */
1175           switch (c1)
1176             {
1177             case '&':           /* revision of following character set */
1178               ONE_MORE_BYTE (c1);
1179               if (!(c1 >= '@' && c1 <= '~'))
1180                 goto label_invalid_code;
1181               ONE_MORE_BYTE (c1);
1182               if (c1 != ISO_CODE_ESC)
1183                 goto label_invalid_code;
1184               ONE_MORE_BYTE (c1);
1185               goto label_escape_sequence;
1186
1187             case '$':           /* designation of 2-byte character set */
1188               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1189                 goto label_invalid_code;
1190               ONE_MORE_BYTE (c1);
1191               if (c1 >= '@' && c1 <= 'B')
1192                 {       /* designation of JISX0208.1978, GB2312.1980,
1193                            or JISX0208.1980 */
1194                   DECODE_DESIGNATION (0, 2, 94, c1);
1195                 }
1196               else if (c1 >= 0x28 && c1 <= 0x2B)
1197                 {       /* designation of DIMENSION2_CHARS94 character set */
1198                   ONE_MORE_BYTE (c2);
1199                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1200                 }
1201               else if (c1 >= 0x2C && c1 <= 0x2F)
1202                 {       /* designation of DIMENSION2_CHARS96 character set */
1203                   ONE_MORE_BYTE (c2);
1204                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1205                 }
1206               else
1207                 goto label_invalid_code;
1208               break;
1209
1210             case 'n':           /* invocation of locking-shift-2 */
1211               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1212                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1213                 goto label_invalid_code;
1214               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1215               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1216               break;
1217
1218             case 'o':           /* invocation of locking-shift-3 */
1219               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1220                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1221                 goto label_invalid_code;
1222               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1223               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1224               break;
1225
1226             case 'N':           /* invocation of single-shift-2 */
1227               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1228                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1229                 goto label_invalid_code;
1230               ONE_MORE_BYTE (c1);
1231               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1232               DECODE_ISO_CHARACTER (charset, c1);
1233               break;
1234
1235             case 'O':           /* invocation of single-shift-3 */
1236               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1237                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1238                 goto label_invalid_code;
1239               ONE_MORE_BYTE (c1);
1240               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1241               DECODE_ISO_CHARACTER (charset, c1);
1242               break;
1243
1244             case '0': case '2': /* start composing */
1245               /* Before processing composing, we must be sure that all
1246                  characters being composed are supported by CODING.
1247                  If not, we must give up composing.  */
1248               if (check_composing_code (coding, src, src_end) == 0)
1249                 {
1250                   /* We are looking at a valid composition sequence.  */
1251                   coding->composing = (c1 == '0'
1252                                        ? COMPOSING_NO_RULE_HEAD
1253                                        : COMPOSING_WITH_RULE_HEAD);
1254                   coding->composed_chars = 0;
1255                 }
1256               else
1257                 {
1258                   *dst++ = ISO_CODE_ESC;
1259                   *dst++ = c1;
1260                   coding->produced_char += 2;
1261                 }
1262               break;
1263
1264             case '1':           /* end composing */
1265               if (!coding->composing)
1266                 {
1267                   *dst++ = ISO_CODE_ESC;
1268                   *dst++ = c1;
1269                   coding->produced_char += 2;
1270                   break;
1271                 }
1272
1273               if (coding->composed_chars > 0)
1274                 {
1275                   if (coding->composed_chars == 1)
1276                     {
1277                       unsigned char *this_char_start = dst;
1278                       int this_bytes;
1279
1280                       /* Only one character is in the composing
1281                          sequence.  Make it a normal character.  */
1282                       while (*--this_char_start != LEADING_CODE_COMPOSITION);
1283                       dst = (this_char_start
1284                              + (coding->composing == COMPOSING_NO_RULE_TAIL
1285                                 ? 1 : 2));
1286                       *dst -= 0x20;
1287                       if (*dst == 0x80)
1288                         *++dst &= 0x7F;
1289                       this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1290                       while (this_bytes--) *this_char_start++ = *dst++;
1291                       dst = this_char_start;
1292                     }
1293                   coding->produced_char++;
1294                 }
1295               coding->composing = COMPOSING_NO;
1296               break;
1297
1298             case '[':           /* specification of direction */
1299               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1300                 goto label_invalid_code;
1301               /* For the moment, nested direction is not supported.
1302                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1303                  left-to-right, and nozero means right-to-left.  */
1304               ONE_MORE_BYTE (c1);
1305               switch (c1)
1306                 {
1307                 case ']':       /* end of the current direction */
1308                   coding->mode &= ~CODING_MODE_DIRECTION;
1309
1310                 case '0':       /* end of the current direction */
1311                 case '1':       /* start of left-to-right direction */
1312                   ONE_MORE_BYTE (c1);
1313                   if (c1 == ']')
1314                     coding->mode &= ~CODING_MODE_DIRECTION;
1315                   else
1316                     goto label_invalid_code;
1317                   break;
1318
1319                 case '2':       /* start of right-to-left direction */
1320                   ONE_MORE_BYTE (c1);
1321                   if (c1 == ']')
1322                     coding->mode |= CODING_MODE_DIRECTION;
1323                   else
1324                     goto label_invalid_code;
1325                   break;
1326
1327                 default:
1328                   goto label_invalid_code;
1329                 }
1330               break;
1331
1332             default:
1333               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1334                 goto label_invalid_code;
1335               if (c1 >= 0x28 && c1 <= 0x2B)
1336                 {       /* designation of DIMENSION1_CHARS94 character set */
1337                   ONE_MORE_BYTE (c2);
1338                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1339                 }
1340               else if (c1 >= 0x2C && c1 <= 0x2F)
1341                 {       /* designation of DIMENSION1_CHARS96 character set */
1342                   ONE_MORE_BYTE (c2);
1343                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1344                 }
1345               else
1346                 {
1347                   goto label_invalid_code;
1348                 }
1349             }
1350           /* We must update these variables now.  */
1351           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1352           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1353           break;
1354
1355         label_invalid_code:
1356           while (src_base < src)
1357             *dst++ = *src_base++;
1358           coding->fake_multibyte = 1;
1359         }
1360       continue;
1361
1362     label_end_of_loop:
1363       result = CODING_FINISH_INSUFFICIENT_SRC;
1364     label_end_of_loop_2:
1365       src = src_base;
1366       break;
1367     }
1368
1369   if (src < src_end)
1370     {
1371       if (result == CODING_FINISH_NORMAL)
1372         result = CODING_FINISH_INSUFFICIENT_DST;
1373       else if (result != CODING_FINISH_INCONSISTENT_EOL
1374                && coding->mode & CODING_MODE_LAST_BLOCK)
1375         {
1376           /* This is the last block of the text to be decoded.  We had
1377              better just flush out all remaining codes in the text
1378              although they are not valid characters.  */
1379           src_bytes = src_end - src;
1380           if (dst_bytes && (dst_end - dst < src_bytes))
1381             src_bytes = dst_end - dst;
1382           bcopy (src, dst, src_bytes);
1383           dst += src_bytes;
1384           src += src_bytes;
1385           coding->fake_multibyte = 1;
1386         }
1387     }
1388
1389   coding->consumed = coding->consumed_char = src - source;
1390   coding->produced = dst - destination;
1391   return result;
1392 }
1393
1394 /* ISO2022 encoding stuff.  */
1395
/*
   Just saying "ISO2022" is not enough for encoding; we have to
   specify more details.  In Emacs, each coding system of the ISO2022
   variants has the following specifications:
        1. Initial designation to G0 thru G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use single-shift?
   And the following two are only for Japanese:
        8. Use ASCII in place of JISX0201-1976-Roman?
        9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in `coding->flags' as flag bits
   defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
   details.
*/
1414
1415 /* Produce codes (escape sequence) for designating CHARSET to graphic
1416    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1417    the coding system CODING allows, produce designation sequence of
1418    short-form.  */
1419
1420 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1421   do {                                                                  \
1422     /* Final byte of the escape sequence that names CHARSET.  */        \
1423     unsigned char designation_final = CHARSET_ISO_FINAL_CHAR (charset); \
1424     /* Intermediate bytes, indexed by graphic register number, for      \
1425        94-character and 96-character sets respectively.  */             \
1426     char *intermediates_94 = "()*+";                                    \
1427     char *intermediates_96 = ",-./";                                    \
1428     int charset_revision                                                \
1429       = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);               \
1430     if (charset_revision < 255)                                         \
1431       {                                                                 \
1432         /* Announce the revision first: ESC & <'@' + revision>.  */     \
1433         *dst++ = ISO_CODE_ESC;                                          \
1434         *dst++ = '&';                                                   \
1435         *dst++ = '@' + charset_revision;                                \
1436       }                                                                 \
1437     *dst++ = ISO_CODE_ESC;                                              \
1438     /* A charset whose dimension is not 1 is flagged by `$'.  */        \
1439     if (CHARSET_DIMENSION (charset) != 1)                               \
1440       *dst++ = '$';                                                     \
1441     if (CHARSET_CHARS (charset) != 94)                                  \
1442       *dst++ = (unsigned char) (intermediates_96[reg]);                 \
1443     else if (CHARSET_DIMENSION (charset) == 1                           \
1444              || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1445              || reg != 0                                                \
1446              || designation_final < '@' || designation_final > 'B')     \
1447       /* The intermediate byte is omitted only for the short form:      \
1448          a 94-set of dimension other than 1, final byte '@'..'B',       \
1449          designated to graphic register 0, and only when the coding     \
1450          system has CODING_FLAG_ISO_SHORT_FORM set.  */                 \
1451       *dst++ = (unsigned char) (intermediates_94[reg]);                 \
1452     *dst++ = designation_final;                                         \
1453     /* Record that REG now holds CHARSET.  */                           \
1454     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1455   } while (0)
1456
1457 /* The following two macros produce codes (control character or escape
1458    sequence) for ISO2022 single-shift functions (single-shift-2 and
1459    single-shift-3).  */
1460
1461 #define ENCODE_SINGLE_SHIFT_2                           \
1462   do {                                                  \
1463     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
1464       {                                                 \
1465         coding->fake_multibyte = 1;                     \
1466         *dst++ = ISO_CODE_SS2;                          \
1467       }                                                 \
1468     else                                                \
1469       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1470     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1471   } while (0)
1472
1473 #define ENCODE_SINGLE_SHIFT_3                           \
1474   do {                                                  \
1475     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
1476       {                                                 \
1477         coding->fake_multibyte = 1;                     \
1478         *dst++ = ISO_CODE_SS3;                          \
1479       }                                                 \
1480     else                                                \
1481       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1482     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1483   } while (0)
1484
1485 /* The following four macros produce codes (control character or
1486    escape sequence) for ISO2022 locking-shift functions (shift-in,
1487    shift-out, locking-shift-2, and locking-shift-3).  */
1488
1489 #define ENCODE_SHIFT_IN                         \
1490   do {                                          \
1491     *dst++ = ISO_CODE_SI;                       \
1492     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1493   } while (0)
1494
1495 #define ENCODE_SHIFT_OUT                        \
1496   do {                                          \
1497     *dst++ = ISO_CODE_SO;                       \
1498     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1499   } while (0)
1500
1501 #define ENCODE_LOCKING_SHIFT_2                  \
1502   do {                                          \
1503     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1504     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1505   } while (0)
1506
1507 #define ENCODE_LOCKING_SHIFT_3                  \
1508   do {                                          \
1509     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1510     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1511   } while (0)
1512
1513 /* Produce codes for a DIMENSION1 character whose character set is
1514    CHARSET and whose position-code is C1.  Designation and invocation
1515    sequences are also produced in advance if necessary.  */
1516
1517
1518 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1519   do {                                                                  \
1520     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1521       {                                                                 \
1522         /* A single-shift is pending; it is used up by this one         \
1523            character and then cleared.  */                              \
1524         *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
1525                   ? (c1 & 0x7F) : (c1 | 0x80));                         \
1526         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1527         break;                                                          \
1528       }                                                                 \
1529     if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
1530       {                                                                 \
1531         /* CHARSET is on graphic plane 0: high bit cleared.  */         \
1532         *dst++ = c1 & 0x7F;                                             \
1533         break;                                                          \
1534       }                                                                 \
1535     if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
1536       {                                                                 \
1537         /* CHARSET is on graphic plane 1: high bit set.  */             \
1538         *dst++ = c1 | 0x80;                                             \
1539         break;                                                          \
1540       }                                                                 \
1541     if (coding->flags & CODING_FLAG_ISO_SAFE                            \
1542         && !coding->safe_charsets[charset])                             \
1543       {                                                                 \
1544         /* CHARSET must not be encoded; emit the substitution code      \
1545            once, or twice when CHARSET_WIDTH is 2.  */                  \
1546         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1547         if (CHARSET_WIDTH (charset) == 2)                               \
1548           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1549         break;                                                          \
1550       }                                                                 \
1551     /* CHARSET is on neither graphic plane.  Emit whatever designation  \
1552        and invocation it needs, then go around again so that one of     \
1553        the branches above produces the character itself.  */            \
1554     dst = encode_invocation_designation (charset, coding, dst);         \
1555   } while (1)
1556
1557 /* Produce codes for a DIMENSION2 character whose character set is
1558    CHARSET and whose position-codes are C1 and C2.  Designation and
1559    invocation codes are also produced in advance if necessary.  */
1560
1561 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1562   do {                                                                  \
1563     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1564       {                                                                 \
1565         /* A single-shift is pending; both bytes belong to it.  */      \
1566         *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
1567                   ? (c1 & 0x7F) : (c1 | 0x80));                         \
1568         *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
1569                   ? (c2 & 0x7F) : (c2 | 0x80));                         \
1570         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1571         break;                                                          \
1572       }                                                                 \
1573     if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
1574       {                                                                 \
1575         *dst++ = c1 & 0x7F;                                             \
1576         *dst++ = c2 & 0x7F;                                             \
1577         break;                                                          \
1578       }                                                                 \
1579     if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
1580       {                                                                 \
1581         *dst++ = c1 | 0x80;                                             \
1582         *dst++ = c2 | 0x80;                                             \
1583         break;                                                          \
1584       }                                                                 \
1585     if (coding->flags & CODING_FLAG_ISO_SAFE                            \
1586         && !coding->safe_charsets[charset])                             \
1587       {                                                                 \
1588         /* Not encodable here: substitute, once per column of width.  */\
1589         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1590         if (CHARSET_WIDTH (charset) == 2)                               \
1591           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1592         break;                                                          \
1593       }                                                                 \
1594     /* CHARSET is on neither graphic plane.  Emit whatever designation  \
1595        and invocation it needs, then go around again so that one of     \
1596        the branches above produces the two bytes.  */                   \
1597     dst = encode_invocation_designation (charset, coding, dst);         \
1598   } while (1)
1599
1600 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                   \
1601   do {                                                          \
1602     int c_alt, charset_alt = charset;                           \
1603     if (! NILP (translation_table))                             \
1604       {                                                         \
1605         c_alt = translate_char (translation_table, -1,          \
1606                                 charset, c1, c2);               \
1607         if (c_alt >= 0)                                         \
1608           SPLIT_CHAR (c_alt, charset_alt, c1, c2);              \
1609       }                                                         \
         /* NOTE(review): the two substitutions below test CHARSET,  \
            not CHARSET_ALT, so they look at the charset before      \
            translation -- confirm that this is intended.  */        \
1610     if (CHARSET_DIMENSION (charset_alt) != 1)                   \
1611       {                                                         \
1612         if (charset == charset_jisx0208                         \
1613             && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)      \
1614           charset_alt = charset_jisx0208_1978;                  \
1615         ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);  \
1616       }                                                         \
1617     else                                                        \
1618       {                                                         \
1619         if (charset == CHARSET_ASCII                            \
1620             && coding->flags & CODING_FLAG_ISO_USE_ROMAN)       \
1621           charset_alt = charset_latin_jisx0201;                 \
1622         ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);      \
1623       }                                                         \
1624     if (! COMPOSING_P (coding->composing))                      \
1625       coding->consumed_char++;                                  \
1626   } while (0)
1627
1628 /* Produce designation and invocation codes at a place pointed by DST
1629    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1630    Return new DST.  */
1631
1632 unsigned char *
1633 encode_invocation_designation (charset, coding, dst)
1634      int charset;
1635      struct coding_system *coding;
1636      unsigned char *dst;
1637 {
1638   /* Graphic register number that holds (or will hold) CHARSET.  */
1639   int reg = 0;
1640
1641   /* Look for a graphic register CHARSET is already designated to.  */
1642   while (reg < 4 && charset != CODING_SPEC_ISO_DESIGNATION (coding, reg))
1643     reg++;
1644
1645   if (reg == 4)
1646     {
1647       /* None found.  Designate CHARSET to the register it requests,
1648          falling back to graphic register 0 when it requests none.  */
1649       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1650       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1651         reg = 0;
1652
1653       ENCODE_DESIGNATION (charset, reg, coding);
1654     }
1655
1656   /* Nothing more to produce when REG is already invoked to graphic
1657      plane 0 or 1.  */
1658   if (CODING_SPEC_ISO_INVOCATION (coding, 0) == reg
1659       || CODING_SPEC_ISO_INVOCATION (coding, 1) == reg)
1660     return dst;
1661
1662   /* Otherwise produce the shift function for REG.  Registers 2 and 3
1663      use a single-shift when the coding system asks for it, and a
1664      locking-shift if not.  */
1665   if (reg == 0)
1666     ENCODE_SHIFT_IN;
1667   else if (reg == 1)
1668     ENCODE_SHIFT_OUT;
1669   else if (reg == 2)
1670     {
1671       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1672         ENCODE_SINGLE_SHIFT_2;
1673       else
1674         ENCODE_LOCKING_SHIFT_2;
1675     }
1676   else if (reg == 3)
1677     {
1678       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1679         ENCODE_SINGLE_SHIFT_3;
1680       else
1681         ENCODE_LOCKING_SHIFT_3;
1682     }
1683
1684   return dst;
1685 }
1690
1691 /* The following three macros produce codes for indicating composition.
        Each writes two bytes through `dst', which must be in scope where
        the macro is used.  The expansion is parenthesized so that it
        remains a single expression in any context.  */
1692 #define ENCODE_COMPOSITION_NO_RULE_START  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
1693 #define ENCODE_COMPOSITION_WITH_RULE_START  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
1694 #define ENCODE_COMPOSITION_END    (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1695
1696 /* The following three macros produce codes for indicating direction
1697    of text.  Each expands to one statement and uses `coding' and
        `dst' from the scope in which it is expanded.  */

     /* Produce the control sequence introducer: ESC [ under
        CODING_FLAG_ISO_SEVEN_BITS, the single byte ISO_CODE_CSI
        otherwise.  The flag is tested as a bit (`&'), the same way
        ENCODE_SINGLE_SHIFT_2 tests it; comparing the whole flag word
        with `==' would succeed only when no other flag is set.  */
1698 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1699   do {                                                  \
1700     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1701       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1702     else                                                \
1703       *dst++ = ISO_CODE_CSI;                            \
1704   } while (0)
1705
     /* Introducer followed by `2' `]'.  Wrapped in its own do-while
        because the introducer is a statement, not an expression, and
        so cannot be the left operand of a comma.  */
1706 #define ENCODE_DIRECTION_R2L                            \
       do {                                                  \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
1707     *dst++ = '2', *dst++ = ']';                         \
       } while (0)
1708
     /* Introducer followed by `0' `]'.  */
1709 #define ENCODE_DIRECTION_L2R                            \
       do {                                                  \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
1710     *dst++ = '0', *dst++ = ']';                         \
       } while (0)
1711
1712 /* Produce codes for designation and invocation to reset the graphic
1713    planes and registers to initial state.  */
1714 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1715   do {                                                                      \
1716     int reg = 0;                                                            \
         /* Graphic plane 0 goes back to graphic register 0.  */                 \
1717     if (! (CODING_SPEC_ISO_INVOCATION (coding, 0) == 0))                    \
1718       ENCODE_SHIFT_IN;                                                      \
         /* Designate again every register that has an initial charset           \
            (value >= 0) and currently holds a different one.  */                \
1719     for (; reg < 4; reg++)                                                  \
1720       if (! (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0          \
1721              || (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)          \
1722                  == CODING_SPEC_ISO_DESIGNATION (coding, reg))))            \
1723         ENCODE_DESIGNATION                                                  \
1724           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1725   } while (0)
1726
1727 /* Produce designation sequences of charsets in the line started from
1728    SRC to a place pointed by *DSTP, and update DSTP.
1729
1730    If the current block ends before any end-of-line, we may fail to
1731    find all the necessary designations.  */
1732
1733 void
1734 encode_designation_at_bol (coding, table, src, src_end, dstp)
1735      struct coding_system *coding;
1736      Lisp_Object table;
1737      unsigned char *src, *src_end, **dstp;
1738 {
1739   int charset, c, found = 0, reg;
1740   /* r[N] is the charset to designate to graphic register N, or -1
1741      while nothing on this line has requested that register.  */
1742   int r[4];
1743   unsigned char *dst = *dstp;
1744   int bytes;
1745
1746   r[0] = r[1] = r[2] = r[3] = -1;
1747
1748   /* Scan up to the end of the line (or of the block), stopping early
1749      once all four registers have been claimed.  */
1750   for (; src < src_end && *src != '\n' && found < 4; src += bytes)
1751     {
1752       bytes = BYTES_BY_CHAR_HEAD (*src);
1753       if (! NILP (table))
1754         {
1755           int c_alt;
1756           unsigned char c1, c2;
1757
1758           SPLIT_STRING(src, bytes, charset, c1, c2);
1759           c_alt = translate_char (table, -1, charset, c1, c2);
1760           if (c_alt >= 0)
1761             charset = CHAR_CHARSET (c_alt);
1762         }
1763       else
1764         charset = CHARSET_AT (src);
1765
1766       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1767       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION || r[reg] >= 0)
1768         continue;
1769       /* The first charset on the line to request REG keeps it.  */
1770       r[reg] = charset;
1771       found++;
1772     }
1773
1774   if (! found)
1775     return;
1776
1777   for (reg = 0; reg < 4; reg++)
1778     if (r[reg] >= 0
1779         && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1780       ENCODE_DESIGNATION (r[reg], reg, coding);
1781   *dstp = dst;
1782 }
1782
1783 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1784
     /* Encode SRC_BYTES bytes of text at SOURCE into DESTINATION as
        ISO2022, following the flags and state kept in *CODING.
        DST_END is taken to be DESTINATION + DST_BYTES; when DST_BYTES
        is zero the loop instead requires DST to stay more than 19
        bytes behind SRC.
        NOTE(review): that is presumably the in-place conversion case;
        confirm against the callers.

        Before returning, CODING->consumed, CODING->consumed_char,
        CODING->produced and CODING->produced_char are set.  The value
        is CODING_FINISH_NORMAL, CODING_FINISH_INSUFFICIENT_SRC or
        CODING_FINISH_INSUFFICIENT_DST.  */
1785 int
1786 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1787      struct coding_system *coding;
1788      unsigned char *source, *destination;
1789      int src_bytes, dst_bytes;
1790 {
1791   unsigned char *src = source;
1792   unsigned char *src_end = source + src_bytes;
1793   unsigned char *dst = destination;
1794   unsigned char *dst_end = destination + dst_bytes;
1795   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1796      from DST_END to assure overflow checking is necessary only at the
1797      head of loop.  */
1798   unsigned char *adjusted_dst_end = dst_end - 19;
1799   Lisp_Object translation_table
1800       = coding->translation_table_for_encode;
1801   int result = CODING_FINISH_NORMAL;
1802
       /* When character translation is enabled but this coding system
          has no table of its own, fall back to the standard table.  */
1803   if (!NILP (Venable_character_translation) && NILP (translation_table))
1804     translation_table = Vstandard_translation_table_for_encode;
1805
1806   coding->consumed_char = 0;
1807   coding->fake_multibyte = 0;
1808   while (src < src_end && (dst_bytes
1809                            ? (dst < adjusted_dst_end)
1810                            : (dst < src - 19)))
1811     {
1812       /* SRC_BASE remembers the start position in source in each loop.
1813          The loop will be exited when there's not enough source text
1814          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1815          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1816          reset to SRC_BASE before exiting.  */
1817       unsigned char *src_base = src;
1818       int charset, c1, c2, c3, c4;
1819
1820       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1821           && CODING_SPEC_ISO_BOL (coding))
1822         {
1823           /* We have to produce designation sequences if any now.  */
1824           encode_designation_at_bol (coding, translation_table,
1825                                      src, src_end, &dst);
1826           CODING_SPEC_ISO_BOL (coding) = 0;
1827         }
1828
1829       c1 = *src++;
1830       /* If we are seeing a component of a composite character, we are
1831          seeing a leading-code encoded irregularly for composition, or
1832          a composition rule if composing with rule.  We must set C1 to
1833          a normal leading-code or an ASCII code.  If we are not seeing
1834          a composite character, we must reset composition,
1835          designation, and invocation states.  */
1836       if (COMPOSING_P (coding->composing))
1837         {
1838           if (c1 < 0xA0)
1839             {
1840               /* We are not in a composite character any longer.  */
1841               coding->composing = COMPOSING_NO;
1842               ENCODE_RESET_PLANE_AND_REGISTER;
1843               ENCODE_COMPOSITION_END;
1844             }
1845           else
1846             {
1847               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1848                 {
                       /* C1 is a composition rule: emit it with the high
                          bit cleared and expect a component next.  */
1849                   *dst++ = c1 & 0x7F;
1850                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1851                   continue;
1852                 }
1853               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1854                 coding->composing = COMPOSING_WITH_RULE_RULE;
1855               if (c1 == 0xA0)
1856                 {
1857                   /* This is an ASCII component.  */
1858                   ONE_MORE_BYTE (c1);
1859                   c1 &= 0x7F;
1860                 }
1861               else
1862                 /* This is a leading-code of non ASCII component.  */
1863                 c1 -= 0x20;
1864             }
1865         }
1866
1867       /* Now encode one character.  C1 is a control character, an
1868          ASCII character, or a leading-code of multi-byte character.  */
1869       switch (emacs_code_class[c1])
1870         {
1871         case EMACS_ascii_code:
1872           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1873           break;
1874
1875         case EMACS_control_code:
1876           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1877             ENCODE_RESET_PLANE_AND_REGISTER;
1878           *dst++ = c1;
1879           coding->consumed_char++;
1880           break;
1881
1882         case EMACS_carriage_return_code:
1883           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
1884             {
1885               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1886                 ENCODE_RESET_PLANE_AND_REGISTER;
1887               *dst++ = c1;
1888               coding->consumed_char++;
1889               break;
1890             }
1891           /* fall down to treat '\r' as '\n' ...  */
1892
1893         case EMACS_linefeed_code:
1894           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1895             ENCODE_RESET_PLANE_AND_REGISTER;
               /* With INIT_AT_BOL, only the recorded designation state is
                  reset here; this statement itself emits no bytes.  */
1896           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1897             bcopy (coding->spec.iso2022.initial_designation,
1898                    coding->spec.iso2022.current_designation,
1899                    sizeof coding->spec.iso2022.initial_designation);
               /* Emit the end-of-line in the form selected by
                  CODING->eol_type: LF, CR LF, or (any other value) CR.  */
1900           if (coding->eol_type == CODING_EOL_LF
1901               || coding->eol_type == CODING_EOL_UNDECIDED)
1902             *dst++ = ISO_CODE_LF;
1903           else if (coding->eol_type == CODING_EOL_CRLF)
1904             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1905           else
1906             *dst++ = ISO_CODE_CR;
1907           CODING_SPEC_ISO_BOL (coding) = 1;
1908           coding->consumed_char++;
1909           break;
1910
1911         case EMACS_leading_code_2:
1912           ONE_MORE_BYTE (c2);
1913           if (c2 < 0xA0)
1914             {
1915               /* invalid sequence */
                   /* Emit C1 unchanged and push the following byte back
                      so that the next iteration examines it.  */
1916               *dst++ = c1;
1917               src--;
1918               coding->consumed_char++;
1919             }
1920           else
1921             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1922           break;
1923
1924         case EMACS_leading_code_3:
1925           TWO_MORE_BYTES (c2, c3);
1926           if (c2 < 0xA0 || c3 < 0xA0)
1927             {
1928               /* invalid sequence */
1929               *dst++ = c1;
1930               src -= 2;
1931               coding->consumed_char++;
1932             }
               /* Below LEADING_CODE_PRIVATE_11, C1 itself is passed as the
                  charset and C2, C3 as position codes; otherwise C2 is
                  passed as the charset and C3 as the only position code.  */
1933           else if (c1 < LEADING_CODE_PRIVATE_11)
1934             ENCODE_ISO_CHARACTER (c1, c2, c3);
1935           else
1936             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1937           break;
1938
1939         case EMACS_leading_code_4:
1940           THREE_MORE_BYTES (c2, c3, c4);
1941           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1942             {
1943               /* invalid sequence */
1944               *dst++ = c1;
1945               src -= 3;
1946               coding->consumed_char++;
1947             }
1948           else
1949             ENCODE_ISO_CHARACTER (c2, c3, c4);
1950           break;
1951
1952         case EMACS_leading_code_composition:
1953           ONE_MORE_BYTE (c2);
1954           if (c2 < 0xA0)
1955             {
1956               /* invalid sequence */
1957               *dst++ = c1;
1958               src--;
1959               coding->consumed_char++;
1960             }
1961           else if (c2 == 0xFF)
1962             {
                   /* A second byte of 0xFF starts a composition with
                      rule; that byte is consumed here.  */
1963               ENCODE_RESET_PLANE_AND_REGISTER;
1964               coding->composing = COMPOSING_WITH_RULE_HEAD;
1965               ENCODE_COMPOSITION_WITH_RULE_START;
1966               coding->consumed_char++;
1967             }
1968           else
1969             {
1970               ENCODE_RESET_PLANE_AND_REGISTER;
1971               /* Rewind one byte because it is a character code of
1972                  composition elements.  */
1973               src--;
1974               coding->composing = COMPOSING_NO_RULE_HEAD;
1975               ENCODE_COMPOSITION_NO_RULE_START;
1976               coding->consumed_char++;
1977             }
1978           break;
1979
1980         case EMACS_invalid_code:
1981           *dst++ = c1;
1982           coding->consumed_char++;
1983           break;
1984         }
1985       continue;
           /* Reached only by a jump to this label.  NOTE(review): the
              jumps are presumably inside ONE_MORE_BYTE and its siblings,
              as the SRC_BASE comment above describes; confirm in their
              definitions.  */
1986     label_end_of_loop:
1987       result = CODING_FINISH_INSUFFICIENT_SRC;
1988       src = src_base;
1989       break;
1990     }
1991
       /* Source is left over although no shortage of source was
          reported, so the loop was stopped by the destination test.  */
1992   if (src < src_end && result == CODING_FINISH_NORMAL)
1993     result = CODING_FINISH_INSUFFICIENT_DST;
1994
1995   /* If this is the last block of the text to be encoded, we must
1996      reset graphic planes and registers to the initial state, and
1997      flush out the carryover if any.  */
1998   if (coding->mode & CODING_MODE_LAST_BLOCK)
1999     {
2000       ENCODE_RESET_PLANE_AND_REGISTER;
2001       if (COMPOSING_P (coding->composing))
2002         ENCODE_COMPOSITION_END;
2003       if (result == CODING_FINISH_INSUFFICIENT_SRC)
2004         {
               /* Copy the incomplete trailing bytes through unencoded,
                  as far as the destination allows.  */
2005           while (src < src_end && dst < dst_end)
2006             *dst++ = *src++;
2007         }
2008     }
2009   coding->consumed = src - source;
2010   coding->produced = coding->produced_char = dst - destination;
2011   return result;
2012 }
2013
2014 \f
2015 /*** 4. SJIS and BIG5 handlers ***/
2016
2017 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2018    quite widely.  So, for the moment, Emacs supports them in the bare
2019    C code.  But, in the future, they may be supported only by CCL.  */
2020
2021 /* SJIS is a coding system encoding three character sets: ASCII, right
2022    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2023    as is.  A character of charset katakana-jisx0201 is encoded by
2024    "position-code + 0x80".  A character of charset japanese-jisx0208
2025    is encoded in 2-byte but two position-codes are divided and shifted
2026    so that it fit in the range below.
2027
2028    --- CODE RANGE of SJIS ---
2029    (character set)      (range)
2030    ASCII                0x00 .. 0x7F
2031    KATAKANA-JISX0201    0xA0 .. 0xDF
2032    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2033             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2034    -------------------------------
2035
2036 */
2037
2038 /* BIG5 is a coding system encoding two character sets: ASCII and
2039    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2040    character set and is encoded in two-byte.
2041
2042    --- CODE RANGE of BIG5 ---
2043    (character set)      (range)
2044    ASCII                0x00 .. 0x7F
2045    Big5 (1st byte)      0xA1 .. 0xFE
2046         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2047    --------------------------
2048
2049    Since the number of characters in Big5 is larger than maximum
2050    characters in Emacs' charset (96x96), it can't be handled as one
2051    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2052    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2053    contains frequently used characters and the latter contains less
2054    frequently used characters.  */
2055
2056 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2057    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2058    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2059    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2060
2061 /* Number of Big5 characters which have the same code in 1st byte.
        This is the count of possible 2nd bytes: 0xFF - 0xA1 = 94 for the
        range 0xA1 .. 0xFE plus 0x7F - 0x40 = 63 for 0x40 .. 0x7E, i.e. 157.  */
2062 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2063
2064 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
2065   do {                                                                  \
2066     unsigned int temp                                                   \
2067       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
2068     if (b1 < 0xC9)                                                      \
2069       charset = charset_big5_1;                                         \
2070     else                                                                \
2071       {                                                                 \
2072         charset = charset_big5_2;                                       \
2073         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
2074       }                                                                 \
2075     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
2076     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
2077   } while (0)
2078
2079 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
2080   do {                                                                  \
2081     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
2082     if (charset == charset_big5_2)                                      \
2083       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
2084     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
2085     b2 = temp % BIG5_SAME_ROW;                                          \
2086     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
2087   } while (0)
2088
/* Produce the decoded form of the character (CHARSET, C1, C2).

   The expansion refers to a variable `translation_table' that must be
   visible where the macro is used.  When that table is non-nil and
   `translate_char' yields a non-negative character for (CHARSET, C1,
   C2), the character is split back with SPLIT_CHAR and the resulting
   charset and position-codes are used instead; C1 and C2 should
   therefore be plain variables (presumably SPLIT_CHAR stores into
   them).  The character is then handed to DECODE_CHARACTER_ASCII,
   DECODE_CHARACTER_DIMENSION1 or DECODE_CHARACTER_DIMENSION2 according
   to the final charset.  C2 is a dummy for one-dimensional charsets.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    int c_alt;                                                          \
    if (!NILP (translation_table))                                      \
      {                                                                 \
        c_alt = translate_char (translation_table, -1,                  \
                                (charset), c1, c2);                     \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) != 1)                      \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
      }                                                                 \
  } while (0)
2103
/* Encode the character (CHARSET, C1, C2) of Emacs' internal format
   and store the result at DST, advancing DST.

   The expansion refers to the variables `translation_table', `dst',
   `coding' and `sjis_p', which must be visible where the macro is
   used.  C1 and C2 must be modifiable variables: they may be
   overwritten by SPLIT_CHAR and are masked with 0x7F for
   two-dimensional charsets.  C2 is a dummy for one-dimensional
   charsets.

   Bytes stored:
     ASCII                                   -- C1 (1 byte)
     JISX0201-Kana when SJIS_P               -- C1 (1 byte)
     other one-dimensional charsets          -- charset, C1 (2 bytes)
     JISX0208 when SJIS_P                    -- SJIS code (2 bytes)
     Big5 (either charset) when not SJIS_P   -- BIG5 code (2 bytes)
     other two-dimensional charsets          -- charset, C1, C2 (3 bytes)

   `coding->fake_multibyte' is set in every case except the one-byte
   cases and the BIG5 case.  `coding->consumed_char' is incremented
   once per use.

   The definition deliberately has no trailing semicolon, so that
   `ENCODE_SJIS_BIG5_CHARACTER (...);' is exactly one statement and
   can be used as the body of an `if' that has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)             \
  do {                                                          \
    int c_alt, charset_alt;                                     \
    if (!NILP (translation_table)                               \
        && ((c_alt = translate_char (translation_table, -1,     \
                                     charset, c1, c2))          \
            >= 0))                                              \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
    else                                                        \
      charset_alt = charset;                                    \
    if (charset_alt == charset_ascii)                           \
      *dst++ = c1;                                              \
    else if (CHARSET_DIMENSION (charset_alt) == 1)              \
      {                                                         \
        if (sjis_p && charset_alt == charset_katakana_jisx0201) \
          *dst++ = c1;                                          \
        else                                                    \
          {                                                     \
            *dst++ = charset_alt, *dst++ = c1;                  \
            coding->fake_multibyte = 1;                         \
          }                                                     \
      }                                                         \
    else                                                        \
      {                                                         \
        c1 &= 0x7F, c2 &= 0x7F;                                 \
        if (sjis_p && charset_alt == charset_jisx0208)          \
          {                                                     \
            unsigned char s1, s2;                               \
                                                                \
            ENCODE_SJIS (c1, c2, s1, s2);                       \
            *dst++ = s1, *dst++ = s2;                           \
            coding->fake_multibyte = 1;                         \
          }                                                     \
        else if (!sjis_p                                        \
                 && (charset_alt == charset_big5_1              \
                     || charset_alt == charset_big5_2))         \
          {                                                     \
            unsigned char b1, b2;                               \
                                                                \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);          \
            *dst++ = b1, *dst++ = b2;                           \
          }                                                     \
        else                                                    \
          {                                                     \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;     \
            coding->fake_multibyte = 1;                         \
          }                                                     \
      }                                                         \
    coding->consumed_char++;                                    \
  } while (0)
2154
2155 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2156    Check if a text is encoded in SJIS.  If it is, return
2157    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2158
2159 int
2160 detect_coding_sjis (src, src_end)
2161      unsigned char *src, *src_end;
2162 {
2163   unsigned char c;
2164
2165   while (src < src_end)
2166     {
2167       c = *src++;
2168       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2169         {
2170           if (src < src_end && *src++ < 0x40)
2171             return 0;
2172         }
2173     }
2174   return CODING_CATEGORY_MASK_SJIS;
2175 }
2176
2177 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2178    Check if a text is encoded in BIG5.  If it is, return
2179    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2180
2181 int
2182 detect_coding_big5 (src, src_end)
2183      unsigned char *src, *src_end;
2184 {
2185   unsigned char c;
2186
2187   while (src < src_end)
2188     {
2189       c = *src++;
2190       if (c >= 0xA1)
2191         {
2192           if (src >= src_end)
2193             break;
2194           c = *src++;
2195           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2196             return 0;
2197         }
2198     }
2199   return CODING_CATEGORY_MASK_BIG5;
2200 }
2201
2202 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2203    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2204
2205 int
2206 decode_coding_sjis_big5 (coding, source, destination,
2207                          src_bytes, dst_bytes, sjis_p)
2208      struct coding_system *coding;
2209      unsigned char *source, *destination;
2210      int src_bytes, dst_bytes;
2211      int sjis_p;
2212 {
2213   unsigned char *src = source;
2214   unsigned char *src_end = source + src_bytes;
2215   unsigned char *dst = destination;
2216   unsigned char *dst_end = destination + dst_bytes;
2217   /* Since the maximum bytes produced by each loop is 4, we subtract 3
2218      from DST_END to assure overflow checking is necessary only at the
2219      head of loop.  */
2220   unsigned char *adjusted_dst_end = dst_end - 3;
2221   Lisp_Object translation_table
2222       = coding->translation_table_for_decode;
2223   int result = CODING_FINISH_NORMAL;
2224
2225   if (!NILP (Venable_character_translation) && NILP (translation_table))
2226     translation_table = Vstandard_translation_table_for_decode;
2227
2228   coding->produced_char = 0;
2229   coding->fake_multibyte = 0;
2230   while (src < src_end && (dst_bytes
2231                            ? (dst < adjusted_dst_end)
2232                            : (dst < src - 3)))
2233     {
2234       /* SRC_BASE remembers the start position in source in each loop.
2235          The loop will be exited when there's not enough source text
2236          to analyze two-byte character (within macro ONE_MORE_BYTE).
2237          In that case, SRC is reset to SRC_BASE before exiting.  */
2238       unsigned char *src_base = src;
2239       unsigned char c1 = *src++, c2, c3, c4;
2240
2241       if (c1 < 0x20)
2242         {
2243           if (c1 == '\r')
2244             {
2245               if (coding->eol_type == CODING_EOL_CRLF)
2246                 {
2247                   ONE_MORE_BYTE (c2);
2248                   if (c2 == '\n')
2249                     *dst++ = c2;
2250                   else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2251                     {
2252                       result = CODING_FINISH_INCONSISTENT_EOL;
2253                       goto label_end_of_loop_2;
2254                     }
2255                   else
2256                     /* To process C2 again, SRC is subtracted by 1.  */
2257                     *dst++ = c1, src--;
2258                 }
2259               else if (coding->eol_type == CODING_EOL_CR)
2260                 *dst++ = '\n';
2261               else
2262                 *dst++ = c1;
2263             }
2264           else if (c1 == '\n'
2265                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2266                    && (coding->eol_type == CODING_EOL_CR
2267                        || coding->eol_type == CODING_EOL_CRLF))
2268             {
2269               result = CODING_FINISH_INCONSISTENT_EOL;
2270               goto label_end_of_loop_2;
2271             }
2272           else
2273             *dst++ = c1;
2274           coding->produced_char++;
2275         }
2276       else if (c1 < 0x80)
2277         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2278       else
2279         {
2280           if (sjis_p)
2281             {
2282               if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
2283                 {
2284                   /* SJIS -> JISX0208 */
2285                   ONE_MORE_BYTE (c2);
2286                   if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
2287                     {
2288                       DECODE_SJIS (c1, c2, c3, c4);
2289                       DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2290                     }
2291                   else
2292                     goto label_invalid_code_2;
2293                 }
2294               else if (c1 < 0xE0)
2295                 /* SJIS -> JISX0201-Kana */
2296                 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2297                                             /* dummy */ c2);
2298               else
2299                 goto label_invalid_code_1;
2300             }
2301           else
2302             {
2303               /* BIG5 -> Big5 */
2304               if (c1 >= 0xA1 && c1 <= 0xFE)
2305                 {
2306                   ONE_MORE_BYTE (c2);
2307                   if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2308                     {
2309                       int charset;
2310
2311                       DECODE_BIG5 (c1, c2, charset, c3, c4);
2312                       DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2313                     }
2314                   else
2315                     goto label_invalid_code_2;
2316                 }
2317               else
2318                 goto label_invalid_code_1;
2319             }
2320         }
2321       continue;
2322
2323     label_invalid_code_1:
2324       *dst++ = c1;
2325       coding->produced_char++;
2326       coding->fake_multibyte = 1;
2327       continue;
2328
2329     label_invalid_code_2:
2330       *dst++ = c1; *dst++= c2;
2331       coding->produced_char += 2;
2332       coding->fake_multibyte = 1;
2333       continue;
2334
2335     label_end_of_loop:
2336       result = CODING_FINISH_INSUFFICIENT_SRC;
2337     label_end_of_loop_2:
2338       src = src_base;
2339       break;
2340     }
2341
2342   if (src < src_end)
2343     {
2344       if (result == CODING_FINISH_NORMAL)
2345         result = CODING_FINISH_INSUFFICIENT_DST;
2346       else if (result != CODING_FINISH_INCONSISTENT_EOL
2347                && coding->mode & CODING_MODE_LAST_BLOCK)
2348         {
2349           src_bytes = src_end - src;
2350           if (dst_bytes && (dst_end - dst < src_bytes))
2351             src_bytes = dst_end - dst;
2352           bcopy (dst, src, src_bytes);
2353           src += src_bytes;
2354           dst += src_bytes;
2355           coding->fake_multibyte = 1;
2356         }
2357     }
2358
2359   coding->consumed = coding->consumed_char = src - source;
2360   coding->produced = dst - destination;
2361   return result;
2362 }
2363
2364 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2365    This function can encode `charset_ascii', `charset_katakana_jisx0201',
2366    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
2367    sure that all these charsets are registered as official charset
2368    (i.e. do not have extended leading-codes).  Characters of other
2369    charsets are produced without any encoding.  If SJIS_P is 1, encode
2370    SJIS text, else encode BIG5 text.  */
2371
2372 int
2373 encode_coding_sjis_big5 (coding, source, destination,
2374                          src_bytes, dst_bytes, sjis_p)
2375      struct coding_system *coding;
2376      unsigned char *source, *destination;
2377      int src_bytes, dst_bytes;
2378      int sjis_p;
2379 {
2380   unsigned char *src = source;
2381   unsigned char *src_end = source + src_bytes;
2382   unsigned char *dst = destination;
2383   unsigned char *dst_end = destination + dst_bytes;
2384   /* Since the maximum bytes produced by each loop is 2, we subtract 1
2385      from DST_END to assure overflow checking is necessary only at the
2386      head of loop.  */
2387   unsigned char *adjusted_dst_end = dst_end - 1;
2388   Lisp_Object translation_table
2389       = coding->translation_table_for_encode;
2390   int result = CODING_FINISH_NORMAL;
2391
2392   if (!NILP (Venable_character_translation) && NILP (translation_table))
2393     translation_table = Vstandard_translation_table_for_encode;
2394
2395   coding->consumed_char = 0;
2396   coding->fake_multibyte = 0;
2397   while (src < src_end && (dst_bytes
2398                            ? (dst < adjusted_dst_end)
2399                            : (dst < src - 1)))
2400     {
2401       /* SRC_BASE remembers the start position in source in each loop.
2402          The loop will be exited when there's not enough source text
2403          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2404          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
2405          before exiting.  */
2406       unsigned char *src_base = src;
2407       unsigned char c1 = *src++, c2, c3, c4;
2408
2409       if (coding->composing)
2410         {
2411           if (c1 == 0xA0)
2412             {
2413               ONE_MORE_BYTE (c1);
2414               c1 &= 0x7F;
2415             }
2416           else if (c1 >= 0xA0)
2417             c1 -= 0x20;
2418           else
2419             coding->composing = 0;
2420         }
2421
2422       switch (emacs_code_class[c1])
2423         {
2424         case EMACS_ascii_code:
2425           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2426           break;
2427
2428         case EMACS_control_code:
2429           *dst++ = c1;
2430           coding->consumed_char++;
2431           break;
2432
2433         case EMACS_carriage_return_code:
2434           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2435             {
2436               *dst++ = c1;
2437               coding->consumed_char++;
2438               break;
2439             }
2440           /* fall down to treat '\r' as '\n' ...  */
2441
2442         case EMACS_linefeed_code:
2443           if (coding->eol_type == CODING_EOL_LF
2444               || coding->eol_type == CODING_EOL_UNDECIDED)
2445             *dst++ = '\n';
2446           else if (coding->eol_type == CODING_EOL_CRLF)
2447             *dst++ = '\r', *dst++ = '\n';
2448           else
2449             *dst++ = '\r';
2450           coding->consumed_char++;
2451           break;
2452
2453         case EMACS_leading_code_2:
2454           ONE_MORE_BYTE (c2);
2455           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2456           break;
2457
2458         case EMACS_leading_code_3:
2459           TWO_MORE_BYTES (c2, c3);
2460           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2461           break;
2462
2463         case EMACS_leading_code_4:
2464           THREE_MORE_BYTES (c2, c3, c4);
2465           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2466           break;
2467
2468         case EMACS_leading_code_composition:
2469           coding->composing = 1;
2470           break;
2471
2472         default:                /* i.e. case EMACS_invalid_code: */
2473           *dst++ = c1;
2474           coding->consumed_char++;
2475         }
2476       continue;
2477
2478     label_end_of_loop:
2479       result = CODING_FINISH_INSUFFICIENT_SRC;
2480       src = src_base;
2481       break;
2482     }
2483
2484   if (result == CODING_FINISH_NORMAL
2485       && src < src_end)
2486     result = CODING_FINISH_INSUFFICIENT_DST;
2487   coding->consumed = src - source;
2488   coding->produced = coding->produced_char = dst - destination;
2489   return result;
2490 }
2491
2492 \f
2493 /*** 5. CCL handlers ***/
2494
2495 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2496    Check if a text is encoded in a coding system of which
2497    encoder/decoder are written in CCL program.  If it is, return
2498    CODING_CATEGORY_MASK_CCL, else return 0.  */
2499
2500 int
2501 detect_coding_ccl (src, src_end)
2502      unsigned char *src, *src_end;
2503 {
2504   unsigned char *valid;
2505
2506   /* No coding system is assigned to coding-category-ccl.  */
2507   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2508     return 0;
2509
2510   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2511   while (src < src_end)
2512     {
2513       if (! valid[*src]) return 0;
2514       src++;
2515     }
2516   return CODING_CATEGORY_MASK_CCL;
2517 }
2518
2519 \f
2520 /*** 6. End-of-line handlers ***/
2521
2522 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2523    This function is called only when `coding->eol_type' is
2524    CODING_EOL_CRLF or CODING_EOL_CR.  */
2525
2526 int
2527 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2528      struct coding_system *coding;
2529      unsigned char *source, *destination;
2530      int src_bytes, dst_bytes;
2531 {
2532   unsigned char *src = source;
2533   unsigned char *src_end = source + src_bytes;
2534   unsigned char *dst = destination;
2535   unsigned char *dst_end = destination + dst_bytes;
2536   unsigned char c;
2537   int result = CODING_FINISH_NORMAL;
2538
2539   coding->fake_multibyte = 0;
2540
2541   if (src_bytes <= 0)
2542     return result;
2543
2544   switch (coding->eol_type)
2545     {
2546     case CODING_EOL_CRLF:
2547       {
2548         /* Since the maximum bytes produced by each loop is 2, we
2549            subtract 1 from DST_END to assure overflow checking is
2550            necessary only at the head of loop.  */
2551         unsigned char *adjusted_dst_end = dst_end - 1;
2552
2553         while (src < src_end && (dst_bytes
2554                                  ? (dst < adjusted_dst_end)
2555                                  : (dst < src - 1)))
2556           {
2557             unsigned char *src_base = src;
2558
2559             c = *src++;
2560             if (c == '\r')
2561               {
2562                 ONE_MORE_BYTE (c);
2563                 if (c == '\n')
2564                   *dst++ = c;
2565                 else
2566                   {
2567                     if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2568                       {
2569                         result = CODING_FINISH_INCONSISTENT_EOL;
2570                         goto label_end_of_loop_2;
2571                       }
2572                     src--;
2573                     *dst++ = '\r';
2574                     if (BASE_LEADING_CODE_P (c))
2575                       coding->fake_multibyte = 1;
2576                   }
2577               }
2578             else if (c == '\n'
2579                      && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2580               {
2581                 result = CODING_FINISH_INCONSISTENT_EOL;
2582                 goto label_end_of_loop_2;
2583               }
2584             else
2585               {
2586                 *dst++ = c;
2587                 if (BASE_LEADING_CODE_P (c))
2588                   coding->fake_multibyte = 1;
2589               }
2590             continue;
2591
2592           label_end_of_loop:
2593             result = CODING_FINISH_INSUFFICIENT_SRC;
2594           label_end_of_loop_2:
2595             src = src_base;
2596             break;
2597           }
2598         if (src < src_end)
2599           {
2600             if (result == CODING_FINISH_NORMAL)
2601               result = CODING_FINISH_INSUFFICIENT_DST;
2602             else if (result != CODING_FINISH_INCONSISTENT_EOL
2603                      && coding->mode & CODING_MODE_LAST_BLOCK)
2604               {
2605                 /* This is the last block of the text to be decoded.
2606                    We flush out all remaining codes.  */
2607                 src_bytes = src_end - src;
2608                 if (dst_bytes && (dst_end - dst < src_bytes))
2609                   src_bytes = dst_end - dst;
2610                 bcopy (src, dst, src_bytes);
2611                 dst += src_bytes;
2612                 src += src_bytes;
2613               }
2614           }
2615       }
2616       break;
2617
2618     case CODING_EOL_CR:
2619       if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2620         {
2621           while (src < src_end)
2622             {
2623               if ((c = *src++) == '\n')
2624                 break;
2625               if (BASE_LEADING_CODE_P (c))
2626                 coding->fake_multibyte = 1;
2627             }
2628           if (*--src == '\n')
2629             {
2630               src_bytes = src - source;
2631               result = CODING_FINISH_INCONSISTENT_EOL;
2632             }
2633         }
2634       if (dst_bytes && src_bytes > dst_bytes)
2635         {
2636           result = CODING_FINISH_INSUFFICIENT_DST;
2637           src_bytes = dst_bytes;
2638         }
2639       if (dst_bytes)
2640         bcopy (source, destination, src_bytes);
2641       else
2642         safe_bcopy (source, destination, src_bytes);
2643       src = source + src_bytes;
2644       while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2645       break;
2646
2647     default:                    /* i.e. case: CODING_EOL_LF */
2648       if (dst_bytes && src_bytes > dst_bytes)
2649         {
2650           result = CODING_FINISH_INSUFFICIENT_DST;
2651           src_bytes = dst_bytes;
2652         }
2653       if (dst_bytes)
2654         bcopy (source, destination, src_bytes);
2655       else
2656         safe_bcopy (source, destination, src_bytes);
2657       src += src_bytes;
2658       dst += src_bytes;
2659       coding->fake_multibyte = 1;
2660       break;
2661     }
2662
2663   coding->consumed = coding->consumed_char = src - source;
2664   coding->produced = coding->produced_char = dst - destination;
2665   return result;
2666 }
2667
2668 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2669    format of end-of-line according to `coding->eol_type'.  If
2670    `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2671    '\r' in source text also means end-of-line.  */
2672
2673 int
2674 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2675      struct coding_system *coding;
2676      unsigned char *source, *destination;
2677      int src_bytes, dst_bytes;
2678 {
2679   unsigned char *src = source;
2680   unsigned char *dst = destination;
2681   int result = CODING_FINISH_NORMAL;
2682
2683   coding->fake_multibyte = 0;
2684
2685   if (coding->eol_type == CODING_EOL_CRLF)
2686     {
2687       unsigned char c;
2688       unsigned char *src_end = source + src_bytes;
2689       unsigned char *dst_end = destination + dst_bytes;
2690       /* Since the maximum bytes produced by each loop is 2, we
2691          subtract 1 from DST_END to assure overflow checking is
2692          necessary only at the head of loop.  */
2693       unsigned char *adjusted_dst_end = dst_end - 1;
2694
2695       while (src < src_end && (dst_bytes
2696                                ? (dst < adjusted_dst_end)
2697                                : (dst < src - 1)))
2698         {
2699           c = *src++;
2700           if (c == '\n'
2701               || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2702             *dst++ = '\r', *dst++ = '\n';
2703           else
2704             {
2705               *dst++ = c;
2706               if (BASE_LEADING_CODE_P (c))
2707                 coding->fake_multibyte = 1;
2708             }
2709         }
2710       if (src < src_end)
2711         result = CODING_FINISH_INSUFFICIENT_DST;
2712     }
2713   else
2714     {
2715       unsigned char c;
2716
2717       if (dst_bytes && src_bytes > dst_bytes)
2718         {
2719           src_bytes = dst_bytes;
2720           result = CODING_FINISH_INSUFFICIENT_DST;
2721         }
2722       if (dst_bytes)
2723         bcopy (source, destination, src_bytes);
2724       else
2725         safe_bcopy (source, destination, src_bytes);
2726       dst_bytes = src_bytes;
2727       if (coding->eol_type == CODING_EOL_CR)
2728         {
2729           while (src_bytes--)
2730             {
2731               if ((c = *dst++) == '\n')
2732                 dst[-1] = '\r';
2733               else if (BASE_LEADING_CODE_P (c))
2734                 coding->fake_multibyte = 1;
2735             }
2736         }
2737       else
2738         {
2739           if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2740             {
2741               while (src_bytes--)
2742                 if (*dst++ == '\r') dst[-1] = '\n';
2743             }
2744           coding->fake_multibyte = 1;
2745         }
2746       src = source + dst_bytes;
2747       dst = destination + dst_bytes;
2748     }
2749
2750   coding->consumed = coding->consumed_char = src - source;
2751   coding->produced = coding->produced_char = dst - destination;
2752   return result;
2753 }
2754
2755 \f
2756 /*** 7. C library functions ***/
2757
2758 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2759    has a property `coding-system'.  The value of this property is a
2760    vector of length 5 (called the coding-vector).  Among elements of
2761    this vector, the first (element[0]) and the fifth (element[4])
2762    carry important information for decoding/encoding.  Before
2763    decoding/encoding, this information should be set in fields of a
2764    structure of type `coding_system'.
2765
2766    A value of property `coding-system' can be a symbol of another
2767    subsidiary coding-system.  In that case, Emacs gets coding-vector
2768    from that symbol.
2769
2770    `element[0]' contains information to be set in `coding->type'.  The
2771    value and its meaning is as follows:
2772
2773    0 -- coding_type_emacs_mule
2774    1 -- coding_type_sjis
2775    2 -- coding_type_iso2022
2776    3 -- coding_type_big5
2777    4 -- coding_type_ccl encoder/decoder written in CCL
2778    nil -- coding_type_no_conversion
2779    t -- coding_type_undecided (automatic conversion on decoding,
2780                                no-conversion on encoding)
2781
2782    `element[4]' contains information to be set in `coding->flags' and
2783    `coding->spec'.  The meaning varies by `coding->type'.
2784
2785    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2786    of length 32 (of which the first 13 sub-elements are used now).
2787    Meanings of these sub-elements are:
2788
2789    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2790         If the value is an integer of valid charset, the charset is
2791         assumed to be designated to graphic register N initially.
2792
2793         If the value is negative, it is the negated value of a charset
2794         which reserves graphic register N, which means that the charset
2795         is not designated initially but should be designated to graphic
2796         register N just before encoding a character in that charset.
2797
2798         If the value is nil, graphic register N is never used on
2799         encoding.
2800
2801    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2802         Each value takes t or nil.  See the section ISO2022 of
2803         `coding.h' for more information.
2804
2805    If `coding->type' is `coding_type_big5', element[4] is t to denote
2806    BIG5-ETen or nil to denote BIG5-HKU.
2807
2808    If `coding->type' takes any other value, element[4] is ignored.
2809
2810    Emacs Lisp's coding system also carries information about format of
2811    end-of-line in a value of property `eol-type'.  If the value is
2812    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2813    means CODING_EOL_CR.  If it is not integer, it should be a vector
2814    of subsidiary coding systems of which property `eol-type' has one
2815    of above values.
2816
2817 */
2818
2819 /* Extract information for decoding/encoding from CODING_SYSTEM
2820    and set it in CODING.  If CODING_SYSTEM is invalid, CODING
2821    is set up so that no conversion is necessary and -1 is returned;
2822    otherwise 0 is returned.  */
2823
2824 int
2825 setup_coding_system (coding_system, coding)
2826      Lisp_Object coding_system;
2827      struct coding_system *coding;
2828 {
2829   Lisp_Object coding_spec, coding_type, eol_type, plist;
2830   Lisp_Object val;
2831   int i;
2832
2833   /* Initialize some fields required for all kinds of coding systems.  */
2834   coding->symbol = coding_system;
2835   coding->common_flags = 0;
2836   coding->mode = 0;
2837   coding->heading_ascii = -1;
2838   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2839   coding_spec = Fget (coding_system, Qcoding_system);
2840   if (!VECTORP (coding_spec)
2841       || XVECTOR (coding_spec)->size != 5
2842       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2843     goto label_invalid_coding_system;
2844
2845   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2846   if (VECTORP (eol_type))
2847     {
2848       coding->eol_type = CODING_EOL_UNDECIDED;
2849       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2850     }
2851   else if (XFASTINT (eol_type) == 1)
2852     {
2853       coding->eol_type = CODING_EOL_CRLF;
2854       coding->common_flags
2855         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2856     }
2857   else if (XFASTINT (eol_type) == 2)
2858     {
2859       coding->eol_type = CODING_EOL_CR;
2860       coding->common_flags
2861         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2862     }
2863   else
2864     coding->eol_type = CODING_EOL_LF;
2865
2866   coding_type = XVECTOR (coding_spec)->contents[0];
2867   /* Try short cut.  */
2868   if (SYMBOLP (coding_type))
2869     {
2870       if (EQ (coding_type, Qt))
2871         {
2872           coding->type = coding_type_undecided;
2873           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2874         }
2875       else
2876         coding->type = coding_type_no_conversion;
2877       return 0;
2878     }
2879
2880   /* Initialize remaining fields.  */
2881   coding->composing = 0;
2882
2883   /* Get values of coding system properties:
2884      `post-read-conversion', `pre-write-conversion',
2885      `translation-table-for-decode', `translation-table-for-encode'.  */
2886   plist = XVECTOR (coding_spec)->contents[3];
2887   coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2888   coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2889   val = Fplist_get (plist, Qtranslation_table_for_decode);
2890   if (SYMBOLP (val))
2891     val = Fget (val, Qtranslation_table_for_decode);
2892   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2893   val = Fplist_get (plist, Qtranslation_table_for_encode);
2894   if (SYMBOLP (val))
2895     val = Fget (val, Qtranslation_table_for_encode);
2896   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2897   val = Fplist_get (plist, Qcoding_category);
2898   if (!NILP (val))
2899     {
2900       val = Fget (val, Qcoding_category_index);
2901       if (INTEGERP (val))
2902         coding->category_idx = XINT (val);
2903       else
2904         goto label_invalid_coding_system;
2905     }
2906   else
2907     goto label_invalid_coding_system;
2908
2909   val = Fplist_get (plist, Qsafe_charsets);
2910   if (EQ (val, Qt))
2911     {
2912       for (i = 0; i <= MAX_CHARSET; i++)
2913         coding->safe_charsets[i] = 1;
2914     }
2915   else
2916     {
2917       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2918       while (CONSP (val))
2919         {
2920           if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2921             coding->safe_charsets[i] = 1;
2922           val = XCONS (val)->cdr;
2923         }
2924     }
2925
2926   switch (XFASTINT (coding_type))
2927     {
2928     case 0:
2929       coding->type = coding_type_emacs_mule;
2930       if (!NILP (coding->post_read_conversion))
2931         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2932       if (!NILP (coding->pre_write_conversion))
2933         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2934       break;
2935
2936     case 1:
2937       coding->type = coding_type_sjis;
2938       coding->common_flags
2939         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2940       break;
2941
2942     case 2:
2943       coding->type = coding_type_iso2022;
2944       coding->common_flags
2945         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2946       {
2947         Lisp_Object val, temp;
2948         Lisp_Object *flags;
2949         int i, charset, reg_bits = 0;
2950
2951         val = XVECTOR (coding_spec)->contents[4];
2952
2953         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2954           goto label_invalid_coding_system;
2955
2956         flags = XVECTOR (val)->contents;
2957         coding->flags
2958           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2959              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2960              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2961              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2962              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2963              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2964              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2965              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2966              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2967              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2968              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2969              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2970              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2971              );
2972
2973         /* Invoke graphic register 0 to plane 0.  */
2974         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2975         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2976         CODING_SPEC_ISO_INVOCATION (coding, 1)
2977           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2978         /* Not single shifting at first.  */
2979         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2980         /* Beginning of buffer should also be regarded as bol. */
2981         CODING_SPEC_ISO_BOL (coding) = 1;
2982
2983         for (charset = 0; charset <= MAX_CHARSET; charset++)
2984           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2985         val = Vcharset_revision_alist;
2986         while (CONSP (val))
2987           {
2988             charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2989             if (charset >= 0
2990                 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2991                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2992               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2993             val = XCONS (val)->cdr;
2994           }
2995
2996         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2997            FLAGS[REG] can be one of below:
2998                 integer CHARSET: CHARSET occupies register I,
2999                 t: designate nothing to REG initially, but can be used
3000                   by any charsets,
3001                 list of integer, nil, or t: designate the first
3002                   element (if integer) to REG initially, the remaining
3003                   elements (if integer) is designated to REG on request,
3004                   if an element is t, REG can be used by any charsets,
3005                 nil: REG is never used.  */
3006         for (charset = 0; charset <= MAX_CHARSET; charset++)
3007           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3008             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3009         for (i = 0; i < 4; i++)
3010           {
3011             if (INTEGERP (flags[i])
3012                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3013                 || (charset = get_charset_id (flags[i])) >= 0)
3014               {
3015                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3016                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3017               }
3018             else if (EQ (flags[i], Qt))
3019               {
3020                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3021                 reg_bits |= 1 << i;
3022                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3023               }
3024             else if (CONSP (flags[i]))
3025               {
3026                 Lisp_Object tail;
3027                 tail = flags[i];
3028
3029                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3030                 if (INTEGERP (XCONS (tail)->car)
3031                     && (charset = XINT (XCONS (tail)->car),
3032                         CHARSET_VALID_P (charset))
3033                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
3034                   {
3035                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3036                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3037                   }
3038                 else
3039                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3040                 tail = XCONS (tail)->cdr;
3041                 while (CONSP (tail))
3042                   {
3043                     if (INTEGERP (XCONS (tail)->car)
3044                         && (charset = XINT (XCONS (tail)->car),
3045                             CHARSET_VALID_P (charset))
3046                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
3047                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3048                         = i;
3049                     else if (EQ (XCONS (tail)->car, Qt))
3050                       reg_bits |= 1 << i;
3051                     tail = XCONS (tail)->cdr;
3052                   }
3053               }
3054             else
3055               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3056
3057             CODING_SPEC_ISO_DESIGNATION (coding, i)
3058               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3059           }
3060
3061         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3062           {
3063             /* REG 1 can be used only by locking shift in 7-bit env.  */
3064             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3065               reg_bits &= ~2;
3066             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3067               /* Without any shifting, only REG 0 and 1 can be used.  */
3068               reg_bits &= 3;
3069           }
3070
3071         if (reg_bits)
3072           for (charset = 0; charset <= MAX_CHARSET; charset++)
3073             {
3074               if (CHARSET_VALID_P (charset))
3075                 {
3076                   /* There exist some default graphic registers to be
3077                      used CHARSET.  */
3078
3079                   /* We had better avoid designating a charset of
3080                      CHARS96 to REG 0 as far as possible.  */
3081                   if (CHARSET_CHARS (charset) == 96)
3082                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3083                       = (reg_bits & 2
3084                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3085                   else
3086                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3087                       = (reg_bits & 1
3088                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3089                 }
3090             }
3091       }
3092       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3093       coding->spec.iso2022.last_invalid_designation_register = -1;
3094       break;
3095
3096     case 3:
3097       coding->type = coding_type_big5;
3098       coding->common_flags
3099         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3100       coding->flags
3101         = (NILP (XVECTOR (coding_spec)->contents[4])
3102            ? CODING_FLAG_BIG5_HKU
3103            : CODING_FLAG_BIG5_ETEN);
3104       break;
3105
3106     case 4:
3107       coding->type = coding_type_ccl;
3108       coding->common_flags
3109         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3110       {
3111         Lisp_Object val;
3112         Lisp_Object decoder, encoder;
3113
3114         val = XVECTOR (coding_spec)->contents[4];
3115         if (CONSP  (val)
3116             && SYMBOLP (XCONS (val)->car)
3117             && !NILP (decoder = Fget (XCONS (val)->car, Qccl_program_idx))
3118             && !NILP (decoder = Fcdr (Faref (Vccl_program_table, decoder)))
3119             && SYMBOLP (XCONS (val)->cdr)
3120             && !NILP (encoder = Fget (XCONS (val)->cdr, Qccl_program_idx))
3121             && !NILP (encoder = Fcdr (Faref (Vccl_program_table, encoder))))
3122           {
3123             setup_ccl_program (&(coding->spec.ccl.decoder), decoder);
3124             setup_ccl_program (&(coding->spec.ccl.encoder), encoder);
3125           }
3126         else
3127           goto label_invalid_coding_system;
3128
3129         bzero (coding->spec.ccl.valid_codes, 256);
3130         val = Fplist_get (plist, Qvalid_codes);
3131         if (CONSP (val))
3132           {
3133             Lisp_Object this;
3134
3135             for (; CONSP (val); val = XCONS (val)->cdr)
3136               {
3137                 this = XCONS (val)->car;
3138                 if (INTEGERP (this)
3139                     && XINT (this) >= 0 && XINT (this) < 256)
3140                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3141                 else if (CONSP (this)
3142                          && INTEGERP (XCONS (this)->car)
3143                          && INTEGERP (XCONS (this)->cdr))
3144                   {
3145                     int start = XINT (XCONS (this)->car);
3146                     int end = XINT (XCONS (this)->cdr);
3147
3148                     if (start >= 0 && start <= end && end < 256)
3149                       while (start <= end)
3150                         coding->spec.ccl.valid_codes[start++] = 1;
3151                   }
3152               }
3153           }
3154       }
3155       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3156       break;
3157
3158     case 5:
3159       coding->type = coding_type_raw_text;
3160       break;
3161
3162     default:
3163       goto label_invalid_coding_system;
3164     }
3165   return 0;
3166
3167  label_invalid_coding_system:
3168   coding->type = coding_type_no_conversion;
3169   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3170   coding->common_flags = 0;
3171   coding->eol_type = CODING_EOL_LF;
3172   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3173   return -1;
3174 }
3175
3176 /* Setup raw-text or one of its subsidiaries in the structure
3177    coding_system CODING according to the already setup value eol_type
3178    in CODING.  CODING should be setup for some coding system in
3179    advance.  */
3180
3181 void
3182 setup_raw_text_coding_system (coding)
3183      struct coding_system *coding;
3184 {
3185   if (coding->type != coding_type_raw_text)
3186     {
3187       coding->symbol = Qraw_text;
3188       coding->type = coding_type_raw_text;
3189       if (coding->eol_type != CODING_EOL_UNDECIDED)
3190         {
3191           Lisp_Object subsidiaries;
3192           subsidiaries = Fget (Qraw_text, Qeol_type);
3193
3194           if (VECTORP (subsidiaries)
3195               && XVECTOR (subsidiaries)->size == 3)
3196             coding->symbol
3197               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3198         }
3199     }
3200   return;
3201 }
3202
3203 /* Emacs has a mechanism to automatically detect a coding system if it
3204    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3205    it's impossible to distinguish some coding systems accurately
3206    because they use the same range of codes.  So, at first, coding
3207    systems are categorized into the following categories:
3208
3209    o coding-category-emacs-mule
3210
3211         The category for a coding system which has the same code range
3212         as Emacs' internal format.  Assigned the coding-system (Lisp
3213         symbol) `emacs-mule' by default.
3214
3215    o coding-category-sjis
3216
3217         The category for a coding system which has the same code range
3218         as SJIS.  Assigned the coding-system (Lisp
3219         symbol) `japanese-shift-jis' by default.
3220
3221    o coding-category-iso-7
3222
3223         The category for a coding system which has the same code range
3224         as ISO2022 of 7-bit environment.  This doesn't use any locking
3225         shift and single shift functions.  This can encode/decode all
3226         charsets.  Assigned the coding-system (Lisp symbol)
3227         `iso-2022-7bit' by default.
3228
3229    o coding-category-iso-7-tight
3230
3231         Same as coding-category-iso-7 except that this can
3232         encode/decode only the specified charsets.
3233
3234    o coding-category-iso-8-1
3235
3236         The category for a coding system which has the same code range
3237         as ISO2022 of 8-bit environment and graphic plane 1 used only
3238         for DIMENSION1 charset.  This doesn't use any locking shift
3239         and single shift functions.  Assigned the coding-system (Lisp
3240         symbol) `iso-latin-1' by default.
3241
3242    o coding-category-iso-8-2
3243
3244         The category for a coding system which has the same code range
3245         as ISO2022 of 8-bit environment and graphic plane 1 used only
3246         for DIMENSION2 charset.  This doesn't use any locking shift
3247         and single shift functions.  Assigned the coding-system (Lisp
3248         symbol) `japanese-iso-8bit' by default.
3249
3250    o coding-category-iso-7-else
3251
3252         The category for a coding system which has the same code range
3253         as ISO2022 of 7-bit environment but uses locking shift or
3254         single shift functions.  Assigned the coding-system (Lisp
3255         symbol) `iso-2022-7bit-lock' by default.
3256
3257    o coding-category-iso-8-else
3258
3259         The category for a coding system which has the same code range
3260         as ISO2022 of 8-bit environment but uses locking shift or
3261         single shift functions.  Assigned the coding-system (Lisp
3262         symbol) `iso-2022-8bit-ss2' by default.
3263
3264    o coding-category-big5
3265
3266         The category for a coding system which has the same code range
3267         as BIG5.  Assigned the coding-system (Lisp symbol)
3268         `cn-big5' by default.
3269
3270    o coding-category-ccl
3271
3272         The category for a coding system of which encoder/decoder is
3273         written in CCL programs.  The default value is nil, i.e., no
3274         coding system is assigned.
3275
3276    o coding-category-binary
3277
3278         The category for a coding system not categorized in any of the
3279         above.  Assigned the coding-system (Lisp symbol)
3280         `no-conversion' by default.
3281
3282    Each of them is a Lisp symbol whose value is an actual
3283    `coding-system' (also a Lisp symbol) assigned by a user.
3284    What Emacs does actually is to detect a category of coding system.
3285    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3286    decide only one possible category, it selects a category of the
3287    highest priority.  Priorities of categories are also specified by a
3288    user in a Lisp variable `coding-category-list'.
3289
3290 */
3291
3292 static
3293 int ascii_skip_code[256];
3294
3295 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3296    If it detects possible coding systems, return an integer in which
3297    appropriate flag bits are set.  Flag bits are defined by macros
3298    CODING_CATEGORY_MASK_XXX in `coding.h'.
3299
3300    How many ASCII characters are at the head is returned as *SKIP.  */
3301
3302 static int
3303 detect_coding_mask (source, src_bytes, priorities, skip)
3304      unsigned char *source;
3305      int src_bytes, *priorities, *skip;
3306 {
3307   register unsigned char c;
3308   unsigned char *src = source, *src_end = source + src_bytes;
3309   unsigned int mask;
3310   int i;
3311
3312   /* At first, skip all ASCII characters and control characters except
3313      for three ISO2022 specific control characters.  */
3314   ascii_skip_code[ISO_CODE_SO] = 0;
3315   ascii_skip_code[ISO_CODE_SI] = 0;
3316   ascii_skip_code[ISO_CODE_ESC] = 0;
3317
3318  label_loop_detect_coding:
3319   while (src < src_end && ascii_skip_code[*src]) src++;
3320   *skip = src - source;
3321
3322   if (src >= src_end)
3323     /* We found nothing other than ASCII.  There's nothing to do.  */
3324     return 0;
3325
3326   c = *src;
3327   /* The text seems to be encoded in some multilingual coding system.
3328      Now, try to find in which coding system the text is encoded.  */
3329   if (c < 0x80)
3330     {
3331       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3332       /* C is an ISO2022 specific control code of C0.  */
3333       mask = detect_coding_iso2022 (src, src_end);
3334       if (mask == 0)
3335         {
3336           /* No valid ISO2022 code follows C.  Try again.  */
3337           src++;
3338           if (c == ISO_CODE_ESC)
3339             ascii_skip_code[ISO_CODE_ESC] = 1;
3340           else
3341             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3342           goto label_loop_detect_coding;
3343         }
3344       if (priorities)
3345         goto label_return_highest_only;
3346     }
3347   else
3348     {
3349       int try;
3350
3351       if (c < 0xA0)
3352         {
3353           /* C is the first byte of SJIS character code,
3354              or a leading-code of Emacs' internal format (emacs-mule).  */
3355           try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3356
3357           /* Or, if C is a special latin extra code,
3358              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3359              or is an ISO2022 control-sequence-introducer (CSI),
3360              we should also consider the possibility of ISO2022 codings.  */
3361           if ((VECTORP (Vlatin_extra_code_table)
3362                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3363               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3364               || (c == ISO_CODE_CSI
3365                   && (src < src_end
3366                       && (*src == ']'
3367                           || ((*src == '0' || *src == '1' || *src == '2')
3368                               && src + 1 < src_end
3369                               && src[1] == ']')))))
3370             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3371                      | CODING_CATEGORY_MASK_ISO_8BIT);
3372         }
3373       else
3374         /* C is a character of ISO2022 in graphic plane right,
3375            or a SJIS's 1-byte character code (i.e. JISX0201),
3376            or the first byte of BIG5's 2-byte code.  */
3377         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3378                 | CODING_CATEGORY_MASK_ISO_8BIT
3379                 | CODING_CATEGORY_MASK_SJIS
3380                 | CODING_CATEGORY_MASK_BIG5);
3381
3382       /* Or, we may have to consider the possibility of CCL.  */
3383       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3384           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3385               ->spec.ccl.valid_codes)[c])
3386         try |= CODING_CATEGORY_MASK_CCL;
3387
3388       mask = 0;
3389       if (priorities)
3390         {
3391           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3392             {
3393               if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
3394                 mask = detect_coding_iso2022 (src, src_end);
3395               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3396                 mask = detect_coding_sjis (src, src_end);
3397               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3398                 mask = detect_coding_big5 (src, src_end);
3399               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3400                 mask = detect_coding_emacs_mule (src, src_end);
3401               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3402                 mask = detect_coding_ccl (src, src_end);
3403               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3404                 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3405               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3406                 mask = CODING_CATEGORY_MASK_BINARY;
3407               if (mask)
3408                 goto label_return_highest_only;
3409             }
3410           return CODING_CATEGORY_MASK_RAW_TEXT;
3411         }
3412       if (try & CODING_CATEGORY_MASK_ISO)
3413         mask |= detect_coding_iso2022 (src, src_end);
3414       if (try & CODING_CATEGORY_MASK_SJIS)
3415         mask |= detect_coding_sjis (src, src_end);
3416       if (try & CODING_CATEGORY_MASK_BIG5)
3417         mask |= detect_coding_big5 (src, src_end);
3418       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3419         mask |= detect_coding_emacs_mule (src, src_end);
3420       if (try & CODING_CATEGORY_MASK_CCL)
3421         mask |= detect_coding_ccl (src, src_end);
3422     }
3423   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3424
3425  label_return_highest_only:
3426   for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3427     {
3428       if (mask & priorities[i])
3429         return priorities[i];
3430     }
3431   return CODING_CATEGORY_MASK_RAW_TEXT;
3432 }
3433
3434 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3435    The information of the detected coding system is set in CODING.  */
3436
3437 void
3438 detect_coding (coding, src, src_bytes)
3439      struct coding_system *coding;
3440      unsigned char *src;
3441      int src_bytes;
3442 {
3443   unsigned int idx;
3444   int skip, mask, i;
3445   Lisp_Object val;
3446
3447   val = Vcoding_category_list;
3448   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3449   coding->heading_ascii = skip;
3450
3451   if (!mask) return;
3452
3453   /* We found a single coding system of the highest priority in MASK.  */
3454   idx = 0;
3455   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3456   if (! mask)
3457     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3458
3459   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3460
3461   if (coding->eol_type != CODING_EOL_UNDECIDED)
3462     {
3463       Lisp_Object tmp;
3464
3465       tmp = Fget (val, Qeol_type);
3466       if (VECTORP (tmp))
3467         val = XVECTOR (tmp)->contents[coding->eol_type];
3468     }
3469   setup_coding_system (val, coding);
3470   /* Set this again because setup_coding_system reset this member.  */
3471   coding->heading_ascii = skip;
3472 }
3473
3474 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3475    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3476    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3477
3478    How many non-eol characters are at the head is returned as *SKIP.  */
3479
3480 #define MAX_EOL_CHECK_COUNT 3
3481
3482 static int
3483 detect_eol_type (source, src_bytes, skip)
3484      unsigned char *source;
3485      int src_bytes, *skip;
3486 {
3487   unsigned char *src = source, *src_end = src + src_bytes;
3488   unsigned char c;
3489   int total = 0;                /* How many end-of-lines are found so far.  */
3490   int eol_type = CODING_EOL_UNDECIDED;
3491   int this_eol_type;
3492
3493   *skip = 0;
3494
3495   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3496     {
3497       c = *src++;
3498       if (c == '\n' || c == '\r')
3499         {
3500           if (*skip == 0)
3501             *skip = src - 1 - source;
3502           total++;
3503           if (c == '\n')
3504             this_eol_type = CODING_EOL_LF;
3505           else if (src >= src_end || *src != '\n')
3506             this_eol_type = CODING_EOL_CR;
3507           else
3508             this_eol_type = CODING_EOL_CRLF, src++;
3509
3510           if (eol_type == CODING_EOL_UNDECIDED)
3511             /* This is the first end-of-line.  */
3512             eol_type = this_eol_type;
3513           else if (eol_type != this_eol_type)
3514             {
3515               /* The found type is different from what found before.  */
3516               eol_type = CODING_EOL_INCONSISTENT;
3517               break;
3518             }
3519         }
3520     }
3521
3522   if (*skip == 0)
3523     *skip = src_end - source;
3524   return eol_type;
3525 }
3526
3527 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3528    is encoded.  If it detects an appropriate format of end-of-line, it
3529    sets the information in *CODING.  */
3530
3531 void
3532 detect_eol (coding, src, src_bytes)
3533      struct coding_system *coding;
3534      unsigned char *src;
3535      int src_bytes;
3536 {
3537   Lisp_Object val;
3538   int skip;
3539   int eol_type = detect_eol_type (src, src_bytes, &skip);
3540
3541   if (coding->heading_ascii > skip)
3542     coding->heading_ascii = skip;
3543   else
3544     skip = coding->heading_ascii;
3545
3546   if (eol_type == CODING_EOL_UNDECIDED)
3547     return;
3548   if (eol_type == CODING_EOL_INCONSISTENT)
3549     {
3550 #if 0
3551       /* This code is suppressed until we find a better way to
3552          distinguish raw text file and binary file.  */
3553
3554       /* If we have already detected that the coding is raw-text, the
3555          coding should actually be no-conversion.  */
3556       if (coding->type == coding_type_raw_text)
3557         {
3558           setup_coding_system (Qno_conversion, coding);
3559           return;
3560         }
3561       /* Else, let's decode only text code anyway.  */
3562 #endif /* 0 */
3563       eol_type = CODING_EOL_LF;
3564     }
3565
3566   val = Fget (coding->symbol, Qeol_type);
3567   if (VECTORP (val) && XVECTOR (val)->size == 3)
3568     {
3569       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3570       coding->heading_ascii = skip;
3571     }
3572 }
3573
3574 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3575
3576 #define DECODING_BUFFER_MAG(coding)                                          \
3577   (coding->type == coding_type_iso2022                                       \
3578    ? 3                                                                       \
3579    : ((coding->type == coding_type_sjis || coding->type == coding_type_big5) \
3580       ? 2                                                                    \
3581       : (coding->type == coding_type_raw_text                                \
3582          ? 1                                                                 \
3583          : (coding->type == coding_type_ccl                                  \
3584             ? coding->spec.ccl.decoder.buf_magnification                     \
3585             : 2))))
3586
3587 /* Return maximum size (bytes) of a buffer enough for decoding
3588    SRC_BYTES of text encoded in CODING.  */
3589
3590 int
3591 decoding_buffer_size (coding, src_bytes)
3592      struct coding_system *coding;
3593      int src_bytes;
3594 {
3595   return (src_bytes * DECODING_BUFFER_MAG (coding)
3596           + CONVERSION_BUFFER_EXTRA_ROOM);
3597 }
3598
3599 /* Return maximum size (bytes) of a buffer enough for encoding
3600    SRC_BYTES of text to CODING.  */
3601
3602 int
3603 encoding_buffer_size (coding, src_bytes)
3604      struct coding_system *coding;
3605      int src_bytes;
3606 {
3607   int magnification;
3608
3609   if (coding->type == coding_type_ccl)
3610     magnification = coding->spec.ccl.encoder.buf_magnification;
3611   else
3612     magnification = 3;
3613
3614   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3615 }
3616
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Buffer handed out by get_conversion_buffer, and its current size
   in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.

   If SIZE does not exceed conversion_buffer_size, the current
   conversion_buffer is returned as is.  Otherwise a new buffer is
   allocated with xmalloc, the old one is released with xfree, and
   both variables are updated; the contents of the old buffer are not
   copied.  The new size is twice the old one, doubled again until it
   reaches SIZE.

   NOTE(review): the comment previously here said NULL is returned
   when memory runs out, but this function never checks the value of
   xmalloc; confirm what xmalloc does on failure.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling would never reach SIZE from a size that is not
         positive (e.g. when conversion_buffer_size is still 0), so
         start from the minimum size in that case.  */
      if (real_size <= 0)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3645
3646 int
3647 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3648      struct coding_system *coding;
3649      unsigned char *source, *destination;
3650      int src_bytes, dst_bytes, encodep;
3651 {
3652   struct ccl_program *ccl
3653     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3654   int result;
3655
3656   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3657
3658   coding->produced = ccl_driver (ccl, source, destination,
3659                                  src_bytes, dst_bytes, &(coding->consumed));
3660   coding->produced_char
3661     = multibyte_chars_in_text (destination, coding->produced);
3662   coding->consumed_char
3663     = multibyte_chars_in_text (source, coding->consumed);
3664
3665   switch (ccl->status)
3666     {
3667     case CCL_STAT_SUSPEND_BY_SRC:
3668       result = CODING_FINISH_INSUFFICIENT_SRC;
3669       break;
3670     case CCL_STAT_SUSPEND_BY_DST:
3671       result = CODING_FINISH_INSUFFICIENT_DST;
3672       break;
3673     case CCL_STAT_QUIT:
3674     case CCL_STAT_INVALID_CMD:
3675       result = CODING_FINISH_INTERRUPT;
3676       break;
3677     default:
3678       result = CODING_FINISH_NORMAL;
3679       break;
3680     }
3681   return result;
3682 }
3683
3684 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
3685    decoding, it may detect coding system and format of end-of-line if
3686    those are not yet decided.  */
3687
3688 int
3689 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3690      struct coding_system *coding;
3691      unsigned char *source, *destination;
3692      int src_bytes, dst_bytes;
3693 {
3694   int result;
3695
3696   if (src_bytes <= 0
3697       && ! (coding->mode & CODING_MODE_LAST_BLOCK
3698             && CODING_REQUIRE_FLUSHING (coding)))
3699     {
3700       coding->produced = coding->produced_char = 0;
3701       coding->consumed = coding->consumed_char = 0;
3702       coding->fake_multibyte = 0;
3703       return CODING_FINISH_NORMAL;
3704     }
3705
3706   if (coding->type == coding_type_undecided)
3707     detect_coding (coding, source, src_bytes);
3708
3709   if (coding->eol_type == CODING_EOL_UNDECIDED)
3710     detect_eol (coding, source, src_bytes);
3711
3712   switch (coding->type)
3713     {
3714     case coding_type_emacs_mule:
3715     case coding_type_undecided:
3716     case coding_type_raw_text:
3717       if (coding->eol_type == CODING_EOL_LF
3718           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3719         goto label_no_conversion;
3720       result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
3721       break;
3722
3723     case coding_type_sjis:
3724       result = decode_coding_sjis_big5 (coding, source, destination,
3725                                         src_bytes, dst_bytes, 1);
3726       break;
3727
3728     case coding_type_iso2022:
3729       result = decode_coding_iso2022 (coding, source, destination,
3730                                       src_bytes, dst_bytes);
3731       break;
3732
3733     case coding_type_big5:
3734       result = decode_coding_sjis_big5 (coding, source, destination,
3735                                         src_bytes, dst_bytes, 0);
3736       break;
3737
3738     case coding_type_ccl:
3739       result = ccl_coding_driver (coding, source, destination,
3740                                   src_bytes, dst_bytes, 0);
3741       break;
3742
3743     default:                    /* i.e. case coding_type_no_conversion: */
3744     label_no_conversion:
3745       if (dst_bytes && src_bytes > dst_bytes)
3746         {
3747           coding->produced = dst_bytes;
3748           result = CODING_FINISH_INSUFFICIENT_DST;
3749         }
3750       else
3751         {
3752           coding->produced = src_bytes;
3753           result = CODING_FINISH_NORMAL;
3754         }
3755       if (dst_bytes)
3756         bcopy (source, destination, coding->produced);
3757       else
3758         safe_bcopy (source, destination, coding->produced);
3759       coding->fake_multibyte = 1;
3760       coding->consumed
3761         = coding->consumed_char = coding->produced_char = coding->produced;
3762       break;
3763     }
3764
3765   return result;
3766 }
3767
3768 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
3769
3770 int
3771 encode_coding (coding, source, destination, src_bytes, dst_bytes)
3772      struct coding_system *coding;
3773      unsigned char *source, *destination;
3774      int src_bytes, dst_bytes;
3775 {
3776   int result;
3777
3778   if (src_bytes <= 0
3779       && ! (coding->mode & CODING_MODE_LAST_BLOCK
3780             && CODING_REQUIRE_FLUSHING (coding)))
3781     {
3782       coding->produced = coding->produced_char = 0;
3783       coding->consumed = coding->consumed_char = 0;
3784       coding->fake_multibyte = 0;
3785       return CODING_FINISH_NORMAL;
3786     }
3787
3788   switch (coding->type)
3789     {
3790     case coding_type_emacs_mule:
3791     case coding_type_undecided:
3792     case coding_type_raw_text:
3793       if (coding->eol_type == CODING_EOL_LF
3794           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3795         goto label_no_conversion;
3796       result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
3797       break;
3798
3799     case coding_type_sjis:
3800       result = encode_coding_sjis_big5 (coding, source, destination,
3801                                         src_bytes, dst_bytes, 1);
3802       break;
3803
3804     case coding_type_iso2022:
3805       result = encode_coding_iso2022 (coding, source, destination,
3806                                       src_bytes, dst_bytes);
3807       break;
3808
3809     case coding_type_big5:
3810       result = encode_coding_sjis_big5 (coding, source, destination,
3811                                         src_bytes, dst_bytes, 0);
3812       break;
3813
3814     case coding_type_ccl:
3815       result = ccl_coding_driver (coding, source, destination,
3816                                   src_bytes, dst_bytes, 1);
3817       break;
3818
3819     default:                    /* i.e. case coding_type_no_conversion: */
3820     label_no_conversion:
3821       if (dst_bytes && src_bytes > dst_bytes)
3822         {
3823           coding->produced = dst_bytes;
3824           result = CODING_FINISH_INSUFFICIENT_DST;
3825         }
3826       else
3827         {
3828           coding->produced = src_bytes;
3829           result = CODING_FINISH_NORMAL;
3830         }
3831       if (dst_bytes)
3832         bcopy (source, destination, coding->produced);
3833       else
3834         safe_bcopy (source, destination, coding->produced);
3835       if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3836         {
3837           unsigned char *p = destination, *pend = p + coding->produced;
3838           while (p < pend)
3839             if (*p++ == '\015') p[-1] = '\n';
3840         }
3841       coding->fake_multibyte = 1;
3842       coding->consumed
3843         = coding->consumed_char = coding->produced_char = coding->produced;
3844       break;
3845     }
3846
3847   return result;
3848 }
3849
3850 /* Scan text in the region between *BEG and *END (byte positions),
3851    skip characters which we don't have to decode by coding system
3852    CODING at the head and tail, then set *BEG and *END to the region
3853    of the text we actually have to convert.  The caller should move
3854    the gap out of the region in advance.
3855
3856    If STR is not NULL, *BEG and *END are indices into STR.  */
3857
3858 static void
3859 shrink_decoding_region (beg, end, coding, str)
3860      int *beg, *end;
3861      struct coding_system *coding;
3862      unsigned char *str;
3863 {
3864   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
3865   int eol_conversion;
3866   Lisp_Object translation_table;
3867
3868   if (coding->type == coding_type_ccl
3869       || coding->type == coding_type_undecided
3870       || !NILP (coding->post_read_conversion))
3871     {
3872       /* We can't skip any data.  */
3873       return;
3874     }
3875   else if (coding->type == coding_type_no_conversion)
3876     {
3877       /* We need no conversion, but don't have to skip any data here.
3878          Decoding routine handles them effectively anyway.  */
3879       return;
3880     }
3881
3882   translation_table = coding->translation_table_for_decode;
3883   if (NILP (translation_table) && !NILP (Venable_character_translation))
3884     translation_table = Vstandard_translation_table_for_decode;
3885   if (CHAR_TABLE_P (translation_table))
3886     {
3887       int i;
3888       for (i = 0; i < 128; i++)
3889         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3890           break;
3891       if (i < 128)
3892         /* Some ASCII character should be tranlsated.  We give up
3893            shrinking.  */
3894         return;
3895     }
3896
3897   eol_conversion = (coding->eol_type != CODING_EOL_LF);
3898
3899   if ((! eol_conversion) && (coding->heading_ascii >= 0))
3900     /* Detection routine has already found how much we can skip at the
3901        head.  */
3902     *beg += coding->heading_ascii;
3903
3904   if (str)
3905     {
3906       begp_orig = begp = str + *beg;
3907       endp_orig = endp = str + *end;
3908     }
3909   else
3910     {
3911       begp_orig = begp = BYTE_POS_ADDR (*beg);
3912       endp_orig = endp = begp + *end - *beg;
3913     }
3914
3915   switch (coding->type)
3916     {
3917     case coding_type_emacs_mule:
3918     case coding_type_raw_text:
3919       if (eol_conversion)
3920         {
3921           if (coding->heading_ascii < 0)
3922             while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
3923           while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
3924             endp--;
3925           /* Do not consider LF as ascii if preceded by CR, since that
3926              confuses eol decoding. */
3927           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3928             endp++;
3929         }
3930       else
3931         begp = endp;
3932       break;
3933
3934     case coding_type_sjis:
3935     case coding_type_big5:
3936       /* We can skip all ASCII characters at the head.  */
3937       if (coding->heading_ascii < 0)
3938         {
3939           if (eol_conversion)
3940             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
3941           else
3942             while (begp < endp && *begp < 0x80) begp++;
3943         }
3944       /* We can skip all ASCII characters at the tail except for the
3945          second byte of SJIS or BIG5 code.  */
3946       if (eol_conversion)
3947         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
3948       else
3949         while (begp < endp && endp[-1] < 0x80) endp--;
3950       /* Do not consider LF as ascii if preceded by CR, since that
3951          confuses eol decoding. */
3952       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3953         endp++;
3954       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
3955         endp++;
3956       break;
3957
3958     default:            /* i.e. case coding_type_iso2022: */
3959       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
3960         /* We can't skip any data.  */
3961         break;
3962       if (coding->heading_ascii < 0)
3963         {
3964           /* We can skip all ASCII characters at the head except for a
3965              few control codes.  */
3966           while (begp < endp && (c = *begp) < 0x80
3967                  && c != ISO_CODE_CR && c != ISO_CODE_SO
3968                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
3969                  && (!eol_conversion || c != ISO_CODE_LF))
3970             begp++;
3971         }
3972       switch (coding->category_idx)
3973         {
3974         case CODING_CATEGORY_IDX_ISO_8_1:
3975         case CODING_CATEGORY_IDX_ISO_8_2:
3976           /* We can skip all ASCII characters at the tail.  */
3977           if (eol_conversion)
3978             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
3979           else
3980             while (begp < endp && endp[-1] < 0x80) endp--;
3981           /* Do not consider LF as ascii if preceded by CR, since that
3982              confuses eol decoding. */
3983           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3984             endp++;
3985           break;
3986
3987         case CODING_CATEGORY_IDX_ISO_7:
3988         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
3989           {
3990             /* We can skip all charactes at the tail except for 8-bit
3991                codes and ESC and the following 2-byte at the tail.  */
3992             unsigned char *eight_bit = NULL;
3993
3994             if (eol_conversion)
3995               while (begp < endp
3996                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
3997                 {
3998                   if (!eight_bit && c & 0x80) eight_bit = endp;
3999                   endp--;
4000                 }
4001             else
4002               while (begp < endp
4003                      && (c = endp[-1]) != ISO_CODE_ESC)
4004                 {
4005                   if (!eight_bit && c & 0x80) eight_bit = endp;
4006                   endp--;
4007                 }
4008             /* Do not consider LF as ascii if preceded by CR, since that
4009                confuses eol decoding. */
4010             if (begp < endp && endp < endp_orig
4011                 && endp[-1] == '\r' && endp[0] == '\n')
4012               endp++;
4013             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4014               {
4015                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4016                   /* This is an ASCII designation sequence.  We can
4017                      surely skip the tail.  But, if we have
4018                      encountered an 8-bit code, skip only the codes
4019                      after that.  */
4020                   endp = eight_bit ? eight_bit : endp + 2;
4021                 else
4022                   /* Hmmm, we can't skip the tail.  */
4023                   endp = endp_orig;
4024               }
4025             else if (eight_bit)
4026               endp = eight_bit;
4027           }
4028         }
4029     }
4030   *beg += begp - begp_orig;
4031   *end += endp - endp_orig;
4032   return;
4033 }
4034
4035 /* Like shrink_decoding_region but for encoding.  */
4036
4037 static void
4038 shrink_encoding_region (beg, end, coding, str)
4039      int *beg, *end;
4040      struct coding_system *coding;
4041      unsigned char *str;
4042 {
4043   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4044   int eol_conversion;
4045   Lisp_Object translation_table;
4046
4047   if (coding->type == coding_type_ccl)
4048     /* We can't skip any data.  */
4049     return;
4050   else if (coding->type == coding_type_no_conversion)
4051     {
4052       /* We need no conversion.  */
4053       *beg = *end;
4054       return;
4055     }
4056
4057   translation_table = coding->translation_table_for_encode;
4058   if (NILP (translation_table) && !NILP (Venable_character_translation))
4059     translation_table = Vstandard_translation_table_for_encode;
4060   if (CHAR_TABLE_P (translation_table))
4061     {
4062       int i;
4063       for (i = 0; i < 128; i++)
4064         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4065           break;
4066       if (i < 128)
4067         /* Some ASCII character should be tranlsated.  We give up
4068            shrinking.  */
4069         return;
4070     }
4071
4072   if (str)
4073     {
4074       begp_orig = begp = str + *beg;
4075       endp_orig = endp = str + *end;
4076     }
4077   else
4078     {
4079       begp_orig = begp = BYTE_POS_ADDR (*beg);
4080       endp_orig = endp = begp + *end - *beg;
4081     }
4082
4083   eol_conversion = (coding->eol_type == CODING_EOL_CR
4084                     || coding->eol_type == CODING_EOL_CRLF);
4085
4086   /* Here, we don't have to check coding->pre_write_conversion because
4087      the caller is expected to have handled it already.  */
4088   switch (coding->type)
4089     {
4090     case coding_type_undecided:
4091     case coding_type_emacs_mule:
4092     case coding_type_raw_text:
4093       if (eol_conversion)
4094         {
4095           while (begp < endp && *begp != '\n') begp++;
4096           while (begp < endp && endp[-1] != '\n') endp--;
4097         }
4098       else
4099         begp = endp;
4100       break;
4101
4102     case coding_type_iso2022:
4103       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4104         /* We can't skip any data.  */
4105         break;
4106       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4107         {
4108           unsigned char *bol = begp;
4109           while (begp < endp && *begp < 0x80)
4110             {
4111               begp++;
4112               if (begp[-1] == '\n')
4113                 bol = begp;
4114             }
4115           begp = bol;
4116           goto label_skip_tail;
4117         }
4118       /* fall down ... */
4119
4120     default:
4121       /* We can skip all ASCII characters at the head and tail.  */
4122       if (eol_conversion)
4123         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4124       else
4125         while (begp < endp && *begp < 0x80) begp++;
4126     label_skip_tail:
4127       if (eol_conversion)
4128         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4129       else
4130         while (begp < endp && *(endp - 1) < 0x80) endp--;
4131       break;
4132     }
4133
4134   *beg += begp - begp_orig;
4135   *end += endp - endp_orig;
4136   return;
4137 }
4138
/* Shrinking the conversion region has a cost of its own, so it is
   attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Narrow the region *BEG .. *END with shrink_encoding_region (if
   ENCODEP is nonzero) or shrink_decoding_region (if ENCODEP is zero),
   provided the region is longer than the threshold above.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
4152
4153 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4154    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4155    coding system CODING, and return the status code of code conversion
4156    (currently, this value has no meaning).
4157
4158    How many characters (and bytes) are converted to how many
4159    characters (and bytes) are recorded in members of the structure
4160    CODING.
4161
4162    If REPLACE is nonzero, we do various things as if the original text
4163    is deleted and a new text is inserted.  See the comments in
4164    replace_range (insdel.c) to know what we are doing.  */
4165
4166 int
4167 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4168      int from, from_byte, to, to_byte, encodep, replace;
4169      struct coding_system *coding;
4170 {
4171   int len = to - from, len_byte = to_byte - from_byte;
4172   int require, inserted, inserted_byte;
4173   int head_skip, tail_skip, total_skip;
4174   Lisp_Object saved_coding_symbol;
4175   int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4176   int first = 1;
4177   int fake_multibyte = 0;
4178   unsigned char *src, *dst;
4179   Lisp_Object deletion;
4180   int orig_point = PT, orig_len = len;
4181   int prev_Z;
4182
4183   deletion = Qnil;
4184   saved_coding_symbol = Qnil;
4185
4186   if (from < PT && PT < to)
4187     {
4188       TEMP_SET_PT_BOTH (from, from_byte);
4189       orig_point = from;
4190     }
4191
4192   if (replace)
4193     {
4194       int saved_from = from;
4195
4196       prepare_to_modify_buffer (from, to, &from);
4197       if (saved_from != from)
4198         {
4199           to = from + len;
4200           if (multibyte)
4201             from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4202           else
4203             from_byte = from, to_byte = to;
4204           len_byte = to_byte - from_byte;
4205         }
4206     }
4207
4208   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4209     {
4210       /* We must detect encoding of text and eol format.  */
4211
4212       if (from < GPT && to > GPT)
4213         move_gap_both (from, from_byte);
4214       if (coding->type == coding_type_undecided)
4215         {
4216           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4217           if (coding->type == coding_type_undecided)
4218             /* It seems that the text contains only ASCII, but we
4219                should not left it undecided because the deeper
4220                decoding routine (decode_coding) tries to detect the
4221                encodings again in vain.  */
4222             coding->type = coding_type_emacs_mule;
4223         }
4224       if (coding->eol_type == CODING_EOL_UNDECIDED)
4225         {
4226           saved_coding_symbol = coding->symbol;
4227           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4228           if (coding->eol_type == CODING_EOL_UNDECIDED)
4229             coding->eol_type = CODING_EOL_LF;
4230           /* We had better recover the original eol format if we
4231              encounter an inconsitent eol format while decoding.  */
4232           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4233         }
4234     }
4235
4236   coding->consumed_char = len, coding->consumed = len_byte;
4237
4238   if (encodep
4239       ? ! CODING_REQUIRE_ENCODING (coding)
4240       : ! CODING_REQUIRE_DECODING (coding))
4241     {
4242       coding->produced = len_byte;
4243       if (multibyte
4244           && ! replace
4245           /* See the comment of the member heading_ascii in coding.h.  */
4246           && coding->heading_ascii < len_byte)
4247         {
4248           /* We still may have to combine byte at the head and the
4249              tail of the text in the region.  */
4250           if (from < GPT && GPT < to)
4251             move_gap_both (to, to_byte);
4252           len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4253           adjust_after_insert (from, from_byte, to, to_byte, len);
4254           coding->produced_char = len;
4255         }
4256       else
4257         {
4258           if (!replace)
4259             adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4260           coding->produced_char = len_byte;
4261         }
4262       return 0;
4263     }
4264
4265   /* Now we convert the text.  */
4266
4267   /* For encoding, we must process pre-write-conversion in advance.  */
4268   if (encodep
4269       && ! NILP (coding->pre_write_conversion)
4270       && SYMBOLP (coding->pre_write_conversion)
4271       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4272     {
4273       /* The function in pre-write-conversion may put a new text in a
4274          new buffer.  */
4275       struct buffer *prev = current_buffer;
4276       Lisp_Object new;
4277
4278       call2 (coding->pre_write_conversion,
4279              make_number (from), make_number (to));
4280       if (current_buffer != prev)
4281         {
4282           len = ZV - BEGV;
4283           new = Fcurrent_buffer ();
4284           set_buffer_internal_1 (prev);
4285           del_range_2 (from, from_byte, to, to_byte);
4286           TEMP_SET_PT_BOTH (from, from_byte);
4287           insert_from_buffer (XBUFFER (new), 1, len, 0);
4288           Fkill_buffer (new);
4289           if (orig_point >= to)
4290             orig_point += len - orig_len;
4291           else if (orig_point > from)
4292             orig_point = from;
4293           orig_len = len;
4294           to = from + len;
4295           from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
4296           to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4297           len_byte = to_byte - from_byte;
4298           TEMP_SET_PT_BOTH (from, from_byte);
4299         }
4300     }
4301
4302   if (replace)
4303     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4304
4305   /* Try to skip the heading and tailing ASCIIs.  */
4306   {
4307     int from_byte_orig = from_byte, to_byte_orig = to_byte;
4308
4309     if (from < GPT && GPT < to)
4310       move_gap_both (from, from_byte);
4311     SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4312     if (from_byte == to_byte
4313         && ! (coding->mode & CODING_MODE_LAST_BLOCK
4314               && CODING_REQUIRE_FLUSHING (coding)))
4315       {
4316         coding->produced = len_byte;
4317         coding->produced_char = multibyte ? len : len_byte;
4318         if (!replace)
4319           /* We must record and adjust for this new text now.  */
4320           adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4321         return 0;
4322       }
4323
4324     head_skip = from_byte - from_byte_orig;
4325     tail_skip = to_byte_orig - to_byte;
4326     total_skip = head_skip + tail_skip;
4327     from += head_skip;
4328     to -= tail_skip;
4329     len -= total_skip; len_byte -= total_skip;
4330   }
4331
4332   /* The code conversion routine can not preserve text properties for
4333      now.  So, we must remove all text properties in the region.
4334      Here, we must suppress all modification hooks.  */
4335   if (replace)
4336     {
4337       int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4338       inhibit_modification_hooks = 1;
4339       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4340       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4341     }
4342
4343   /* For converion, we must put the gap before the text in addition to
4344      making the gap larger for efficient decoding.  The required gap
4345      size starts from 2000 which is the magic number used in make_gap.
4346      But, after one batch of conversion, it will be incremented if we
4347      find that it is not enough .  */
4348   require = 2000;
4349
4350   if (GAP_SIZE  < require)
4351     make_gap (require - GAP_SIZE);
4352   move_gap_both (from, from_byte);
4353
4354   inserted = inserted_byte = 0;
4355   src = GAP_END_ADDR, dst = GPT_ADDR;
4356
4357   GAP_SIZE += len_byte;
4358   ZV -= len;
4359   Z -= len;
4360   ZV_BYTE -= len_byte;
4361   Z_BYTE -= len_byte;
4362
4363   if (GPT - BEG < beg_unchanged)
4364     beg_unchanged = GPT - BEG;
4365   if (Z - GPT < end_unchanged)
4366     end_unchanged = Z - GPT;
4367
4368   for (;;)
4369     {
4370       int result;
4371
4372       /* The buffer memory is changed from:
4373          +--------+converted-text+---------+-------original-text------+---+
4374          |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4375                   |<------------------- GAP_SIZE -------------------->|  */
4376       if (encodep)
4377         result = encode_coding (coding, src, dst, len_byte, 0);
4378       else
4379         result = decode_coding (coding, src, dst, len_byte, 0);
4380       /* to:
4381          +--------+-------converted-text--------+--+---original-text--+---+
4382          |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4383                   |<------------------- GAP_SIZE -------------------->|  */
4384       if (coding->fake_multibyte)
4385         fake_multibyte = 1;
4386
4387       if (!encodep && !multibyte)
4388         coding->produced_char = coding->produced;
4389       inserted += coding->produced_char;
4390       inserted_byte += coding->produced;
4391       len_byte -= coding->consumed;
4392       src += coding->consumed;
4393       dst += inserted_byte;
4394
4395       if (result == CODING_FINISH_NORMAL)
4396         {
4397           src += len_byte;
4398           break;
4399         }
4400       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4401         {
4402           unsigned char *pend = dst, *p = pend - inserted_byte;
4403
4404           /* Encode LFs back to the original eol format (CR or CRLF).  */
4405           if (coding->eol_type == CODING_EOL_CR)
4406             {
4407               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4408             }
4409           else
4410             {
4411               int count = 0;
4412
4413               while (p < pend) if (*p++ == '\n') count++;
4414               if (src - dst < count)
4415                 {
4416                   /* We don't have sufficient room for putting LFs
4417                      back to CRLF.  We must record converted and
4418                      not-yet-converted text back to the buffer
4419                      content, enlarge the gap, then record them out of
4420                      the buffer contents again.  */
4421                   int add = len_byte + inserted_byte;
4422
4423                   GAP_SIZE -= add;
4424                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4425                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
4426                   make_gap (count - GAP_SIZE);
4427                   GAP_SIZE += add;
4428                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4429                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4430                   /* Don't forget to update SRC, DST, and PEND.  */
4431                   src = GAP_END_ADDR - len_byte;
4432                   dst = GPT_ADDR + inserted_byte;
4433                   pend = dst;
4434                 }
4435               inserted += count;
4436               inserted_byte += count;
4437               coding->produced += count;
4438               p = dst = pend + count;
4439               while (count)
4440                 {
4441                   *--p = *--pend;
4442                   if (*p == '\n') count--, *--p = '\r';
4443                 }
4444             }
4445
4446           /* Suppress eol-format conversion in the further conversion.  */
4447           coding->eol_type = CODING_EOL_LF;
4448
4449           /* Restore the original symbol.  */
4450           coding->symbol = saved_coding_symbol;
4451
4452           continue;
4453         }
4454       if (len_byte <= 0)
4455         break;
4456       if (result == CODING_FINISH_INSUFFICIENT_SRC)
4457         {
4458           /* The source text ends in invalid codes.  Let's just
4459              make them valid buffer contents, and finish conversion.  */
4460           inserted += len_byte;
4461           inserted_byte += len_byte;
4462           while (len_byte--)
4463             *dst++ = *src++;
4464           fake_multibyte = 1;
4465           break;
4466         }
4467       if (result == CODING_FINISH_INTERRUPT)
4468         {
4469           /* The conversion procedure was interrupted by a user.  */
4470           fake_multibyte = 1;
4471           break;
4472         }
4473       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
4474       if (coding->consumed < 1)
4475         {
4476           /* It's quite strange to require more memory without
4477              consuming any bytes.  Perhaps CCL program bug.  */
4478           fake_multibyte = 1;
4479           break;
4480         }
4481       if (first)
4482         {
4483           /* We have just done the first batch of conversion which was
4484              stoped because of insufficient gap.  Let's reconsider the
4485              required gap size (i.e. SRT - DST) now.
4486
4487              We have converted ORIG bytes (== coding->consumed) into
4488              NEW bytes (coding->produced).  To convert the remaining
4489              LEN bytes, we may need REQUIRE bytes of gap, where:
4490                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4491                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4492              Here, we are sure that NEW >= ORIG.  */
4493           float ratio = coding->produced - coding->consumed;
4494           ratio /= coding->consumed;
4495           require = len_byte * ratio;
4496           first = 0;
4497         }
4498       if ((src - dst) < (require + 2000))
4499         {
4500           /* See the comment above the previous call of make_gap.  */
4501           int add = len_byte + inserted_byte;
4502
4503           GAP_SIZE -= add;
4504           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4505           GPT += inserted_byte; GPT_BYTE += inserted_byte;
4506           make_gap (require + 2000);
4507           GAP_SIZE += add;
4508           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4509           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4510           /* Don't forget to update SRC, DST.  */
4511           src = GAP_END_ADDR - len_byte;
4512           dst = GPT_ADDR + inserted_byte;
4513         }
4514     }
4515   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
4516
4517   if (multibyte
4518       && (encodep
4519           || fake_multibyte
4520           || (to - from) != (to_byte - from_byte)))
4521     inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
4522
4523   /* If we have shrinked the conversion area, adjust it now.  */
4524   if (total_skip > 0)
4525     {
4526       if (tail_skip > 0)
4527         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4528       inserted += total_skip; inserted_byte += total_skip;
4529       GAP_SIZE += total_skip;
4530       GPT -= head_skip; GPT_BYTE -= head_skip;
4531       ZV -= total_skip; ZV_BYTE -= total_skip;
4532       Z -= total_skip; Z_BYTE -= total_skip;
4533       from -= head_skip; from_byte -= head_skip;
4534       to += tail_skip; to_byte += tail_skip;
4535     }
4536
4537   prev_Z = Z;
4538   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4539   inserted = Z - prev_Z;
4540
4541   if (! encodep && ! NILP (coding->post_read_conversion))
4542     {
4543       Lisp_Object val;
4544
4545       if (from != PT)
4546         TEMP_SET_PT_BOTH (from, from_byte);
4547       prev_Z = Z;
4548       val = call1 (coding->post_read_conversion, make_number (inserted));
4549       CHECK_NUMBER (val, 0);
4550       inserted = Z - prev_Z;
4551     }
4552
4553   if (orig_point >= from)
4554     {
4555       if (orig_point >= from + orig_len)
4556         orig_point += inserted - orig_len;
4557       else
4558         orig_point = from;
4559       TEMP_SET_PT (orig_point);
4560     }
4561
4562   signal_after_change (from, to - from, inserted);
4563
4564   {
4565     coding->consumed = to_byte - from_byte;
4566     coding->consumed_char = to - from;
4567     coding->produced = inserted_byte;
4568     coding->produced_char = inserted;
4569   }
4570
4571   return 0;
4572 }
4573
4574 Lisp_Object
4575 code_convert_string (str, coding, encodep, nocopy)
4576      Lisp_Object str;
4577      struct coding_system *coding;
4578      int encodep, nocopy;
4579 {
       /* Convert the contents of STR according to CODING and return the
          result as a Lisp string.  Nonzero ENCODEP means encode, zero
          means decode.  When it turns out that nothing has to be
          converted, STR itself is returned if NOCOPY is nonzero,
          otherwise a copy of it.  */
4580   int len;
4581   char *buf;
4582   int from = 0, to = XSTRING (str)->size;
4583   int to_byte = STRING_BYTES (XSTRING (str));
4584   struct gcpro gcpro1;
4585   Lisp_Object saved_coding_symbol;
4586   int result;
4587
4588   saved_coding_symbol = Qnil;
4589   if (encodep && !NILP (coding->pre_write_conversion)
4590       || !encodep && !NILP (coding->post_read_conversion))
4591     {
4592       /* Since we have to call Lisp functions which assume target text
4593          is in a buffer, after setting a temporary buffer, call
4594          code_convert_region.  */
4595       int count = specpdl_ptr - specpdl;
4596       struct buffer *prev = current_buffer;
4597
           /* Arrange for the current buffer to be restored when the
              binding stack is unwound to COUNT.  */
4598       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4599       temp_output_buffer_setup (" *code-converting-work*");
4600       set_buffer_internal (XBUFFER (Vstandard_output));
4601       if (encodep)
4602         insert_from_string (str, 0, 0, to, to_byte, 0);
4603       else
4604         {
4605           /* We must insert the contents of STR as is without
4606              unibyte<->multibyte conversion.  */
4607           current_buffer->enable_multibyte_characters = Qnil;
4608           insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4609           current_buffer->enable_multibyte_characters = Qt;
4610         }
4611       code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
4612       if (encodep)
4613         /* We must return the buffer contents as unibyte string.  */
4614         current_buffer->enable_multibyte_characters = Qnil;
4615       str = make_buffer_string (BEGV, ZV, 0);
4616       set_buffer_internal (prev);
4617       return unbind_to (count, str);
4618     }
4619
4620   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4621     {
4622       /* See the comments in code_convert_region.  */
4623       if (coding->type == coding_type_undecided)
4624         {
4625           detect_coding (coding, XSTRING (str)->data, to_byte);
               /* Detection found nothing: fall back to emacs-mule.  */
4626           if (coding->type == coding_type_undecided)
4627             coding->type = coding_type_emacs_mule;
4628         }
4629       if (coding->eol_type == CODING_EOL_UNDECIDED)
4630         {
               /* Remember the symbol so that it can be put back if the
                  string has to be decoded again below.  */
4631           saved_coding_symbol = coding->symbol;
4632           detect_eol (coding, XSTRING (str)->data, to_byte);
4633           if (coding->eol_type == CODING_EOL_UNDECIDED)
4634             coding->eol_type = CODING_EOL_LF;
4635           /* We had better recover the original eol format if we
4636              encounter an inconsistent eol format while decoding.  */
4637           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4638         }
4639     }
4640
       /* FROM == TO_BYTE afterwards means that there is nothing left to
          convert.  */
4641   if (encodep
4642       ? ! CODING_REQUIRE_ENCODING (coding)
4643       : ! CODING_REQUIRE_DECODING (coding))
4644     from = to_byte;
4645   else
4646     {
4647       /* Try to skip the heading and trailing ASCIIs.  */
4648       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
4649                                 encodep);
4650     }
       /* The shortcut is not taken for CCL based coding systems.  */
4651   if (from == to_byte
4652       && coding->type != coding_type_ccl)
4653     return (nocopy ? str : Fcopy_sequence (str));
4654
4655   if (encodep)
4656     len = encoding_buffer_size (coding, to_byte - from);
4657   else
4658     len = decoding_buffer_size (coding, to_byte - from);
       /* Add room for the skipped head (FROM bytes) and the skipped tail,
          which are copied verbatim around the converted text.  */
4659   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
       /* STR is GC-protected while the conversion buffer is obtained.  */
4660   GCPRO1 (str);
4661   buf = get_conversion_buffer (len);
4662   UNGCPRO;
4663
4664   if (from > 0)
4665     bcopy (XSTRING (str)->data, buf, from);
4666   result = (encodep
4667             ? encode_coding (coding, XSTRING (str)->data + from,
4668                              buf + from, to_byte - from, len)
4669             : decode_coding (coding, XSTRING (str)->data + from,
4670                              buf + from, to_byte - from, len));
4671   if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4672     {
4673       /* We simply try to decode the whole string again but without
4674          eol-conversion this time.  */
4675       coding->eol_type = CODING_EOL_LF;
4676       coding->symbol = saved_coding_symbol;
4677       return code_convert_string (str, coding, encodep, nocopy);
4678     }
4679
       /* Append the skipped tail right after the converted bytes.  */
4680   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
4681          STRING_BYTES (XSTRING (str)) - to_byte);
4682
       /* LEN is now the number of bytes copied verbatim (head plus tail).
          Below, each of them is counted as one character.  */
4683   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
4684   if (encodep)
4685     str = make_unibyte_string (buf, len + coding->produced);
4686   else
4687     {
           /* If the decoder flagged the result as fake multibyte, the
              characters are counted by scanning the produced bytes
              instead of using the decoder's own count.  */
4688       int chars= (coding->fake_multibyte
4689                   ? multibyte_chars_in_text (buf + from, coding->produced)
4690                   : coding->produced_char);
4691       str = make_multibyte_string (buf, len + chars, len + coding->produced);
4692     }
4693
4694   return str;
4695 }
4696
4697 \f
4698 #ifdef emacs
4699 /*** 8. Emacs Lisp library functions ***/
4700
4701 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4702   "Return t if OBJECT is nil or a coding-system.\n\
4703 See the documentation of `make-coding-system' for information\n\
4704 about coding-system objects.")
4705   (obj)
4706      Lisp_Object obj;
4707 {
4708   if (NILP (obj))
4709     return Qt;
4710   if (!SYMBOLP (obj))
4711     return Qnil;
4712   /* Get coding-spec vector for OBJ.  */
4713   obj = Fget (obj, Qcoding_system);
4714   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4715           ? Qt : Qnil);
4716 }
4717
4718 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4719        Sread_non_nil_coding_system, 1, 1, 0,
4720   "Read a coding system from the minibuffer, prompting with string PROMPT.")
4721   (prompt)
4722      Lisp_Object prompt;
4723 {
4724   Lisp_Object val;
4725   do
4726     {
4727       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4728                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
4729     }
4730   while (XSTRING (val)->size == 0);
4731   return (Fintern (val, Qnil));
4732 }
4733
4734 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4735   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4736 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4737   (prompt, default_coding_system)
4738      Lisp_Object prompt, default_coding_system;
4739 {
4740   Lisp_Object val;
4741   if (SYMBOLP (default_coding_system))
4742     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4743   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4744                           Qt, Qnil, Qcoding_system_history,
4745                           default_coding_system, Qnil);
4746   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4747 }
4748
4749 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4750        1, 1, 0,
4751   "Check validity of CODING-SYSTEM.\n\
4752 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4753 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4754 The value of property should be a vector of length 5.")
4755   (coding_system)
4756      Lisp_Object coding_system;
4757 {
4758   CHECK_SYMBOL (coding_system, 0);
4759   if (!NILP (Fcoding_system_p (coding_system)))
4760     return coding_system;
4761   while (1)
4762     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4763 }
4764 \f
4765 Lisp_Object
4766 detect_coding_system (src, src_bytes, highest)
4767      unsigned char *src;
4768      int src_bytes, highest;
4769 {
4770   int coding_mask, eol_type;
4771   Lisp_Object val, tmp;
4772   int dummy;
4773
4774   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4775   eol_type  = detect_eol_type (src, src_bytes, &dummy);
4776   if (eol_type == CODING_EOL_INCONSISTENT)
4777     eol_type = CODING_EOL_UNDECIDED;
4778
4779   if (!coding_mask)
4780     {
4781       val = Qundecided;
4782       if (eol_type != CODING_EOL_UNDECIDED)
4783         {
4784           Lisp_Object val2;
4785           val2 = Fget (Qundecided, Qeol_type);
4786           if (VECTORP (val2))
4787             val = XVECTOR (val2)->contents[eol_type];
4788         }
4789       return (highest ? val : Fcons (val, Qnil));
4790     }
4791
4792   /* At first, gather possible coding systems in VAL.  */
4793   val = Qnil;
4794   for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4795     {
4796       int idx
4797         = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4798       if (coding_mask & (1 << idx))
4799         {
4800           val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4801           if (highest)
4802             break;
4803         }
4804     }
4805   if (!highest)
4806     val = Fnreverse (val);
4807
4808   /* Then, replace the elements with subsidiary coding systems.  */
4809   for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4810     {
4811       if (eol_type != CODING_EOL_UNDECIDED
4812           && eol_type != CODING_EOL_INCONSISTENT)
4813         {
4814           Lisp_Object eol;
4815           eol = Fget (XCONS (tmp)->car, Qeol_type);
4816           if (VECTORP (eol))
4817             XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4818         }
4819     }
4820   return (highest ? XCONS (val)->car : val);
4821 }
4822
4823 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4824        2, 3, 0,
4825   "Detect coding system of the text in the region between START and END.\n\
4826 Return a list of possible coding systems ordered by priority.\n\
4827 \n\
4828 If only ASCII characters are found, it returns a list of single element\n\
4829 `undecided' or its subsidiary coding system according to a detected\n\
4830 end-of-line format.\n\
4831 \n\
4832 If optional argument HIGHEST is non-nil, return the coding system of\n\
4833 highest priority.")
4834   (start, end, highest)
4835      Lisp_Object start, end, highest;
4836 {
4837   int from, to;
4838   int from_byte, to_byte;
4839
4840   CHECK_NUMBER_COERCE_MARKER (start, 0);
4841   CHECK_NUMBER_COERCE_MARKER (end, 1);
4842
4843   validate_region (&start, &end);
4844   from = XINT (start), to = XINT (end);
4845   from_byte = CHAR_TO_BYTE (from);
4846   to_byte = CHAR_TO_BYTE (to);
4847
4848   if (from < GPT && to >= GPT)
4849     move_gap_both (to, to_byte);
4850
4851   return detect_coding_system (BYTE_POS_ADDR (from_byte),
4852                                to_byte - from_byte,
4853                                !NILP (highest));
4854 }
4855
4856 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4857        1, 2, 0,
4858   "Detect coding system of the text in STRING.\n\
4859 Return a list of possible coding systems ordered by priority.\n\
4860 \n\
4861 If only ASCII characters are found, it returns a list of single element\n\
4862 `undecided' or its subsidiary coding system according to a detected\n\
4863 end-of-line format.\n\
4864 \n\
4865 If optional argument HIGHEST is non-nil, return the coding system of\n\
4866 highest priority.")
4867   (string, highest)
4868      Lisp_Object string, highest;
4869 {
4870   CHECK_STRING (string, 0);
4871
4872   return detect_coding_system (XSTRING (string)->data,
4873                                STRING_BYTES (XSTRING (string)),
4874                                !NILP (highest));
4875 }
4876
4877 Lisp_Object
4878 code_convert_region1 (start, end, coding_system, encodep)
4879      Lisp_Object start, end, coding_system;
4880      int encodep;
4881 {
4882   struct coding_system coding;
4883   int from, to, len;
4884
4885   CHECK_NUMBER_COERCE_MARKER (start, 0);
4886   CHECK_NUMBER_COERCE_MARKER (end, 1);
4887   CHECK_SYMBOL (coding_system, 2);
4888
4889   validate_region (&start, &end);
4890   from = XFASTINT (start);
4891   to = XFASTINT (end);
4892
4893   if (NILP (coding_system))
4894     return make_number (to - from);
4895
4896   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4897     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4898
4899   coding.mode |= CODING_MODE_LAST_BLOCK;
4900   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4901                        &coding, encodep, 1);
4902   Vlast_coding_system_used = coding.symbol;
4903   return make_number (coding.produced_char);
4904 }
4905
4906 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4907        3, 3, "r\nzCoding system: ",
4908   "Decode the current region by specified coding system.\n\
4909 When called from a program, takes three arguments:\n\
4910 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
4911 This function sets `last-coding-system-used' to the precise coding system\n\
4912 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4913 not fully specified.)\n\
4914 It returns the length of the decoded text.")
4915   (start, end, coding_system)
4916      Lisp_Object start, end, coding_system;
4917 {
       /* All the work is done by code_convert_region1; the final 0 is its
          ENCODEP argument, i.e. decode.  */
4918   return code_convert_region1 (start, end, coding_system, 0);
4919 }
4920
4921 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
4922        3, 3, "r\nzCoding system: ",
4923   "Encode the current region by specified coding system.\n\
4924 When called from a program, takes three arguments:\n\
4925 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
4926 This function sets `last-coding-system-used' to the precise coding system\n\
4927 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4928 not fully specified.)\n\
4929 It returns the length of the encoded text.")
4930   (start, end, coding_system)
4931      Lisp_Object start, end, coding_system;
4932 {
       /* All the work is done by code_convert_region1; the final 1 is its
          ENCODEP argument, i.e. encode.  */
4933   return code_convert_region1 (start, end, coding_system, 1);
4934 }
4935
4936 Lisp_Object
4937 code_convert_string1 (string, coding_system, nocopy, encodep)
4938      Lisp_Object string, coding_system, nocopy;
4939      int encodep;
4940 {
4941   struct coding_system coding;
4942
4943   CHECK_STRING (string, 0);
4944   CHECK_SYMBOL (coding_system, 1);
4945
4946   if (NILP (coding_system))
4947     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4948
4949   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4950     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4951
4952   coding.mode |= CODING_MODE_LAST_BLOCK;
4953   Vlast_coding_system_used = coding.symbol;
4954   return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4955 }
4956
4957 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
4958        2, 3, 0,
4959   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
4960 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4961 if the decoding operation is trivial.\n\
4962 This function sets `last-coding-system-used' to the precise coding system\n\
4963 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4964 not fully specified.)")
4965   (string, coding_system, nocopy)
4966      Lisp_Object string, coding_system, nocopy;
4967 {
       /* All the work is done by code_convert_string1; the final 0 is its
          ENCODEP argument, i.e. decode.  */
4968   return code_convert_string1 (string, coding_system, nocopy, 0);
4969 }
4970
4971 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
4972        2, 3, 0,
4973   "Encode STRING to CODING-SYSTEM, and return the result.\n\
4974 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4975 if the encoding operation is trivial.\n\
4976 This function sets `last-coding-system-used' to the precise coding system\n\
4977 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4978 not fully specified.)")
4979   (string, coding_system, nocopy)
4980      Lisp_Object string, coding_system, nocopy;
4981 {
       /* All the work is done by code_convert_string1; the final 1 is its
          ENCODEP argument, i.e. encode.  */
4982   return code_convert_string1 (string, coding_system, nocopy, 1);
4983 }
4984
4985 /* Encode or decode STRING according to CODING_SYSTEM.
4986    Do not set Vlast_coding_system_used.  */
4987
4988 Lisp_Object
4989 code_convert_string_norecord (string, coding_system, encodep)
4990      Lisp_Object string, coding_system;
4991      int encodep;
4992 {
4993   struct coding_system coding;
4994
4995   CHECK_STRING (string, 0);
4996   CHECK_SYMBOL (coding_system, 1);
4997
4998   if (NILP (coding_system))
4999     return string;
5000
5001   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5002     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5003
5004   coding.mode |= CODING_MODE_LAST_BLOCK;
5005   return code_convert_string (string, &coding, encodep, Qt);
5006 }
5007 \f
5008 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5009   "Decode a JISX0208 character of shift-jis encoding.\n\
5010 CODE is the character code in SJIS.\n\
5011 Return the corresponding character.")
5012   (code)
5013      Lisp_Object code;
5014 {
5015   unsigned char c1, c2, s1, s2;
5016   Lisp_Object val;
5017
5018   CHECK_NUMBER (code, 0);
5019   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5020   DECODE_SJIS (s1, s2, c1, c2);
5021   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5022   return val;
5023 }
5024
5025 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5026   "Encode a JISX0208 character CHAR to SJIS coding system.\n\
5027 Return the corresponding character code in SJIS.")
5028   (ch)
5029      Lisp_Object ch;
5030 {
5031   int charset, c1, c2, s1, s2;
5032   Lisp_Object val;
5033
5034   CHECK_NUMBER (ch, 0);
5035   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5036   if (charset == charset_jisx0208)
5037     {
5038       ENCODE_SJIS (c1, c2, s1, s2);
5039       XSETFASTINT (val, (s1 << 8) | s2);
5040     }
5041   else
5042     XSETFASTINT (val, 0);
5043   return val;
5044 }
5045
5046 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5047   "Decode a Big5 character CODE of BIG5 coding system.\n\
5048 CODE is the character code in BIG5.\n\
5049 Return the corresponding character.")
5050   (code)
5051      Lisp_Object code;
5052 {
5053   int charset;
5054   unsigned char b1, b2, c1, c2;
5055   Lisp_Object val;
5056
5057   CHECK_NUMBER (code, 0);
5058   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5059   DECODE_BIG5 (b1, b2, charset, c1, c2);
5060   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5061   return val;
5062 }
5063
5064 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5065   "Encode the Big5 character CHAR to BIG5 coding system.\n\
5066 Return the corresponding character code in Big5.")
5067   (ch)
5068      Lisp_Object ch;
5069 {
5070   int charset, c1, c2, b1, b2;
5071   Lisp_Object val;
5072
5073   CHECK_NUMBER (ch, 0);
5074   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5075   if (charset == charset_big5_1 || charset == charset_big5_2)
5076     {
5077       ENCODE_BIG5 (charset, c1, c2, b1, b2);
5078       XSETFASTINT (val, (b1 << 8) | b2);
5079     }
5080   else
5081     XSETFASTINT (val, 0);
5082   return val;
5083 }
5084 \f
5085 DEFUN ("set-terminal-coding-system-internal",
5086        Fset_terminal_coding_system_internal,
5087        Sset_terminal_coding_system_internal, 1, 1, 0, "")
5088   (coding_system)
5089      Lisp_Object coding_system;
5090 {
       /* Validate CODING_SYSTEM and set up terminal_coding from it.
          Always returns nil.
          NOTE(review): the return value of setup_coding_system is
          ignored here.  */
5091   CHECK_SYMBOL (coding_system, 0);
5092   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5093   /* We had better not send unsafe characters to terminal.  */
5094   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5095
5096   return Qnil;
5097 }
5098
5099 DEFUN ("set-safe-terminal-coding-system-internal",
5100        Fset_safe_terminal_coding_system_internal,
5101        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5102   (coding_system)
5103      Lisp_Object coding_system;
5104 {
       /* Validate CODING_SYSTEM and set up safe_terminal_coding from it.
          Always returns nil.
          NOTE(review): the return value of setup_coding_system is
          ignored here.  */
5105   CHECK_SYMBOL (coding_system, 0);
5106   setup_coding_system (Fcheck_coding_system (coding_system),
5107                        &safe_terminal_coding);
5108   return Qnil;
5109 }
5110
5111 DEFUN ("terminal-coding-system",
5112        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5113   "Return coding system specified for terminal output.")
5114   ()
5115 {
       /* The symbol recorded in terminal_coding.  */
5116   return terminal_coding.symbol;
5117 }
5118
5119 DEFUN ("set-keyboard-coding-system-internal",
5120        Fset_keyboard_coding_system_internal,
5121        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5122   (coding_system)
5123      Lisp_Object coding_system;
5124 {
       /* Validate CODING_SYSTEM and set up keyboard_coding from it.
          Always returns nil.
          NOTE(review): the return value of setup_coding_system is
          ignored here.  */
5125   CHECK_SYMBOL (coding_system, 0);
5126   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5127   return Qnil;
5128 }
5129
5130 DEFUN ("keyboard-coding-system",
5131        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5132   "Return coding system specified for decoding keyboard input.")
5133   ()
5134 {
       /* The symbol recorded in keyboard_coding.  */
5135   return keyboard_coding.symbol;
5136 }
5137
5138 \f
5139 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5140        Sfind_operation_coding_system,  1, MANY, 0,
5141   "Choose a coding system for an operation based on the target name.\n\
5142 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5143 DECODING-SYSTEM is the coding system to use for decoding\n\
5144 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5145 for encoding (in case OPERATION does encoding).\n\
5146 \n\
5147 The first argument OPERATION specifies an I/O primitive:\n\
5148   For file I/O, `insert-file-contents' or `write-region'.\n\
5149   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5150   For network I/O, `open-network-stream'.\n\
5151 \n\
5152 The remaining arguments should be the same arguments that were passed\n\
5153 to the primitive.  Depending on which primitive, one of those arguments\n\
5154 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
5155 whichever argument specifies the file name is TARGET.\n\
5156 \n\
5157 TARGET has a meaning which depends on OPERATION:\n\
5158   For file I/O, TARGET is a file name.\n\
5159   For process I/O, TARGET is a process name.\n\
5160   For network I/O, TARGET is a service name or a port number\n\
5161 \n\
5162 This function looks up what specified for TARGET in,\n\
5163 `file-coding-system-alist', `process-coding-system-alist',\n\
5164 or `network-coding-system-alist' depending on OPERATION.\n\
5165 They may specify a coding system, a cons of coding systems,\n\
5166 or a function symbol to call.\n\
5167 In the last case, we call the function with one argument,\n\
5168 which is a list of all the arguments given to this function.")
5169   (nargs, args)
5170      int nargs;
5171      Lisp_Object *args;
5172 {
5173   Lisp_Object operation, target_idx, target, val;
5174   register Lisp_Object chain;
5175
5176   if (nargs < 2)
5177     error ("Too few arguments");
5178   operation = args[0];
5179   if (!SYMBOLP (operation)
5180       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5181     error ("Invalid first arguement");
5182   if (nargs < 1 + XINT (target_idx))
5183     error ("Too few arguments for operation: %s",
5184            XSYMBOL (operation)->name->data);
5185   target = args[XINT (target_idx) + 1];
5186   if (!(STRINGP (target)
5187         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5188     error ("Invalid %dth argument", XINT (target_idx) + 1);
5189
5190   chain = ((EQ (operation, Qinsert_file_contents)
5191             || EQ (operation, Qwrite_region))
5192            ? Vfile_coding_system_alist
5193            : (EQ (operation, Qopen_network_stream)
5194               ? Vnetwork_coding_system_alist
5195               : Vprocess_coding_system_alist));
5196   if (NILP (chain))
5197     return Qnil;
5198
5199   for (; CONSP (chain); chain = XCONS (chain)->cdr)
5200     {
5201       Lisp_Object elt;
5202       elt = XCONS (chain)->car;
5203
5204       if (CONSP (elt)
5205           && ((STRINGP (target)
5206                && STRINGP (XCONS (elt)->car)
5207                && fast_string_match (XCONS (elt)->car, target) >= 0)
5208               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
5209         {
5210           val = XCONS (elt)->cdr;
5211           /* Here, if VAL is both a valid coding system and a valid
5212              function symbol, we return VAL as a coding system.  */
5213           if (CONSP (val))
5214             return val;
5215           if (! SYMBOLP (val))
5216             return Qnil;
5217           if (! NILP (Fcoding_system_p (val)))
5218             return Fcons (val, val);
5219           if (! NILP (Ffboundp (val)))
5220             {
5221               val = call1 (val, Flist (nargs, args));
5222               if (CONSP (val))
5223                 return val;
5224               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5225                 return Fcons (val, val);
5226             }
5227           return Qnil;
5228         }
5229     }
5230   return Qnil;
5231 }
5232
5233 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
5234        Supdate_coding_systems_internal, 0, 0, 0,
5235   "Update internal database for ISO2022 and CCL based coding systems.\n\
5236 When values of the following coding categories are changed, you must\n\
5237 call this function:\n\
5238   coding-category-iso-7, coding-category-iso-7-tight,\n\
5239   coding-category-iso-8-1, coding-category-iso-8-2,\n\
5240   coding-category-iso-7-else, coding-category-iso-8-else,\n\
5241   coding-category-ccl")
5242   ()
5243 {
5244   int i;
5245
5246   for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
5247     {
5248       Lisp_Object val;
5249
5250       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5251       if (!NILP (val))
5252         {
5253           if (! coding_system_table[i])
5254             coding_system_table[i] = ((struct coding_system *)
5255                                       xmalloc (sizeof (struct coding_system)));
5256           setup_coding_system (val, coding_system_table[i]);
5257         }
5258       else if (coding_system_table[i])
5259         {
5260           xfree (coding_system_table[i]);
5261           coding_system_table[i] = NULL;
5262         }
5263     }
5264
5265   return Qnil;
5266 }
5267
5268 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5269        Sset_coding_priority_internal, 0, 0, 0,
5270   "Update internal database for the current value of `coding-category-list'.\n\
5271 This function is internal use only.")
5272   ()
5273 {
5274   int i = 0, idx;
5275   Lisp_Object val;
5276
5277   val = Vcoding_category_list;
5278
5279   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5280     {
5281       if (! SYMBOLP (XCONS (val)->car))
5282         break;
5283       idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
5284       if (idx >= CODING_CATEGORY_IDX_MAX)
5285         break;
5286       coding_priorities[i++] = (1 << idx);
5287       val = XCONS (val)->cdr;
5288     }
5289   /* If coding-category-list is valid and contains all coding
5290      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
5291      the following code saves Emacs from craching.  */
5292   while (i < CODING_CATEGORY_IDX_MAX)
5293     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5294
5295   return Qnil;
5296 }
5297
5298 #endif /* emacs */
5299
5300 \f
5301 /*** 9. Post-amble ***/
5302
5303 void
5304 init_coding ()
5305 {
       /* Allocate the conversion buffer at its minimum size.  */
5306   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5307 }
5308
5309 void
5310 init_coding_once ()
5311 {
       /* Fill in the byte classification tables and the other static
          state of this file.  The order of the assignments below matters:
          the loops set defaults for whole ranges, and the individual
          assignments that follow them override single entries.  */
5312   int i;
5313
5314   /* Emacs' internal format specific initialize routine.  */
       /* 0x00..0x20 are control codes, except for LF and CR.  */
5315   for (i = 0; i <= 0x20; i++)
5316     emacs_code_class[i] = EMACS_control_code;
5317   emacs_code_class[0x0A] = EMACS_linefeed_code;
5318   emacs_code_class[0x0D] = EMACS_carriage_return_code;
5319   for (i = 0x21 ; i < 0x7F; i++)
5320     emacs_code_class[i] = EMACS_ascii_code;
5321   emacs_code_class[0x7F] = EMACS_control_code;
5322   emacs_code_class[0x80] = EMACS_leading_code_composition;
       /* 0x81..0xFE default to invalid; the private leading codes are
          then overridden one by one.
          NOTE(review): index 0xFF is not assigned in this function.  */
5323   for (i = 0x81; i < 0xFF; i++)
5324     emacs_code_class[i] = EMACS_invalid_code;
5325   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5326   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5327   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5328   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5329
5330   /* ISO2022 specific initialize routine.  */
       /* The four loops skip 0x20, 0x7F, 0xA0 and 0xFF, which get their
          own classes right after them.  */
5331   for (i = 0; i < 0x20; i++)
5332     iso_code_class[i] = ISO_control_code;
5333   for (i = 0x21; i < 0x7F; i++)
5334     iso_code_class[i] = ISO_graphic_plane_0;
5335   for (i = 0x80; i < 0xA0; i++)
5336     iso_code_class[i] = ISO_control_code;
5337   for (i = 0xA1; i < 0xFF; i++)
5338     iso_code_class[i] = ISO_graphic_plane_1;
5339   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5340   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
       /* Special codes override the classes assigned by the loops.  */
5341   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5342   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5343   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5344   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5345   iso_code_class[ISO_CODE_ESC] = ISO_escape;
5346   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5347   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5348   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5349
5350   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5351
       /* Set up the global coding systems from nil.  */
5352   setup_coding_system (Qnil, &keyboard_coding);
5353   setup_coding_system (Qnil, &terminal_coding);
5354   setup_coding_system (Qnil, &safe_terminal_coding);
5355   setup_coding_system (Qnil, &default_buffer_file_coding);
5356
5357   bzero (coding_system_table, sizeof coding_system_table);
5358
       /* Only the first 128 entries (the ASCII range) are set to 1.  */
5359   bzero (ascii_skip_code, sizeof ascii_skip_code);
5360   for (i = 0; i < 128; i++)
5361     ascii_skip_code[i] = 1;
5362
       /* CRLF is the system end-of-line format on MSDOS and WINDOWSNT,
          LF everywhere else.  */
5363 #if defined (MSDOS) || defined (WINDOWSNT)
5364   system_eol_type = CODING_EOL_CRLF;
5365 #else
5366   system_eol_type = CODING_EOL_LF;
5367 #endif
5368 }
5369
5370 #ifdef emacs
5371
5372 void
5373 syms_of_coding ()
5374 {
5375   Qtarget_idx = intern ("target-idx");
5376   staticpro (&Qtarget_idx);
5377
5378   Qcoding_system_history = intern ("coding-system-history");
5379   staticpro (&Qcoding_system_history);
5380   Fset (Qcoding_system_history, Qnil);
5381
5382   /* Target FILENAME is the first argument.  */
5383   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5384   /* Target FILENAME is the third argument.  */
5385   Fput (Qwrite_region, Qtarget_idx, make_number (2));
5386
5387   Qcall_process = intern ("call-process");
5388   staticpro (&Qcall_process);
5389   /* Target PROGRAM is the first argument.  */
5390   Fput (Qcall_process, Qtarget_idx, make_number (0));
5391
5392   Qcall_process_region = intern ("call-process-region");
5393   staticpro (&Qcall_process_region);
5394   /* Target PROGRAM is the third argument.  */
5395   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5396
5397   Qstart_process = intern ("start-process");
5398   staticpro (&Qstart_process);
5399   /* Target PROGRAM is the third argument.  */
5400   Fput (Qstart_process, Qtarget_idx, make_number (2));
5401
5402   Qopen_network_stream = intern ("open-network-stream");
5403   staticpro (&Qopen_network_stream);
5404   /* Target SERVICE is the fourth argument.  */
5405   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5406
5407   Qcoding_system = intern ("coding-system");
5408   staticpro (&Qcoding_system);
5409
5410   Qeol_type = intern ("eol-type");
5411   staticpro (&Qeol_type);
5412
5413   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5414   staticpro (&Qbuffer_file_coding_system);
5415
5416   Qpost_read_conversion = intern ("post-read-conversion");
5417   staticpro (&Qpost_read_conversion);
5418
5419   Qpre_write_conversion = intern ("pre-write-conversion");
5420   staticpro (&Qpre_write_conversion);
5421
5422   Qno_conversion = intern ("no-conversion");
5423   staticpro (&Qno_conversion);
5424
5425   Qundecided = intern ("undecided");
5426   staticpro (&Qundecided);
5427
5428   Qcoding_system_p = intern ("coding-system-p");
5429   staticpro (&Qcoding_system_p);
5430
5431   Qcoding_system_error = intern ("coding-system-error");
5432   staticpro (&Qcoding_system_error);
5433
5434   Fput (Qcoding_system_error, Qerror_conditions,
5435         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5436   Fput (Qcoding_system_error, Qerror_message,
5437         build_string ("Invalid coding system"));
5438
5439   Qcoding_category = intern ("coding-category");
5440   staticpro (&Qcoding_category);
5441   Qcoding_category_index = intern ("coding-category-index");
5442   staticpro (&Qcoding_category_index);
5443
5444   Vcoding_category_table
5445     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5446   staticpro (&Vcoding_category_table);
5447   {
5448     int i;
5449     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5450       {
5451         XVECTOR (Vcoding_category_table)->contents[i]
5452           = intern (coding_category_name[i]);
5453         Fput (XVECTOR (Vcoding_category_table)->contents[i],
5454               Qcoding_category_index, make_number (i));
5455       }
5456   }
5457
5458   Qtranslation_table = intern ("translation-table");
5459   staticpro (&Qtranslation_table);
5460   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
5461
5462   Qtranslation_table_id = intern ("translation-table-id");
5463   staticpro (&Qtranslation_table_id);
5464
5465   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5466   staticpro (&Qtranslation_table_for_decode);
5467
5468   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5469   staticpro (&Qtranslation_table_for_encode);
5470
5471   Qsafe_charsets = intern ("safe-charsets");
5472   staticpro (&Qsafe_charsets);
5473
5474   Qvalid_codes = intern ("valid-codes");
5475   staticpro (&Qvalid_codes);
5476
5477   Qemacs_mule = intern ("emacs-mule");
5478   staticpro (&Qemacs_mule);
5479
5480   Qraw_text = intern ("raw-text");
5481   staticpro (&Qraw_text);
5482
5483   defsubr (&Scoding_system_p);
5484   defsubr (&Sread_coding_system);
5485   defsubr (&Sread_non_nil_coding_system);
5486   defsubr (&Scheck_coding_system);
5487   defsubr (&Sdetect_coding_region);
5488   defsubr (&Sdetect_coding_string);
5489   defsubr (&Sdecode_coding_region);
5490   defsubr (&Sencode_coding_region);
5491   defsubr (&Sdecode_coding_string);
5492   defsubr (&Sencode_coding_string);
5493   defsubr (&Sdecode_sjis_char);
5494   defsubr (&Sencode_sjis_char);
5495   defsubr (&Sdecode_big5_char);
5496   defsubr (&Sencode_big5_char);
5497   defsubr (&Sset_terminal_coding_system_internal);
5498   defsubr (&Sset_safe_terminal_coding_system_internal);
5499   defsubr (&Sterminal_coding_system);
5500   defsubr (&Sset_keyboard_coding_system_internal);
5501   defsubr (&Skeyboard_coding_system);
5502   defsubr (&Sfind_operation_coding_system);
5503   defsubr (&Supdate_coding_systems_internal);
5504   defsubr (&Sset_coding_priority_internal);
5505
5506   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5507     "List of coding systems.\n\
5508 \n\
5509 Do not alter the value of this variable manually.  This variable should be\n\
5510 updated by the functions `make-coding-system' and\n\
5511 `define-coding-system-alias'.");
5512   Vcoding_system_list = Qnil;
5513
5514   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5515     "Alist of coding system names.\n\
5516 Each element is one element list of coding system name.\n\
5517 This variable is given to `completing-read' as TABLE argument.\n\
5518 \n\
5519 Do not alter the value of this variable manually.  This variable should be\n\
5520 updated by the functions `make-coding-system' and\n\
5521 `define-coding-system-alias'.");
5522   Vcoding_system_alist = Qnil;
5523
5524   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5525     "List of coding-categories (symbols) ordered by priority.");
5526   {
5527     int i;
5528
5529     Vcoding_category_list = Qnil;
5530     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5531       Vcoding_category_list
5532         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5533                  Vcoding_category_list);
5534   }
5535
5536   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
5537     "Specify the coding system for read operations.\n\
5538 It is useful to bind this variable with `let', but do not set it globally.\n\
5539 If the value is a coding system, it is used for decoding on read operation.\n\
5540 If not, an appropriate element is used from one of the coding system alists:\n\
5541 There are three such tables, `file-coding-system-alist',\n\
5542 `process-coding-system-alist', and `network-coding-system-alist'.");
5543   Vcoding_system_for_read = Qnil;
5544
5545   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
5546     "Specify the coding system for write operations.\n\
5547 It is useful to bind this variable with `let', but do not set it globally.\n\
5548 If the value is a coding system, it is used for encoding on write operation.\n\
5549 If not, an appropriate element is used from one of the coding system alists:\n\
5550 There are three such tables, `file-coding-system-alist',\n\
5551 `process-coding-system-alist', and `network-coding-system-alist'.");
5552   Vcoding_system_for_write = Qnil;
5553
5554   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
5555     "Coding system used in the latest file or process I/O.");
5556   Vlast_coding_system_used = Qnil;
5557
5558   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
5559     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
5560   inhibit_eol_conversion = 0;
5561
5562   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5563     "Non-nil means process buffer inherits coding system of process output.\n\
5564 Bind it to t if the process output is to be treated as if it were a file\n\
5565 read from some filesystem.");
5566   inherit_process_coding_system = 0;
5567
5568   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5569     "Alist to decide a coding system to use for a file I/O operation.\n\
5570 The format is ((PATTERN . VAL) ...),\n\
5571 where PATTERN is a regular expression matching a file name,\n\
5572 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5573 If VAL is a coding system, it is used for both decoding and encoding\n\
5574 the file contents.\n\
5575 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5576 and the cdr part is used for encoding.\n\
5577 If VAL is a function symbol, the function must return a coding system\n\
5578 or a cons of coding systems which are used as above.\n\
5579 \n\
5580 See also the function `find-operation-coding-system'\n\
5581 and the variable `auto-coding-alist'.");
5582   Vfile_coding_system_alist = Qnil;
5583
5584   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5585     "Alist to decide a coding system to use for a process I/O operation.\n\
5586 The format is ((PATTERN . VAL) ...),\n\
5587 where PATTERN is a regular expression matching a program name,\n\
5588 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5589 If VAL is a coding system, it is used for both decoding what received\n\
5590 from the program and encoding what sent to the program.\n\
5591 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5592 and the cdr part is used for encoding.\n\
5593 If VAL is a function symbol, the function must return a coding system\n\
5594 or a cons of coding systems which are used as above.\n\
5595 \n\
5596 See also the function `find-operation-coding-system'.");
5597   Vprocess_coding_system_alist = Qnil;
5598
5599   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5600     "Alist to decide a coding system to use for a network I/O operation.\n\
5601 The format is ((PATTERN . VAL) ...),\n\
5602 where PATTERN is a regular expression matching a network service name\n\
5603 or is a port number to connect to,\n\
5604 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5605 If VAL is a coding system, it is used for both decoding what received\n\
5606 from the network stream and encoding what sent to the network stream.\n\
5607 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5608 and the cdr part is used for encoding.\n\
5609 If VAL is a function symbol, the function must return a coding system\n\
5610 or a cons of coding systems which are used as above.\n\
5611 \n\
5612 See also the function `find-operation-coding-system'.");
5613   Vnetwork_coding_system_alist = Qnil;
5614
5615   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
5616     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
5617   eol_mnemonic_unix = ':';
5618
5619   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
5620     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
5621   eol_mnemonic_dos = '\\';
5622
5623   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
5624     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
5625   eol_mnemonic_mac = '/';
5626
5627   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5628     "Mnemonic character indicating end-of-line format is not yet decided.");
5629   eol_mnemonic_undecided = ':';
5630
5631   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
5632     "*Non-nil enables character translation while encoding and decoding.");
5633   Venable_character_translation = Qt;
5634
5635   DEFVAR_LISP ("standard-translation-table-for-decode",
5636     &Vstandard_translation_table_for_decode,
5637     "Table for translating characters while decoding.");
5638   Vstandard_translation_table_for_decode = Qnil;
5639
5640   DEFVAR_LISP ("standard-translation-table-for-encode",
5641     &Vstandard_translation_table_for_encode,
5642     "Table for translationg characters while encoding.");
5643   Vstandard_translation_table_for_encode = Qnil;
5644
5645   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5646     "Alist of charsets vs revision numbers.\n\
5647 While encoding, if a charset (car part of an element) is found,\n\
5648 designate it with the escape sequence identifing revision (cdr part of the element).");
5649   Vcharset_revision_alist = Qnil;
5650
5651   DEFVAR_LISP ("default-process-coding-system",
5652                &Vdefault_process_coding_system,
5653     "Cons of coding systems used for process I/O by default.\n\
5654 The car part is used for decoding a process output,\n\
5655 the cdr part is used for encoding a text to be sent to a process.");
5656   Vdefault_process_coding_system = Qnil;
5657
5658   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5659     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5660 This is a vector of length 256.\n\
5661 If Nth element is non-nil, the existence of code N in a file\n\
5662 \(or output of subprocess) doesn't prevent it to be detected as\n\
5663 a coding system of ISO 2022 variant which has a flag\n\
5664 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5665 or reading output of a subprocess.\n\
5666 Only 128th through 159th elements has a meaning.");
5667   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
5668
5669   DEFVAR_LISP ("select-safe-coding-system-function",
5670                &Vselect_safe_coding_system_function,
5671     "Function to call to select safe coding system for encoding a text.\n\
5672 \n\
5673 If set, this function is called to force a user to select a proper\n\
5674 coding system which can encode the text in the case that a default\n\
5675 coding system used in each operation can't encode the text.\n\
5676 \n\
5677 The default value is `select-safe-coding-system' (which see).");
5678   Vselect_safe_coding_system_function = Qnil;
5679
5680 }
5681
5682 #endif /* emacs */